From 2b928eedd4bf88b2d84efe58c8cbdb5b7d1390bc Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Tue, 2 Apr 2024 19:11:53 +0100 Subject: [PATCH 01/25] backup wip --- lerobot/common/datasets/factory.py | 26 ++++ lerobot/common/policies/act/detr_vae.py | 81 ++++++---- lerobot/common/policies/act/policy.py | 40 ++++- lerobot/common/policies/factory.py | 2 +- lerobot/configs/policy/act.yaml | 3 +- lerobot/scripts/train.py | 1 + poetry.lock | 197 +++++++++++++++++++++++- pyproject.toml | 1 + 8 files changed, 314 insertions(+), 37 deletions(-) diff --git a/lerobot/common/datasets/factory.py b/lerobot/common/datasets/factory.py index 04077034..47a15ea4 100644 --- a/lerobot/common/datasets/factory.py +++ b/lerobot/common/datasets/factory.py @@ -125,6 +125,32 @@ def make_offline_buffer( # TODO(rcadene): remove this and put it in config. Ideally we want to reproduce SOTA results just with mean_std normalization_mode = "mean_std" if cfg.env.name == "aloha" else "min_max" + + # TODO(now): These stats are needed to use their pretrained model for sim_transfer_cube_human. + # (Pdb) stats['observation']['state']['mean'] + # tensor([-0.0071, -0.6293, 1.0351, -0.0517, -0.4642, -0.0754, 0.4751, -0.0373, + # -0.3324, 0.9034, -0.2258, -0.3127, -0.2412, 0.6866]) + stats['observation', 'state', 'mean'] = torch.tensor([-0.00740268, -0.63187766, 1.0356655 , -0.05027218, -0.46199223, + -0.07467502, 0.47467607, -0.03615446, -0.33203387, 0.9038929 , + -0.22060776, -0.31011587, -0.23484458, 0.6842416 ]) + # (Pdb) stats['observation']['state']['std'] + # tensor([0.0022, 0.0520, 0.0291, 0.0092, 0.0267, 0.0145, 0.0563, 0.0179, 0.0494, + # 0.0326, 0.0476, 0.0535, 0.0956, 0.0513]) + stats['observation', 'state', 'std'] = torch.tensor([0.01219023, 0.2975381 , 0.16728032, 0.04733803, 0.1486037 , + 0.08788499, 0.31752336, 0.1049916 , 0.27933604, 0.18094037, + 0.26604933, 0.30466506, 0.5298686 , 0.25505227]) + # (Pdb) stats['action']['mean'] + # tensor([-0.0075, -0.6346, 1.0353, -0.0465, -0.4686, -0.0738, 0.3723, -0.0396, + # -0.3184, 0.8991, -0.2065, -0.3182, -0.2338, 0.5593]) + stats['action']['mean'] = torch.tensor([-0.00756444, -0.6281845 , 1.0312834 , -0.04664314, -0.47211358, + -0.074527 , 0.37389806, -0.03718753, -0.3261143 , 0.8997205 , + -0.21371077, -0.31840396, -0.23360962, 0.551947]) + # (Pdb) stats['action']['std'] + # tensor([0.0023, 0.0514, 0.0290, 0.0086, 0.0263, 0.0143, 0.0593, 0.0185, 0.0510, + # 0.0328, 0.0478, 0.0531, 0.0945, 0.0794]) + stats['action']['std'] = torch.tensor([0.01252818, 0.2957442 , 0.16701928, 0.04584508, 0.14833844, + 0.08763024, 0.30665937, 0.10600077, 0.27572668, 0.1805853 , + 0.26304692, 0.30708534, 0.5305411 , 0.38381037]) transforms.append(NormalizeTransform(stats, in_keys, mode=normalization_mode)) offline_buffer.set_transform(transforms) diff --git a/lerobot/common/policies/act/detr_vae.py b/lerobot/common/policies/act/detr_vae.py index 0f2626f7..4d5525f2 100644 --- a/lerobot/common/policies/act/detr_vae.py +++ b/lerobot/common/policies/act/detr_vae.py @@ -2,6 +2,7 @@ import numpy as np import torch from torch import nn from torch.autograd import Variable +from transformers import DetrForObjectDetection from .backbone import build_backbone from .transformer import TransformerEncoder, TransformerEncoderLayer, build_transformer @@ -24,31 +25,57 @@ def get_sinusoid_encoding_table(n_position, d_hid): return torch.FloatTensor(sinusoid_table).unsqueeze(0) -class DETRVAE(nn.Module): - """This is the DETR module that performs object detection""" +class 
ActionChunkingTransformer(nn.Module): + """ + Action Chunking Transformer as per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware + (https://arxiv.org/abs/2304.13705). + + Note: In this code we use the symbols `vae_encoder`, 'encoder', `decoder`. The meanings are as follows. + - The `vae_encoder` is, as per the literature around conditional variational auto-encoders (cVAE), the + part of the model that encodes the target data (here, a sequence of actions), and the condition + (here, we include the robot joint-space state as an input to the encoder). + - The `transformer` is the cVAE's decoder. But since we have an option to train this model without the + variational objective (in which case we drop the `vae_encoder` altogether), we don't call it the + `vae_decoder`. + # TODO(now): remove the following + - The `encoder` is actually a component of the cVAE's "decoder". But we refer to it as an "encoder" + because, in terms of the transformer with cross-attention that forms the cVAE's decoder, it is the + "encoder" part. We drop the `vae_` prefix because we have an option to train this model without the + variational objective (in which case we drop the `vae_encoder` altogether), and nothing about this + model has anything to do with a VAE). + - The `decoder` is a building block of the VAE decoder, and is just the "decoder" part of a + transformer with cross-attention. For the same reasoning behind the naming of `encoder`, we make + this term agnostic to the option to use a variational objective for training. + + """ def __init__( - self, backbones, transformer, encoder, state_dim, action_dim, num_queries, camera_names, vae + self, backbones, transformer, vae_encoder, state_dim, action_dim, horizon, camera_names, vae ): """Initializes the model. Parameters: backbones: torch module of the backbone to be used. See backbone.py transformer: torch module of the transformer architecture. See transformer.py state_dim: robot state dimension of the environment - num_queries: number of object queries, ie detection slot. This is the maximal number of objects + horizon: number of object queries, ie detection slot. This is the maximal number of objects DETR can detect in a single image. For COCO, we recommend 100 queries. - aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. + + Args: + state_dim: Robot positional state dimension. + action_dim: Action dimension. + horizon: The number of actions to generate in one forward pass. + vae: Whether to use the variational objective. TODO(now): Give more details. 
""" super().__init__() - self.num_queries = num_queries self.camera_names = camera_names self.transformer = transformer - self.encoder = encoder + self.vae_encoder = vae_encoder self.vae = vae hidden_dim = transformer.d_model self.action_head = nn.Linear(hidden_dim, action_dim) self.is_pad_head = nn.Linear(hidden_dim, 1) - self.query_embed = nn.Embedding(num_queries, hidden_dim) + # Positional embedding to be used as input to the latent vae_encoder (if applicable) and for the + self.pos_embed = nn.Embedding(horizon, hidden_dim) if backbones is not None: self.input_proj = nn.Conv2d(backbones[0].num_channels, hidden_dim, kernel_size=1) self.backbones = nn.ModuleList(backbones) @@ -61,16 +88,16 @@ class DETRVAE(nn.Module): self.pos = torch.nn.Embedding(2, hidden_dim) self.backbones = None - # encoder extra parameters + # vae_encoder extra parameters self.latent_dim = 32 # final size of latent z # TODO tune self.cls_embed = nn.Embedding(1, hidden_dim) # extra cls token embedding - self.encoder_action_proj = nn.Linear(14, hidden_dim) # project action to embedding - self.encoder_joint_proj = nn.Linear(14, hidden_dim) # project qpos to embedding + self.vae_encoder_action_proj = nn.Linear(14, hidden_dim) # project action to embedding + self.vae_encoder_joint_proj = nn.Linear(14, hidden_dim) # project qpos to embedding self.latent_proj = nn.Linear( hidden_dim, self.latent_dim * 2 ) # project hidden state to latent std, var self.register_buffer( - "pos_table", get_sinusoid_encoding_table(1 + 1 + num_queries, hidden_dim) + "pos_table", get_sinusoid_encoding_table(1 + 1 + horizon, hidden_dim) ) # [CLS], qpos, a_seq # decoder extra parameters @@ -91,15 +118,15 @@ class DETRVAE(nn.Module): ### Obtain latent z from action sequence if self.vae and is_training: # project action sequence to embedding dim, and concat with a CLS token - action_embed = self.encoder_action_proj(actions) # (bs, seq, hidden_dim) - qpos_embed = self.encoder_joint_proj(qpos) # (bs, hidden_dim) + action_embed = self.vae_encoder_action_proj(actions) # (bs, seq, hidden_dim) + qpos_embed = self.vae_encoder_joint_proj(qpos) # (bs, hidden_dim) qpos_embed = torch.unsqueeze(qpos_embed, axis=1) # (bs, 1, hidden_dim) cls_embed = self.cls_embed.weight # (1, hidden_dim) cls_embed = torch.unsqueeze(cls_embed, axis=0).repeat(bs, 1, 1) # (bs, 1, hidden_dim) - encoder_input = torch.cat( + vae_encoder_input = torch.cat( [cls_embed, qpos_embed, action_embed], axis=1 ) # (bs, seq+1, hidden_dim) - encoder_input = encoder_input.permute(1, 0, 2) # (seq+1, bs, hidden_dim) + vae_encoder_input = vae_encoder_input.permute(1, 0, 2) # (seq+1, bs, hidden_dim) # do not mask cls token # cls_joint_is_pad = torch.full((bs, 2), False).to(qpos.device) # False: not a padding # is_pad = torch.cat([cls_joint_is_pad, is_pad], axis=1) # (bs, seq+1) @@ -107,9 +134,9 @@ class DETRVAE(nn.Module): pos_embed = self.pos_table.clone().detach() pos_embed = pos_embed.permute(1, 0, 2) # (seq+1, 1, hidden_dim) # query model - encoder_output = self.encoder(encoder_input, pos=pos_embed) # , src_key_padding_mask=is_pad) - encoder_output = encoder_output[0] # take cls output only - latent_info = self.latent_proj(encoder_output) + vae_encoder_output = self.vae_encoder(vae_encoder_input, pos=pos_embed) # , src_key_padding_mask=is_pad) + vae_encoder_output = vae_encoder_output[0] # take cls output only + latent_info = self.latent_proj(vae_encoder_output) mu = latent_info[:, : self.latent_dim] logvar = latent_info[:, self.latent_dim :] latent_sample = reparametrize(mu, logvar) @@ -137,7 
+164,7 @@ class DETRVAE(nn.Module): hs = self.transformer( src, None, - self.query_embed.weight, + self.pos_embed.weight, pos, latent_input, proprio_input, @@ -147,7 +174,7 @@ class DETRVAE(nn.Module): qpos = self.input_proj_robot_state(qpos) env_state = self.input_proj_env_state(env_state) transformer_input = torch.cat([qpos, env_state], axis=1) # seq length = 2 - hs = self.transformer(transformer_input, None, self.query_embed.weight, self.pos.weight)[0] + hs = self.transformer(transformer_input, None, self.pos_embed.weight, self.pos.weight)[0] a_hat = self.action_head(hs) is_pad_hat = self.is_pad_head(hs) return a_hat, is_pad_hat, [mu, logvar] @@ -165,7 +192,7 @@ def mlp(input_dim, hidden_dim, output_dim, hidden_depth): return trunk -def build_encoder(args): +def build_vae_encoder(args): d_model = args.hidden_dim # 256 dropout = args.dropout # 0.1 nhead = args.nheads # 8 @@ -192,16 +219,16 @@ def build(args): backbones.append(backbone) transformer = build_transformer(args) + + vae_encoder = build_vae_encoder(args) - encoder = build_encoder(args) - - model = DETRVAE( + model = ActionChunkingTransformer( backbones, transformer, - encoder, + vae_encoder, state_dim=args.state_dim, action_dim=args.action_dim, - num_queries=args.num_queries, + horizon=args.num_queries, camera_names=args.camera_names, vae=args.vae, ) diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index ae4f7320..a88f7640 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -42,9 +42,28 @@ def kl_divergence(mu, logvar): class ActionChunkingTransformerPolicy(AbstractPolicy): + """ + Action Chunking Transformer as per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware + (https://arxiv.org/abs/2304.13705). + """ + name = "act" def __init__(self, cfg, device, n_action_steps=1): + """ + Args: + vae: Whether to use the variational objective. TODO(now): Give more details. + temporal_agg: Whether to do temporal aggregation. For each timestep during rollout, the action + returned as an exponential moving average of previously generated actions for that timestep. + n_obs_steps: Number of time steps worth of observation to use as input. + horizon: The number of actions to generate in one forward pass. + kl_weight: Weight for KL divergence. Defaults to None. Only applicable when using the variational + objective. + batch_size: Training batch size. + grad_clip_norm: Optionally clip the gradients to have this value as the norm at most. Defaults to + None meaning gradient clipping is not applied. + lr: Learning rate. 
+ """ super().__init__(n_action_steps) self.cfg = cfg self.n_action_steps = n_action_steps @@ -57,8 +76,6 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): def update(self, replay_buffer, step): del step - start_time = time.time() - self.train() num_slices = self.cfg.batch_size @@ -103,11 +120,14 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): "action": action.to(self.device, non_blocking=True), } return out + + start_time = time.time() batch = replay_buffer.sample(batch_size) batch = process_batch(batch, self.cfg.horizon, num_slices) data_s = time.time() - start_time + print(data_s) loss = self.compute_loss(batch) loss.backward() @@ -151,9 +171,6 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): @torch.no_grad() def select_actions(self, observation, step_count): - if observation["image"].shape[0] != 1: - raise NotImplementedError("Batch size > 1 not handled") - # TODO(rcadene): remove unused step_count del step_count @@ -167,7 +184,17 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): "image": observation["image", "top"], "agent_pos": observation["state"], } - action = self._forward(qpos=obs_dict["agent_pos"], image=obs_dict["image"]) + # qpos = obs_dict["agent_pos"] + # img = obs_dict["image"] + # qpos_ = torch.load('/tmp/qpos.pth') + # img_ = torch.load('/tmp/curr_image.pth') + # out_ = torch.load('/tmp/out.pth') + # import cv2, numpy as np + # cv2.imwrite("ours.png", (obs_dict["image"][0, 0].permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)) + # cv2.imwrite("theirs.png", (img_[0, 0].permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)) + # out = self._forward(qpos_, img_) + # breakpoint() + action = self._forward(qpos=obs_dict["agent_pos"] * 0.182, image=obs_dict["image"]) if self.cfg.temporal_agg: # TODO(rcadene): implement temporal aggregation @@ -197,6 +224,7 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): if is_pad is not None: is_pad = is_pad[:, : self.model.num_queries] + breakpoint() a_hat, is_pad_hat, (mu, logvar) = self.model(qpos, image, env_state, actions, is_pad) all_l1 = F.l1_loss(actions, a_hat, reduction="none") diff --git a/lerobot/common/policies/factory.py b/lerobot/common/policies/factory.py index 934f0962..577ccf75 100644 --- a/lerobot/common/policies/factory.py +++ b/lerobot/common/policies/factory.py @@ -1,5 +1,5 @@ def make_policy(cfg): - if cfg.policy.name != "diffusion" and cfg.rollout_batch_size > 1: + if cfg.policy.name not in ["diffusion", "act"] and cfg.rollout_batch_size > 1: raise NotImplementedError("Only diffusion policy supports rollout_batch_size > 1 for the time being.") if cfg.policy.name == "tdmpc": diff --git a/lerobot/configs/policy/act.yaml b/lerobot/configs/policy/act.yaml index a52c3f54..0244944b 100644 --- a/lerobot/configs/policy/act.yaml +++ b/lerobot/configs/policy/act.yaml @@ -1,6 +1,6 @@ # @package _global_ -offline_steps: 1344000 +offline_steps: 2000 online_steps: 0 eval_episodes: 1 @@ -24,7 +24,6 @@ policy: weight_decay: 1e-4 grad_clip_norm: 10 backbone: resnet18 - num_queries: ${horizon} # chunk_size horizon: ${horizon} # chunk_size kl_weight: 10 hidden_dim: 512 diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py index 18c3715b..454adf1a 100644 --- a/lerobot/scripts/train.py +++ b/lerobot/scripts/train.py @@ -151,6 +151,7 @@ def train(cfg: dict, out_dir=None, job_name=None): logging.info("make_policy") policy = make_policy(cfg) + policy.save("act.pt") num_learnable_params = sum(p.numel() for p in policy.parameters() if p.requires_grad) num_total_params = sum(p.numel() for 
p in policy.parameters()) diff --git a/poetry.lock b/poetry.lock index 72397001..9766051c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3248,6 +3248,133 @@ numpy = "*" [package.extras] all = ["defusedxml", "fsspec", "imagecodecs (>=2023.8.12)", "lxml", "matplotlib", "zarr"] +[[package]] +name = "tokenizers" +version = "0.15.2" +description = "" +optional = false +python-versions = ">=3.7" +files = [ + {file = "tokenizers-0.15.2-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:52f6130c9cbf70544287575a985bf44ae1bda2da7e8c24e97716080593638012"}, + {file = "tokenizers-0.15.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:054c1cc9c6d68f7ffa4e810b3d5131e0ba511b6e4be34157aa08ee54c2f8d9ee"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:a9b9b070fdad06e347563b88c278995735292ded1132f8657084989a4c84a6d5"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea621a7eef4b70e1f7a4e84dd989ae3f0eeb50fc8690254eacc08acb623e82f1"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cf7fd9a5141634fa3aa8d6b7be362e6ae1b4cda60da81388fa533e0b552c98fd"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:44f2a832cd0825295f7179eaf173381dc45230f9227ec4b44378322d900447c9"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8b9ec69247a23747669ec4b0ca10f8e3dfb3545d550258129bd62291aabe8605"}, + {file = "tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:40b6a4c78da863ff26dbd5ad9a8ecc33d8a8d97b535172601cf00aee9d7ce9ce"}, + {file = "tokenizers-0.15.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:5ab2a4d21dcf76af60e05af8063138849eb1d6553a0d059f6534357bce8ba364"}, + {file = "tokenizers-0.15.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a47acfac7e511f6bbfcf2d3fb8c26979c780a91e06fb5b9a43831b2c0153d024"}, + {file = "tokenizers-0.15.2-cp310-none-win32.whl", hash = "sha256:064ff87bb6acdbd693666de9a4b692add41308a2c0ec0770d6385737117215f2"}, + {file = "tokenizers-0.15.2-cp310-none-win_amd64.whl", hash = "sha256:3b919afe4df7eb6ac7cafd2bd14fb507d3f408db7a68c43117f579c984a73843"}, + {file = "tokenizers-0.15.2-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:89cd1cb93e4b12ff39bb2d626ad77e35209de9309a71e4d3d4672667b4b256e7"}, + {file = "tokenizers-0.15.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cfed5c64e5be23d7ee0f0e98081a25c2a46b0b77ce99a4f0605b1ec43dd481fa"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:a907d76dcfda37023ba203ab4ceeb21bc5683436ebefbd895a0841fd52f6f6f2"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20ea60479de6fc7b8ae756b4b097572372d7e4032e2521c1bbf3d90c90a99ff0"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:48e2b9335be2bc0171df9281385c2ed06a15f5cf121c44094338306ab7b33f2c"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:112a1dd436d2cc06e6ffdc0b06d55ac019a35a63afd26475205cb4b1bf0bfbff"}, + {file = "tokenizers-0.15.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4620cca5c2817177ee8706f860364cc3a8845bc1e291aaf661fb899e5d1c45b0"}, + {file = 
"tokenizers-0.15.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ccd73a82751c523b3fc31ff8194702e4af4db21dc20e55b30ecc2079c5d43cb7"}, + {file = "tokenizers-0.15.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:107089f135b4ae7817affe6264f8c7a5c5b4fd9a90f9439ed495f54fcea56fb4"}, + {file = "tokenizers-0.15.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0ff110ecc57b7aa4a594396525a3451ad70988e517237fe91c540997c4e50e29"}, + {file = "tokenizers-0.15.2-cp311-none-win32.whl", hash = "sha256:6d76f00f5c32da36c61f41c58346a4fa7f0a61be02f4301fd30ad59834977cc3"}, + {file = "tokenizers-0.15.2-cp311-none-win_amd64.whl", hash = "sha256:cc90102ed17271cf0a1262babe5939e0134b3890345d11a19c3145184b706055"}, + {file = "tokenizers-0.15.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f86593c18d2e6248e72fb91c77d413a815153b8ea4e31f7cd443bdf28e467670"}, + {file = "tokenizers-0.15.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0774bccc6608eca23eb9d620196687c8b2360624619623cf4ba9dc9bd53e8b51"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:d0222c5b7c9b26c0b4822a82f6a7011de0a9d3060e1da176f66274b70f846b98"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3835738be1de66624fff2f4f6f6684775da4e9c00bde053be7564cbf3545cc66"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0143e7d9dcd811855c1ce1ab9bf5d96d29bf5e528fd6c7824d0465741e8c10fd"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db35825f6d54215f6b6009a7ff3eedee0848c99a6271c870d2826fbbedf31a38"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3f5e64b0389a2be47091d8cc53c87859783b837ea1a06edd9d8e04004df55a5c"}, + {file = "tokenizers-0.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e0480c452217edd35eca56fafe2029fb4d368b7c0475f8dfa3c5c9c400a7456"}, + {file = "tokenizers-0.15.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:a33ab881c8fe70474980577e033d0bc9a27b7ab8272896e500708b212995d834"}, + {file = "tokenizers-0.15.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a308a607ca9de2c64c1b9ba79ec9a403969715a1b8ba5f998a676826f1a7039d"}, + {file = "tokenizers-0.15.2-cp312-none-win32.whl", hash = "sha256:b8fcfa81bcb9447df582c5bc96a031e6df4da2a774b8080d4f02c0c16b42be0b"}, + {file = "tokenizers-0.15.2-cp312-none-win_amd64.whl", hash = "sha256:38d7ab43c6825abfc0b661d95f39c7f8af2449364f01d331f3b51c94dcff7221"}, + {file = "tokenizers-0.15.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:38bfb0204ff3246ca4d5e726e8cc8403bfc931090151e6eede54d0e0cf162ef0"}, + {file = "tokenizers-0.15.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:9c861d35e8286a53e06e9e28d030b5a05bcbf5ac9d7229e561e53c352a85b1fc"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:936bf3842db5b2048eaa53dade907b1160f318e7c90c74bfab86f1e47720bdd6"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:620beacc3373277700d0e27718aa8b25f7b383eb8001fba94ee00aeea1459d89"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2735ecbbf37e52db4ea970e539fd2d450d213517b77745114f92867f3fc246eb"}, + {file = 
"tokenizers-0.15.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:473c83c5e2359bb81b0b6fde870b41b2764fcdd36d997485e07e72cc3a62264a"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:968fa1fb3c27398b28a4eca1cbd1e19355c4d3a6007f7398d48826bbe3a0f728"}, + {file = "tokenizers-0.15.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:865c60ae6eaebdde7da66191ee9b7db52e542ed8ee9d2c653b6d190a9351b980"}, + {file = "tokenizers-0.15.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7c0d8b52664ab2d4a8d6686eb5effc68b78608a9008f086a122a7b2996befbab"}, + {file = "tokenizers-0.15.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:f33dfbdec3784093a9aebb3680d1f91336c56d86cc70ddf88708251da1fe9064"}, + {file = "tokenizers-0.15.2-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:d44ba80988ff9424e33e0a49445072ac7029d8c0e1601ad25a0ca5f41ed0c1d6"}, + {file = "tokenizers-0.15.2-cp37-cp37m-macosx_11_0_arm64.whl", hash = "sha256:dce74266919b892f82b1b86025a613956ea0ea62a4843d4c4237be2c5498ed3a"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0ef06b9707baeb98b316577acb04f4852239d856b93e9ec3a299622f6084e4be"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c73e2e74bbb07910da0d37c326869f34113137b23eadad3fc00856e6b3d9930c"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4eeb12daf02a59e29f578a865f55d87cd103ce62bd8a3a5874f8fdeaa82e336b"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9ba9f6895af58487ca4f54e8a664a322f16c26bbb442effd01087eba391a719e"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ccec77aa7150e38eec6878a493bf8c263ff1fa8a62404e16c6203c64c1f16a26"}, + {file = "tokenizers-0.15.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3f40604f5042ff210ba82743dda2b6aa3e55aa12df4e9f2378ee01a17e2855e"}, + {file = "tokenizers-0.15.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:5645938a42d78c4885086767c70923abad047163d809c16da75d6b290cb30bbe"}, + {file = "tokenizers-0.15.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:05a77cbfebe28a61ab5c3891f9939cc24798b63fa236d84e5f29f3a85a200c00"}, + {file = "tokenizers-0.15.2-cp37-none-win32.whl", hash = "sha256:361abdc068e8afe9c5b818769a48624687fb6aaed49636ee39bec4e95e1a215b"}, + {file = "tokenizers-0.15.2-cp37-none-win_amd64.whl", hash = "sha256:7ef789f83eb0f9baeb4d09a86cd639c0a5518528f9992f38b28e819df397eb06"}, + {file = "tokenizers-0.15.2-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:4fe1f74a902bee74a3b25aff180fbfbf4f8b444ab37c4d496af7afd13a784ed2"}, + {file = "tokenizers-0.15.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4c4b89038a684f40a6b15d6b09f49650ac64d951ad0f2a3ea9169687bbf2a8ba"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:d05a1b06f986d41aed5f2de464c003004b2df8aaf66f2b7628254bcbfb72a438"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:508711a108684111ec8af89d3a9e9e08755247eda27d0ba5e3c50e9da1600f6d"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:daa348f02d15160cb35439098ac96e3a53bacf35885072611cd9e5be7d333daa"}, + {file = 
"tokenizers-0.15.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:494fdbe5932d3416de2a85fc2470b797e6f3226c12845cadf054dd906afd0442"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c2d60f5246f4da9373f75ff18d64c69cbf60c3bca597290cea01059c336d2470"}, + {file = "tokenizers-0.15.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:93268e788825f52de4c7bdcb6ebc1fcd4a5442c02e730faa9b6b08f23ead0e24"}, + {file = "tokenizers-0.15.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6fc7083ab404019fc9acafe78662c192673c1e696bd598d16dc005bd663a5cf9"}, + {file = "tokenizers-0.15.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:41e39b41e5531d6b2122a77532dbea60e171ef87a3820b5a3888daa847df4153"}, + {file = "tokenizers-0.15.2-cp38-none-win32.whl", hash = "sha256:06cd0487b1cbfabefb2cc52fbd6b1f8d4c37799bd6c6e1641281adaa6b2504a7"}, + {file = "tokenizers-0.15.2-cp38-none-win_amd64.whl", hash = "sha256:5179c271aa5de9c71712e31cb5a79e436ecd0d7532a408fa42a8dbfa4bc23fd9"}, + {file = "tokenizers-0.15.2-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:82f8652a74cc107052328b87ea8b34291c0f55b96d8fb261b3880216a9f9e48e"}, + {file = "tokenizers-0.15.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:02458bee6f5f3139f1ebbb6d042b283af712c0981f5bc50edf771d6b762d5e4f"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:c9a09cd26cca2e1c349f91aa665309ddb48d71636370749414fbf67bc83c5343"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:158be8ea8554e5ed69acc1ce3fbb23a06060bd4bbb09029431ad6b9a466a7121"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1ddba9a2b0c8c81633eca0bb2e1aa5b3a15362b1277f1ae64176d0f6eba78ab1"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3ef5dd1d39797044642dbe53eb2bc56435308432e9c7907728da74c69ee2adca"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:454c203164e07a860dbeb3b1f4a733be52b0edbb4dd2e5bd75023ffa8b49403a"}, + {file = "tokenizers-0.15.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0cf6b7f1d4dc59af960e6ffdc4faffe6460bbfa8dce27a58bf75755ffdb2526d"}, + {file = "tokenizers-0.15.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:2ef09bbc16519f6c25d0c7fc0c6a33a6f62923e263c9d7cca4e58b8c61572afb"}, + {file = "tokenizers-0.15.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c9a2ebdd2ad4ec7a68e7615086e633857c85e2f18025bd05d2a4399e6c5f7169"}, + {file = "tokenizers-0.15.2-cp39-none-win32.whl", hash = "sha256:918fbb0eab96fe08e72a8c2b5461e9cce95585d82a58688e7f01c2bd546c79d0"}, + {file = "tokenizers-0.15.2-cp39-none-win_amd64.whl", hash = "sha256:524e60da0135e106b254bd71f0659be9f89d83f006ea9093ce4d1fab498c6d0d"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:6a9b648a58281c4672212fab04e60648fde574877d0139cd4b4f93fe28ca8944"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:7c7d18b733be6bbca8a55084027f7be428c947ddf871c500ee603e375013ffba"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:13ca3611de8d9ddfbc4dc39ef54ab1d2d4aaa114ac8727dfdc6a6ec4be017378"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash 
= "sha256:237d1bf3361cf2e6463e6c140628e6406766e8b27274f5fcc62c747ae3c6f094"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67a0fe1e49e60c664915e9fb6b0cb19bac082ab1f309188230e4b2920230edb3"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:4e022fe65e99230b8fd89ebdfea138c24421f91c1a4f4781a8f5016fd5cdfb4d"}, + {file = "tokenizers-0.15.2-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:d857be2df69763362ac699f8b251a8cd3fac9d21893de129bc788f8baaef2693"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:708bb3e4283177236309e698da5fcd0879ce8fd37457d7c266d16b550bcbbd18"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:64c35e09e9899b72a76e762f9854e8750213f67567787d45f37ce06daf57ca78"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1257f4394be0d3b00de8c9e840ca5601d0a4a8438361ce9c2b05c7d25f6057b"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:02272fe48280e0293a04245ca5d919b2c94a48b408b55e858feae9618138aeda"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:dc3ad9ebc76eabe8b1d7c04d38be884b8f9d60c0cdc09b0aa4e3bcf746de0388"}, + {file = "tokenizers-0.15.2-pp37-pypy37_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:32e16bdeffa7c4f46bf2152172ca511808b952701d13e7c18833c0b73cb5c23f"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:fb16ba563d59003028b678d2361a27f7e4ae0ab29c7a80690efa20d829c81fdb"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:2277c36d2d6cdb7876c274547921a42425b6810d38354327dd65a8009acf870c"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:1cf75d32e8d250781940d07f7eece253f2fe9ecdb1dc7ba6e3833fa17b82fcbc"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1b3b31884dc8e9b21508bb76da80ebf7308fdb947a17affce815665d5c4d028"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b10122d8d8e30afb43bb1fe21a3619f62c3e2574bff2699cf8af8b0b6c5dc4a3"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:d88b96ff0fe8e91f6ef01ba50b0d71db5017fa4e3b1d99681cec89a85faf7bf7"}, + {file = "tokenizers-0.15.2-pp38-pypy38_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:37aaec5a52e959892870a7c47cef80c53797c0db9149d458460f4f31e2fb250e"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:e2ea752f2b0fe96eb6e2f3adbbf4d72aaa1272079b0dfa1145507bd6a5d537e6"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:4b19a808d8799fda23504a5cd31d2f58e6f52f140380082b352f877017d6342b"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:64c86e5e068ac8b19204419ed8ca90f9d25db20578f5881e337d203b314f4104"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:de19c4dc503c612847edf833c82e9f73cd79926a384af9d801dcf93f110cea4e"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:ea09acd2fe3324174063d61ad620dec3bcf042b495515f27f638270a7d466e8b"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:cf27fd43472e07b57cf420eee1e814549203d56de00b5af8659cb99885472f1f"}, + {file = "tokenizers-0.15.2-pp39-pypy39_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:7ca22bd897537a0080521445d91a58886c8c04084a6a19e6c78c586e0cfa92a5"}, + {file = "tokenizers-0.15.2.tar.gz", hash = "sha256:e6e9c6e019dd5484be5beafc775ae6c925f4c69a3487040ed09b45e13df2cb91"}, +] + +[package.dependencies] +huggingface_hub = ">=0.16.4,<1.0" + +[package.extras] +dev = ["tokenizers[testing]"] +docs = ["setuptools_rust", "sphinx", "sphinx_rtd_theme"] +testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests"] + [[package]] name = "tomli" version = "2.0.1" @@ -3413,6 +3540,74 @@ notebook = ["ipywidgets (>=6)"] slack = ["slack-sdk"] telegram = ["requests"] +[[package]] +name = "transformers" +version = "4.39.3" +description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "transformers-4.39.3-py3-none-any.whl", hash = "sha256:7838034a12cca3168247f9d2d1dba6724c9de3ae0f73a108258c6b8fc5912601"}, + {file = "transformers-4.39.3.tar.gz", hash = "sha256:2586e5ff4150f122716fc40f5530e92871befc051848fbe82600969c535b762d"}, +] + +[package.dependencies] +filelock = "*" +huggingface-hub = ">=0.19.3,<1.0" +numpy = ">=1.17" +packaging = ">=20.0" +pyyaml = ">=5.1" +regex = "!=2019.12.17" +requests = "*" +safetensors = ">=0.4.1" +tokenizers = ">=0.14,<0.19" +tqdm = ">=4.27" + +[package.extras] +accelerate = ["accelerate (>=0.21.0)"] +agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch"] +all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm", "tokenizers (>=0.14,<0.19)", "torch", "torchaudio", "torchvision"] +audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] +codecarbon = ["codecarbon (==1.2.0)"] +deepspeed = ["accelerate (>=0.21.0)", "deepspeed (>=0.9.3)"] +deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.21.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder (>=0.3.0)", "nltk", "optuna", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] +dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "decord (==0.6.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", 
"jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm", "tokenizers (>=0.14,<0.19)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.14,<0.19)", "urllib3 (<2.0.0)"] +dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "librosa", "nltk", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm", "tokenizers (>=0.14,<0.19)", "torch", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +docs = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "hf-doc-builder", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm", "tokenizers (>=0.14,<0.19)", "torch", "torchaudio", "torchvision"] +docs-specific = ["hf-doc-builder"] +flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)"] +flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] +ftfy 
= ["ftfy"] +integrations = ["optuna", "ray[tune] (>=2.7.0)", "sigopt"] +ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"] +modelcreation = ["cookiecutter (==1.7.3)"] +natten = ["natten (>=0.14.6,<0.15.0)"] +onnx = ["onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "tf2onnx"] +onnxruntime = ["onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"] +optuna = ["optuna"] +quality = ["GitPython (<3.1.19)", "datasets (!=2.5.0)", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "ruff (==0.1.5)", "urllib3 (<2.0.0)"] +ray = ["ray[tune] (>=2.7.0)"] +retrieval = ["datasets (!=2.5.0)", "faiss-cpu"] +sagemaker = ["sagemaker (>=2.31.0)"] +sentencepiece = ["protobuf", "sentencepiece (>=0.1.91,!=0.1.92)"] +serving = ["fastapi", "pydantic", "starlette", "uvicorn"] +sigopt = ["sigopt"] +sklearn = ["scikit-learn"] +speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] +testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder (>=0.3.0)", "nltk", "parameterized", "protobuf", "psutil", "pydantic", "pytest (>=7.2.0,<8.0.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "tensorboard", "timeout-decorator"] +tf = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"] +tf-cpu = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow-cpu (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"] +tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] +timm = ["timm"] +tokenizers = ["tokenizers (>=0.14,<0.19)"] +torch = ["accelerate (>=0.21.0)", "torch"] +torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] +torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"] +torchhub = ["filelock", "huggingface-hub (>=0.19.3,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.14,<0.19)", "torch", "tqdm (>=4.27)"] +video = ["av (==9.2.0)", "decord (==0.6.0)"] +vision = ["Pillow (>=10.0.1,<=15.0)"] + [[package]] name = "triton" version = "2.2.0" @@ -3589,4 +3784,4 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "174c7d42f8039eedd2c447a4e6cae5169782cbd94346b5606572a0010194ca05" +content-hash = "5ebd02dac0322efe1236eb9fec84c471edd0c5373cc8967b1982314164b3bf50" diff --git a/pyproject.toml b/pyproject.toml index 972c1b61..b2526e5c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ robomimic = "0.2.0" gymnasium-robotics = "^1.2.4" gymnasium = "^0.29.1" cmake = "^3.29.0.1" +transformers = "^4.39.3" [tool.poetry.group.dev.dependencies] From 65ef8c30d03fd5c8904f2f914870447c712387a9 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Tue, 2 Apr 2024 19:13:49 +0100 Subject: [PATCH 02/25] backup wip --- lerobot/common/datasets/factory.py | 86 +++++++++++++++++++++---- lerobot/common/policies/act/detr_vae.py | 9 +-- lerobot/common/policies/act/policy.py | 4 +- 3 files changed, 80 insertions(+), 19 deletions(-) diff --git a/lerobot/common/datasets/factory.py 
b/lerobot/common/datasets/factory.py index 47a15ea4..b394e830 100644 --- a/lerobot/common/datasets/factory.py +++ b/lerobot/common/datasets/factory.py @@ -125,32 +125,92 @@ def make_offline_buffer( # TODO(rcadene): remove this and put it in config. Ideally we want to reproduce SOTA results just with mean_std normalization_mode = "mean_std" if cfg.env.name == "aloha" else "min_max" - + # TODO(now): These stats are needed to use their pretrained model for sim_transfer_cube_human. # (Pdb) stats['observation']['state']['mean'] # tensor([-0.0071, -0.6293, 1.0351, -0.0517, -0.4642, -0.0754, 0.4751, -0.0373, # -0.3324, 0.9034, -0.2258, -0.3127, -0.2412, 0.6866]) - stats['observation', 'state', 'mean'] = torch.tensor([-0.00740268, -0.63187766, 1.0356655 , -0.05027218, -0.46199223, - -0.07467502, 0.47467607, -0.03615446, -0.33203387, 0.9038929 , - -0.22060776, -0.31011587, -0.23484458, 0.6842416 ]) + stats["observation", "state", "mean"] = torch.tensor( + [ + -0.00740268, + -0.63187766, + 1.0356655, + -0.05027218, + -0.46199223, + -0.07467502, + 0.47467607, + -0.03615446, + -0.33203387, + 0.9038929, + -0.22060776, + -0.31011587, + -0.23484458, + 0.6842416, + ] + ) # (Pdb) stats['observation']['state']['std'] # tensor([0.0022, 0.0520, 0.0291, 0.0092, 0.0267, 0.0145, 0.0563, 0.0179, 0.0494, # 0.0326, 0.0476, 0.0535, 0.0956, 0.0513]) - stats['observation', 'state', 'std'] = torch.tensor([0.01219023, 0.2975381 , 0.16728032, 0.04733803, 0.1486037 , - 0.08788499, 0.31752336, 0.1049916 , 0.27933604, 0.18094037, - 0.26604933, 0.30466506, 0.5298686 , 0.25505227]) + stats["observation", "state", "std"] = torch.tensor( + [ + 0.01219023, + 0.2975381, + 0.16728032, + 0.04733803, + 0.1486037, + 0.08788499, + 0.31752336, + 0.1049916, + 0.27933604, + 0.18094037, + 0.26604933, + 0.30466506, + 0.5298686, + 0.25505227, + ] + ) # (Pdb) stats['action']['mean'] # tensor([-0.0075, -0.6346, 1.0353, -0.0465, -0.4686, -0.0738, 0.3723, -0.0396, # -0.3184, 0.8991, -0.2065, -0.3182, -0.2338, 0.5593]) - stats['action']['mean'] = torch.tensor([-0.00756444, -0.6281845 , 1.0312834 , -0.04664314, -0.47211358, - -0.074527 , 0.37389806, -0.03718753, -0.3261143 , 0.8997205 , - -0.21371077, -0.31840396, -0.23360962, 0.551947]) + stats["action"]["mean"] = torch.tensor( + [ + -0.00756444, + -0.6281845, + 1.0312834, + -0.04664314, + -0.47211358, + -0.074527, + 0.37389806, + -0.03718753, + -0.3261143, + 0.8997205, + -0.21371077, + -0.31840396, + -0.23360962, + 0.551947, + ] + ) # (Pdb) stats['action']['std'] # tensor([0.0023, 0.0514, 0.0290, 0.0086, 0.0263, 0.0143, 0.0593, 0.0185, 0.0510, # 0.0328, 0.0478, 0.0531, 0.0945, 0.0794]) - stats['action']['std'] = torch.tensor([0.01252818, 0.2957442 , 0.16701928, 0.04584508, 0.14833844, - 0.08763024, 0.30665937, 0.10600077, 0.27572668, 0.1805853 , - 0.26304692, 0.30708534, 0.5305411 , 0.38381037]) + stats["action"]["std"] = torch.tensor( + [ + 0.01252818, + 0.2957442, + 0.16701928, + 0.04584508, + 0.14833844, + 0.08763024, + 0.30665937, + 0.10600077, + 0.27572668, + 0.1805853, + 0.26304692, + 0.30708534, + 0.5305411, + 0.38381037, + ] + ) transforms.append(NormalizeTransform(stats, in_keys, mode=normalization_mode)) offline_buffer.set_transform(transforms) diff --git a/lerobot/common/policies/act/detr_vae.py b/lerobot/common/policies/act/detr_vae.py index 4d5525f2..f21308ad 100644 --- a/lerobot/common/policies/act/detr_vae.py +++ b/lerobot/common/policies/act/detr_vae.py @@ -2,7 +2,6 @@ import numpy as np import torch from torch import nn from torch.autograd import Variable -from transformers 
import DetrForObjectDetection from .backbone import build_backbone from .transformer import TransformerEncoder, TransformerEncoderLayer, build_transformer @@ -74,7 +73,7 @@ class ActionChunkingTransformer(nn.Module): hidden_dim = transformer.d_model self.action_head = nn.Linear(hidden_dim, action_dim) self.is_pad_head = nn.Linear(hidden_dim, 1) - # Positional embedding to be used as input to the latent vae_encoder (if applicable) and for the + # Positional embedding to be used as input to the latent vae_encoder (if applicable) and for the self.pos_embed = nn.Embedding(horizon, hidden_dim) if backbones is not None: self.input_proj = nn.Conv2d(backbones[0].num_channels, hidden_dim, kernel_size=1) @@ -134,7 +133,9 @@ class ActionChunkingTransformer(nn.Module): pos_embed = self.pos_table.clone().detach() pos_embed = pos_embed.permute(1, 0, 2) # (seq+1, 1, hidden_dim) # query model - vae_encoder_output = self.vae_encoder(vae_encoder_input, pos=pos_embed) # , src_key_padding_mask=is_pad) + vae_encoder_output = self.vae_encoder( + vae_encoder_input, pos=pos_embed + ) # , src_key_padding_mask=is_pad) vae_encoder_output = vae_encoder_output[0] # take cls output only latent_info = self.latent_proj(vae_encoder_output) mu = latent_info[:, : self.latent_dim] @@ -219,7 +220,7 @@ def build(args): backbones.append(backbone) transformer = build_transformer(args) - + vae_encoder = build_vae_encoder(args) model = ActionChunkingTransformer( diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index a88f7640..5cf74ae5 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -54,7 +54,7 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): Args: vae: Whether to use the variational objective. TODO(now): Give more details. temporal_agg: Whether to do temporal aggregation. For each timestep during rollout, the action - returned as an exponential moving average of previously generated actions for that timestep. + returned as an exponential moving average of previously generated actions for that timestep. n_obs_steps: Number of time steps worth of observation to use as input. horizon: The number of actions to generate in one forward pass. kl_weight: Weight for KL divergence. Defaults to None. 
Only applicable when using the variational @@ -120,7 +120,7 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): "action": action.to(self.device, non_blocking=True), } return out - + start_time = time.time() batch = replay_buffer.sample(batch_size) From 110ac5ffa123c64eb61a313eb08638ed6efe84ee Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Wed, 3 Apr 2024 14:21:07 +0100 Subject: [PATCH 03/25] backup wip --- lerobot/common/envs/aloha/env.py | 1 - lerobot/common/policies/act/detr_vae.py | 216 ++++++++++----------- lerobot/common/policies/act/policy.py | 5 +- lerobot/common/policies/act/transformer.py | 85 ++------ lerobot/configs/policy/act.yaml | 2 +- scripts/convert_act_weights.py | 64 ++++++ 6 files changed, 182 insertions(+), 191 deletions(-) create mode 100644 scripts/convert_act_weights.py diff --git a/lerobot/common/envs/aloha/env.py b/lerobot/common/envs/aloha/env.py index 8f907650..ad8087d0 100644 --- a/lerobot/common/envs/aloha/env.py +++ b/lerobot/common/envs/aloha/env.py @@ -191,7 +191,6 @@ class AlohaEnv(AbstractEnv): { "observation": TensorDict(obs, batch_size=[]), "reward": torch.tensor([reward], dtype=torch.float32), - # success and done are true when coverage > self.success_threshold in env "done": torch.tensor([done], dtype=torch.bool), "success": torch.tensor([success], dtype=torch.bool), }, diff --git a/lerobot/common/policies/act/detr_vae.py b/lerobot/common/policies/act/detr_vae.py index f21308ad..ff137a34 100644 --- a/lerobot/common/policies/act/detr_vae.py +++ b/lerobot/common/policies/act/detr_vae.py @@ -1,18 +1,12 @@ +import einops import numpy as np import torch from torch import nn -from torch.autograd import Variable from .backbone import build_backbone from .transformer import TransformerEncoder, TransformerEncoderLayer, build_transformer -def reparametrize(mu, logvar): - std = logvar.div(2).exp() - eps = Variable(std.data.new(std.size()).normal_()) - return mu + std * eps - - def get_sinusoid_encoding_table(n_position, d_hid): def get_position_angle_vec(position): return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)] @@ -27,7 +21,7 @@ def get_sinusoid_encoding_table(n_position, d_hid): class ActionChunkingTransformer(nn.Module): """ Action Chunking Transformer as per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware - (https://arxiv.org/abs/2304.13705). + (paper: https://arxiv.org/abs/2304.13705, code: https://github.com/tonyzhaozh/act) Note: In this code we use the symbols `vae_encoder`, 'encoder', `decoder`. The meanings are as follows. - The `vae_encoder` is, as per the literature around conditional variational auto-encoders (cVAE), the @@ -49,7 +43,7 @@ class ActionChunkingTransformer(nn.Module): """ def __init__( - self, backbones, transformer, vae_encoder, state_dim, action_dim, horizon, camera_names, vae + self, backbones, transformer, vae_encoder, state_dim, action_dim, horizon, camera_names, use_vae ): """Initializes the model. Parameters: @@ -63,134 +57,124 @@ class ActionChunkingTransformer(nn.Module): state_dim: Robot positional state dimension. action_dim: Action dimension. horizon: The number of actions to generate in one forward pass. - vae: Whether to use the variational objective. TODO(now): Give more details. + use_vae: Whether to use the variational objective. TODO(now): Give more details. 
""" super().__init__() + self.camera_names = camera_names self.transformer = transformer self.vae_encoder = vae_encoder - self.vae = vae + self.use_vae = use_vae hidden_dim = transformer.d_model - self.action_head = nn.Linear(hidden_dim, action_dim) - self.is_pad_head = nn.Linear(hidden_dim, 1) - # Positional embedding to be used as input to the latent vae_encoder (if applicable) and for the - self.pos_embed = nn.Embedding(horizon, hidden_dim) - if backbones is not None: - self.input_proj = nn.Conv2d(backbones[0].num_channels, hidden_dim, kernel_size=1) - self.backbones = nn.ModuleList(backbones) - self.input_proj_robot_state = nn.Linear(state_dim, hidden_dim) - else: - # input_dim = 14 + 7 # robot_state + env_state - self.input_proj_robot_state = nn.Linear(state_dim, hidden_dim) - # TODO(rcadene): understand what is env_state, and why it needs to be 7 - self.input_proj_env_state = nn.Linear(state_dim // 2, hidden_dim) - self.pos = torch.nn.Embedding(2, hidden_dim) - self.backbones = None - # vae_encoder extra parameters - self.latent_dim = 32 # final size of latent z # TODO tune - self.cls_embed = nn.Embedding(1, hidden_dim) # extra cls token embedding - self.vae_encoder_action_proj = nn.Linear(14, hidden_dim) # project action to embedding - self.vae_encoder_joint_proj = nn.Linear(14, hidden_dim) # project qpos to embedding - self.latent_proj = nn.Linear( - hidden_dim, self.latent_dim * 2 - ) # project hidden state to latent std, var - self.register_buffer( - "pos_table", get_sinusoid_encoding_table(1 + 1 + horizon, hidden_dim) - ) # [CLS], qpos, a_seq + # BERT style VAE encoder with input [cls, *joint_space_configuration, *action_sequence]. + # The cls token forms parameters of the latent's distribution (like this [*means, *log_variances]). + if use_vae: + self.cls_embed = nn.Embedding(1, hidden_dim) + # Projection layer for joint-space configuration to hidden dimension. + self.vae_encoder_robot_state_input_proj = nn.Linear(state_dim, hidden_dim) + # Projection layer for action (joint-space target) to hidden dimension. + self.vae_encoder_action_input_proj = nn.Linear(state_dim, hidden_dim) + # Final size of latent z. TODO(now): Add to hyperparams. + self.latent_dim = 32 + # Projection layer from the VAE encoder's output to the latent distribution's parameter space. + self.vae_encoder_latent_output_proj = nn.Linear(hidden_dim, self.latent_dim * 2) + # Fixed sinusoidal positional embedding the whole input to the VAE encoder. + self.register_buffer( + "vae_encoder_pos_enc", get_sinusoid_encoding_table(1 + 1 + horizon, hidden_dim) + ) - # decoder extra parameters - self.latent_out_proj = nn.Linear(self.latent_dim, hidden_dim) # project latent sample to embedding + # Transformer encoder input projections. The tokens will be structured like + # [latent, robot_state, image_feature_map_pixels]. + self.backbones = nn.ModuleList(backbones) + self.encoder_img_feat_input_proj = nn.Conv2d(backbones[0].num_channels, hidden_dim, kernel_size=1) + self.encoder_robot_state_input_proj = nn.Linear(state_dim, hidden_dim) + self.encoder_latent_input_proj = nn.Linear(self.latent_dim, hidden_dim) + # TODO(now): Fix this nonsense. One positional embedding is needed. We should extract the image + # feature dimension with a dry run. self.additional_pos_embed = nn.Embedding( 2, hidden_dim ) # learned position embedding for proprio and latent - def forward(self, qpos, image, env_state, actions=None, is_pad=None): + # Transformer decoder. 
+ # Learnable positional embedding for the transformer's decoder (in the style of DETR object queries). + self.decoder_pos_embed = nn.Embedding(horizon, hidden_dim) + # Final action regression head on the output of the transformer's decoder. + self.action_head = nn.Linear(hidden_dim, action_dim) + + def forward(self, robot_state, image, actions=None): """ - qpos: batch, qpos_dim - image: batch, num_cam, channel, height, width - env_state: None - actions: batch, seq, action_dim + Args: + robot_state: (B, J) batch of robot joint configurations. + image: (B, N, C, H, W) batch of N camera frames. + actions: (B, S, A) batch of actions from the target dataset which must be provided if the + VAE is enabled and the model is in training mode. """ - is_training = actions is not None # train or val - bs, _ = qpos.shape - ### Obtain latent z from action sequence - if self.vae and is_training: - # project action sequence to embedding dim, and concat with a CLS token - action_embed = self.vae_encoder_action_proj(actions) # (bs, seq, hidden_dim) - qpos_embed = self.vae_encoder_joint_proj(qpos) # (bs, hidden_dim) - qpos_embed = torch.unsqueeze(qpos_embed, axis=1) # (bs, 1, hidden_dim) - cls_embed = self.cls_embed.weight # (1, hidden_dim) - cls_embed = torch.unsqueeze(cls_embed, axis=0).repeat(bs, 1, 1) # (bs, 1, hidden_dim) - vae_encoder_input = torch.cat( - [cls_embed, qpos_embed, action_embed], axis=1 - ) # (bs, seq+1, hidden_dim) - vae_encoder_input = vae_encoder_input.permute(1, 0, 2) # (seq+1, bs, hidden_dim) - # do not mask cls token - # cls_joint_is_pad = torch.full((bs, 2), False).to(qpos.device) # False: not a padding - # is_pad = torch.cat([cls_joint_is_pad, is_pad], axis=1) # (bs, seq+1) - # obtain position embedding - pos_embed = self.pos_table.clone().detach() - pos_embed = pos_embed.permute(1, 0, 2) # (seq+1, 1, hidden_dim) - # query model + if self.use_vae and self.training: + assert ( + actions is not None + ), "actions must be provided when using the variational objective in training mode." + + batch_size, _ = robot_state.shape + + # Prepare the latent for input to the transformer. + if self.use_vae and actions is not None: + # Prepare the input to the VAE encoder: [cls, *joint_space_configuration, *action_sequence]. + cls_embed = einops.repeat(self.cls_embed.weight, "1 d -> b 1 d", b=batch_size) # (B, 1, D) + robot_state_embed = self.vae_encoder_robot_state_input_proj(robot_state).unsqueeze(1) # (B, 1, D) + action_embed = self.vae_encoder_action_input_proj(actions) # (B, S, D) + vae_encoder_input = torch.cat([cls_embed, robot_state_embed, action_embed], axis=1) # (B, S+2, D) + vae_encoder_input = vae_encoder_input.permute(1, 0, 2) # (S+2, B, D) + # Note: detach() shouldn't be necessary but leaving it the same as the original code just in case. + # Prepare fixed positional embedding. + pos_embed = self.vae_encoder_pos_enc.clone().detach().permute(1, 0, 2) # (S+2, 1, D) + # Forward pass through VAE encoder and sample the latent with the reparameterization trick. 
vae_encoder_output = self.vae_encoder( vae_encoder_input, pos=pos_embed - ) # , src_key_padding_mask=is_pad) + ) # , src_key_padding_mask=is_pad) # TODO(now) vae_encoder_output = vae_encoder_output[0] # take cls output only - latent_info = self.latent_proj(vae_encoder_output) - mu = latent_info[:, : self.latent_dim] - logvar = latent_info[:, self.latent_dim :] - latent_sample = reparametrize(mu, logvar) - latent_input = self.latent_out_proj(latent_sample) + latent_pdf_params = self.vae_encoder_latent_output_proj(vae_encoder_output) + mu = latent_pdf_params[:, : self.latent_dim] + logvar = latent_pdf_params[:, self.latent_dim :] + # Use reparameterization trick to sample from the latent's PDF. + latent_sample = mu + logvar.div(2).exp() * torch.randn_like(mu) else: + # When not using the VAE encoder, we set the latent to be all zeros. mu = logvar = None - latent_sample = torch.zeros([bs, self.latent_dim], dtype=torch.float32).to(qpos.device) - latent_input = self.latent_out_proj(latent_sample) + latent_sample = torch.zeros([batch_size, self.latent_dim], dtype=robot_state.dtype).to( + robot_state.device + ) - if self.backbones is not None: - # Image observation features and position embeddings - all_cam_features = [] - all_cam_pos = [] - for cam_id, _ in enumerate(self.camera_names): - features, pos = self.backbones[0](image[:, cam_id]) # HARDCODED - features = features[0] # take the last layer feature - pos = pos[0] - all_cam_features.append(self.input_proj(features)) - all_cam_pos.append(pos) - # proprioception features - proprio_input = self.input_proj_robot_state(qpos) - # fold camera dimension into width dimension - src = torch.cat(all_cam_features, axis=3) - pos = torch.cat(all_cam_pos, axis=3) - hs = self.transformer( - src, - None, - self.pos_embed.weight, - pos, - latent_input, - proprio_input, - self.additional_pos_embed.weight, - )[0] - else: - qpos = self.input_proj_robot_state(qpos) - env_state = self.input_proj_env_state(env_state) - transformer_input = torch.cat([qpos, env_state], axis=1) # seq length = 2 - hs = self.transformer(transformer_input, None, self.pos_embed.weight, self.pos.weight)[0] - a_hat = self.action_head(hs) - is_pad_hat = self.is_pad_head(hs) - return a_hat, is_pad_hat, [mu, logvar] + # Prepare all other transformer inputs. + # Image observation features and position embeddings. + all_cam_features = [] + all_cam_pos = [] + for cam_id, _ in enumerate(self.camera_names): + # TODO(now): remove the positional embedding from the backbones. + features, pos = self.backbones[0](image[:, cam_id]) # HARDCODED + features = features[0] # take the last layer feature + pos = pos[0] + all_cam_features.append(self.encoder_img_feat_input_proj(features)) + all_cam_pos.append(pos) + # Concatenate image observation feature maps along the width dimension. + transformer_input = torch.cat(all_cam_features, axis=3) + # TODO(now): remove the positional embedding from the backbones. + pos = torch.cat(all_cam_pos, axis=3) + robot_state_embed = self.encoder_robot_state_input_proj(robot_state) + latent_embed = self.encoder_latent_input_proj(latent_sample) + # Run the transformer and project the outputs to the action space. 
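        # Rough data-flow sketch (illustrative, not part of the patch) for the call below:
        #   encoder tokens:  [latent, robot_state, one token per image feature map "pixel"], each of dim D
        #   decoder:         `horizon` zero-initialized tokens plus the learned decoder positional queries
        #   output:          (B, horizon, D) -> action_head -> (B, horizon, action_dim), i.e. one action chunk.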
+ transformer_output = self.transformer( + transformer_input, + query_embed=self.decoder_pos_embed.weight, + pos_embed=pos, + latent_input=latent_embed, + proprio_input=robot_state_embed, + additional_pos_embed=self.additional_pos_embed.weight, + ) + a_hat = self.action_head(transformer_output) -def mlp(input_dim, hidden_dim, output_dim, hidden_depth): - if hidden_depth == 0: - mods = [nn.Linear(input_dim, output_dim)] - else: - mods = [nn.Linear(input_dim, hidden_dim), nn.ReLU(inplace=True)] - for _ in range(hidden_depth - 1): - mods += [nn.Linear(hidden_dim, hidden_dim), nn.ReLU(inplace=True)] - mods.append(nn.Linear(hidden_dim, output_dim)) - trunk = nn.Sequential(*mods) - return trunk + return a_hat, [mu, logvar] def build_vae_encoder(args): @@ -231,7 +215,7 @@ def build(args): action_dim=args.action_dim, horizon=args.num_queries, camera_names=args.camera_names, - vae=args.vae, + use_vae=args.vae, ) n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index 5cf74ae5..7d24620a 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -224,8 +224,7 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): if is_pad is not None: is_pad = is_pad[:, : self.model.num_queries] - breakpoint() - a_hat, is_pad_hat, (mu, logvar) = self.model(qpos, image, env_state, actions, is_pad) + a_hat, (mu, logvar) = self.model(qpos, image, env_state, actions, is_pad) all_l1 = F.l1_loss(actions, a_hat, reduction="none") l1 = all_l1.mean() if is_pad is None else (all_l1 * ~is_pad.unsqueeze(-1)).mean() @@ -240,5 +239,5 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): loss_dict["loss"] = loss_dict["l1"] return loss_dict else: - action, _, (_, _) = self.model(qpos, image, env_state) # no action, sample from prior + action, _ = self.model(qpos, image, env_state) # no action, sample from prior return action diff --git a/lerobot/common/policies/act/transformer.py b/lerobot/common/policies/act/transformer.py index 20cfc815..11d5a013 100644 --- a/lerobot/common/policies/act/transformer.py +++ b/lerobot/common/policies/act/transformer.py @@ -26,10 +26,8 @@ class Transformer(nn.Module): dropout=0.1, activation="relu", normalize_before=False, - return_intermediate_dec=False, ): super().__init__() - encoder_layer = TransformerEncoderLayer( d_model, nhead, dim_feedforward, dropout, activation, normalize_before ) @@ -40,9 +38,7 @@ class Transformer(nn.Module): d_model, nhead, dim_feedforward, dropout, activation, normalize_before ) decoder_norm = nn.LayerNorm(d_model) - self.decoder = TransformerDecoder( - decoder_layer, num_decoder_layers, decoder_norm, return_intermediate=return_intermediate_dec - ) + self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm) self._reset_parameters() @@ -57,7 +53,6 @@ class Transformer(nn.Module): def forward( self, src, - mask, query_embed, pos_embed, latent_input=None, @@ -68,10 +63,10 @@ class Transformer(nn.Module): if len(src.shape) == 4: # has H and W # flatten NxCxHxW to HWxNxC bs, c, h, w = src.shape + # Each "pixel" on the feature maps will form a token. 
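            # Shape sketch for the flatten/permute below (hypothetical sizes, not part of the patch):
            #   src: (B, C, H, W) --flatten(2)--> (B, C, H*W) --permute(2, 0, 1)--> (H*W, B, C)
            # e.g. torch.zeros(8, 256, 15, 20).flatten(2).permute(2, 0, 1).shape == (300, 8, 256),
            # so the spatial grid becomes a sequence of H*W tokens in the (sequence, batch, channel)
            # layout expected by nn.MultiheadAttention with batch_first=False.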
src = src.flatten(2).permute(2, 0, 1) pos_embed = pos_embed.flatten(2).permute(2, 0, 1).repeat(1, bs, 1) query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) - # mask = mask.flatten(1) additional_pos_embed = additional_pos_embed.unsqueeze(1).repeat(1, bs, 1) # seq, bs, dim pos_embed = torch.cat([additional_pos_embed, pos_embed], axis=0) @@ -87,9 +82,9 @@ class Transformer(nn.Module): query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) tgt = torch.zeros_like(query_embed) - memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) - hs = self.decoder(tgt, memory, memory_key_padding_mask=mask, pos=pos_embed, query_pos=query_embed) - hs = hs.transpose(1, 2) + memory = self.encoder(src, pos=pos_embed) + hs = self.decoder(tgt, memory, pos=pos_embed, query_pos=query_embed) + hs = hs.transpose(0, 1) return hs @@ -103,14 +98,12 @@ class TransformerEncoder(nn.Module): def forward( self, src, - mask: Optional[Tensor] = None, - src_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, ): output = src for layer in self.layers: - output = layer(output, src_mask=mask, src_key_padding_mask=src_key_padding_mask, pos=pos) + output = layer(output, pos=pos) if self.norm is not None: output = self.norm(output) @@ -119,52 +112,33 @@ class TransformerEncoder(nn.Module): class TransformerDecoder(nn.Module): - def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): + def __init__(self, decoder_layer, num_layers, norm=None): super().__init__() self.layers = _get_clones(decoder_layer, num_layers) self.num_layers = num_layers self.norm = norm - self.return_intermediate = return_intermediate def forward( self, tgt, memory, - tgt_mask: Optional[Tensor] = None, - memory_mask: Optional[Tensor] = None, - tgt_key_padding_mask: Optional[Tensor] = None, - memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None, ): output = tgt - intermediate = [] - for layer in self.layers: output = layer( output, memory, - tgt_mask=tgt_mask, - memory_mask=memory_mask, - tgt_key_padding_mask=tgt_key_padding_mask, - memory_key_padding_mask=memory_key_padding_mask, pos=pos, query_pos=query_pos, ) - if self.return_intermediate: - intermediate.append(self.norm(output)) if self.norm is not None: output = self.norm(output) - if self.return_intermediate: - intermediate.pop() - intermediate.append(output) - if self.return_intermediate: - return torch.stack(intermediate) - - return output.unsqueeze(0) + return output class TransformerEncoderLayer(nn.Module): @@ -192,12 +166,10 @@ class TransformerEncoderLayer(nn.Module): def forward_post( self, src, - src_mask: Optional[Tensor] = None, - src_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, ): q = k = self.with_pos_embed(src, pos) - src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0] + src2 = self.self_attn(q, k, value=src)[0] src = src + self.dropout1(src2) src = self.norm1(src) src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) @@ -208,13 +180,11 @@ class TransformerEncoderLayer(nn.Module): def forward_pre( self, src, - src_mask: Optional[Tensor] = None, - src_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, ): src2 = self.norm1(src) q = k = self.with_pos_embed(src2, pos) - src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0] + src2 = self.self_attn(q, k, value=src2)[0] src = src + self.dropout1(src2) src2 = 
self.norm2(src) src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) @@ -224,13 +194,11 @@ class TransformerEncoderLayer(nn.Module): def forward( self, src, - src_mask: Optional[Tensor] = None, - src_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, ): if self.normalize_before: - return self.forward_pre(src, src_mask, src_key_padding_mask, pos) - return self.forward_post(src, src_mask, src_key_padding_mask, pos) + return self.forward_pre(src, pos) + return self.forward_post(src, pos) class TransformerDecoderLayer(nn.Module): @@ -262,23 +230,17 @@ class TransformerDecoderLayer(nn.Module): self, tgt, memory, - tgt_mask: Optional[Tensor] = None, - memory_mask: Optional[Tensor] = None, - tgt_key_padding_mask: Optional[Tensor] = None, - memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None, ): q = k = self.with_pos_embed(tgt, query_pos) - tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)[0] + tgt2 = self.self_attn(q, k, value=tgt)[0] tgt = tgt + self.dropout1(tgt2) tgt = self.norm1(tgt) tgt2 = self.multihead_attn( query=self.with_pos_embed(tgt, query_pos), key=self.with_pos_embed(memory, pos), value=memory, - attn_mask=memory_mask, - key_padding_mask=memory_key_padding_mask, )[0] tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) @@ -291,24 +253,18 @@ class TransformerDecoderLayer(nn.Module): self, tgt, memory, - tgt_mask: Optional[Tensor] = None, - memory_mask: Optional[Tensor] = None, - tgt_key_padding_mask: Optional[Tensor] = None, - memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None, ): tgt2 = self.norm1(tgt) q = k = self.with_pos_embed(tgt2, query_pos) - tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask)[0] + tgt2 = self.self_attn(q, k, value=tgt2)[0] tgt = tgt + self.dropout1(tgt2) tgt2 = self.norm2(tgt) tgt2 = self.multihead_attn( query=self.with_pos_embed(tgt2, query_pos), key=self.with_pos_embed(memory, pos), value=memory, - attn_mask=memory_mask, - key_padding_mask=memory_key_padding_mask, )[0] tgt = tgt + self.dropout2(tgt2) tgt2 = self.norm3(tgt) @@ -320,10 +276,6 @@ class TransformerDecoderLayer(nn.Module): self, tgt, memory, - tgt_mask: Optional[Tensor] = None, - memory_mask: Optional[Tensor] = None, - tgt_key_padding_mask: Optional[Tensor] = None, - memory_key_padding_mask: Optional[Tensor] = None, pos: Optional[Tensor] = None, query_pos: Optional[Tensor] = None, ): @@ -331,16 +283,10 @@ class TransformerDecoderLayer(nn.Module): return self.forward_pre( tgt, memory, - tgt_mask, - memory_mask, - tgt_key_padding_mask, - memory_key_padding_mask, pos, query_pos, ) - return self.forward_post( - tgt, memory, tgt_mask, memory_mask, tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos - ) + return self.forward_post(tgt, memory, pos, query_pos) def _get_clones(module, n): @@ -356,7 +302,6 @@ def build_transformer(args): num_encoder_layers=args.enc_layers, num_decoder_layers=args.dec_layers, normalize_before=args.pre_norm, - return_intermediate_dec=True, ) diff --git a/lerobot/configs/policy/act.yaml b/lerobot/configs/policy/act.yaml index 0244944b..1086b595 100644 --- a/lerobot/configs/policy/act.yaml +++ b/lerobot/configs/policy/act.yaml @@ -29,7 +29,7 @@ policy: hidden_dim: 512 dim_feedforward: 3200 enc_layers: 4 - dec_layers: 7 + dec_layers: 1 nheads: 8 #camera_names: [top, front_close, left_pillar, right_pillar] 
camera_names: [top] diff --git a/scripts/convert_act_weights.py b/scripts/convert_act_weights.py new file mode 100644 index 00000000..d0c0c3e7 --- /dev/null +++ b/scripts/convert_act_weights.py @@ -0,0 +1,64 @@ +import torch + +from lerobot.common.policies.factory import make_policy +from lerobot.common.utils import init_hydra_config + +cfg = init_hydra_config( + "/home/alexander/Projects/lerobot/outputs/train/act_aloha_sim_transfer_cube_human/.hydra/config.yaml" +) + +policy = make_policy(cfg) + +state_dict = torch.load("/home/alexander/Projects/act/outputs/sim_transfer_cube_human_vae/policy_last.ckpt") + + +# Replace keys based on what they start with. + +start_replacements = [ + ("model.query_embed.weight", "model.pos_embed.weight"), + ("model.pos_table", "model.vae_encoder_pos_enc"), + ("model.pos_embed.weight", "model.decoder_pos_embed.weight"), + ("model.encoder.", "model.vae_encoder."), + ("model.encoder_action_proj.", "model.vae_encoder_action_input_proj."), + ("model.encoder_joint_proj.", "model.vae_encoder_robot_state_input_proj."), + ("model.latent_proj.", "model.vae_encoder_latent_output_proj."), + ("model.latent_proj.", "model.vae_encoder_latent_output_proj."), + ("model.input_proj.", "model.encoder_img_feat_input_proj."), + ("model.input_proj_robot_state", "model.encoder_robot_state_input_proj"), + ("model.latent_out_proj.", "model.encoder_latent_input_proj."), +] + +for to_replace, replace_with in start_replacements: + for k in list(state_dict.keys()): + if k.startswith(to_replace): + k_ = replace_with + k.removeprefix(to_replace) + state_dict[k_] = state_dict[k] + del state_dict[k] + +# Remove keys based on what they start with. + +start_removals = [ + # There is a bug that means the pretrained model doesn't even use the final decoder layers. 
+ *[f"model.transformer.decoder.layers.{i}" for i in range(1, 7)], + "model.is_pad_head.", +] + +for to_remove in start_removals: + for k in list(state_dict.keys()): + if k.startswith(to_remove): + del state_dict[k] + +missing_keys, unexpected_keys = policy.load_state_dict(state_dict, strict=False) + +if len(missing_keys) != 0: + print("MISSING KEYS") + print(missing_keys) +if len(unexpected_keys) != 0: + print("UNEXPECTED KEYS") + print(unexpected_keys) + +# if len(missing_keys) != 0 or len(unexpected_keys) != 0: +# print("Failed due to mismatch in state dicts.") +# exit() + +policy.save("/tmp/weights.pth") From 278336a39a32ec0a7f7af87dac5b65c21368e488 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Wed, 3 Apr 2024 19:23:22 +0100 Subject: [PATCH 04/25] backup wip --- lerobot/common/policies/act/detr_vae.py | 85 ++--- lerobot/common/policies/act/transformer.py | 350 ++++++++------------- pyproject.toml | 3 + 3 files changed, 185 insertions(+), 253 deletions(-) diff --git a/lerobot/common/policies/act/detr_vae.py b/lerobot/common/policies/act/detr_vae.py index ff137a34..aaf4d098 100644 --- a/lerobot/common/policies/act/detr_vae.py +++ b/lerobot/common/policies/act/detr_vae.py @@ -4,7 +4,7 @@ import torch from torch import nn from .backbone import build_backbone -from .transformer import TransformerEncoder, TransformerEncoderLayer, build_transformer +from .transformer import Transformer, TransformerEncoder def get_sinusoid_encoding_table(n_position, d_hid): @@ -124,16 +124,14 @@ class ActionChunkingTransformer(nn.Module): robot_state_embed = self.vae_encoder_robot_state_input_proj(robot_state).unsqueeze(1) # (B, 1, D) action_embed = self.vae_encoder_action_input_proj(actions) # (B, S, D) vae_encoder_input = torch.cat([cls_embed, robot_state_embed, action_embed], axis=1) # (B, S+2, D) - vae_encoder_input = vae_encoder_input.permute(1, 0, 2) # (S+2, B, D) # Note: detach() shouldn't be necessary but leaving it the same as the original code just in case. # Prepare fixed positional embedding. - pos_embed = self.vae_encoder_pos_enc.clone().detach().permute(1, 0, 2) # (S+2, 1, D) + pos_embed = self.vae_encoder_pos_enc.clone().detach() # (1, S+2, D) # Forward pass through VAE encoder and sample the latent with the reparameterization trick. - vae_encoder_output = self.vae_encoder( - vae_encoder_input, pos=pos_embed - ) # , src_key_padding_mask=is_pad) # TODO(now) - vae_encoder_output = vae_encoder_output[0] # take cls output only - latent_pdf_params = self.vae_encoder_latent_output_proj(vae_encoder_output) + cls_token_out = self.vae_encoder( + vae_encoder_input.permute(1, 0, 2), pos=pos_embed.permute(1, 0, 2) + )[0] # (B, D) + latent_pdf_params = self.vae_encoder_latent_output_proj(cls_token_out) mu = latent_pdf_params[:, : self.latent_dim] logvar = latent_pdf_params[:, self.latent_dim :] # Use reparameterization trick to sample from the latent's PDF. @@ -151,10 +149,11 @@ class ActionChunkingTransformer(nn.Module): all_cam_pos = [] for cam_id, _ in enumerate(self.camera_names): # TODO(now): remove the positional embedding from the backbones. 
- features, pos = self.backbones[0](image[:, cam_id]) # HARDCODED - features = features[0] # take the last layer feature + cam_features, pos = self.backbones[0](image[:, cam_id]) # HARDCODED + cam_features = cam_features[0] # take the last layer feature pos = pos[0] - all_cam_features.append(self.encoder_img_feat_input_proj(features)) + cam_features = self.encoder_img_feat_input_proj(cam_features) # (B, C, h, w) + all_cam_features.append(cam_features) all_cam_pos.append(pos) # Concatenate image observation feature maps along the width dimension. transformer_input = torch.cat(all_cam_features, axis=3) @@ -163,36 +162,25 @@ class ActionChunkingTransformer(nn.Module): robot_state_embed = self.encoder_robot_state_input_proj(robot_state) latent_embed = self.encoder_latent_input_proj(latent_sample) + # TODO(now): Explain all of this madness. + transformer_input = torch.cat( + [ + torch.stack([latent_embed, robot_state_embed], axis=0), + transformer_input.flatten(2).permute(2, 0, 1), + ] + ) + pos_embed = torch.cat( + [self.additional_pos_embed.weight.unsqueeze(1), pos.flatten(2).permute(2, 0, 1)], axis=0 + ) + # Run the transformer and project the outputs to the action space. transformer_output = self.transformer( transformer_input, - query_embed=self.decoder_pos_embed.weight, - pos_embed=pos, - latent_input=latent_embed, - proprio_input=robot_state_embed, - additional_pos_embed=self.additional_pos_embed.weight, - ) - a_hat = self.action_head(transformer_output) - - return a_hat, [mu, logvar] - - -def build_vae_encoder(args): - d_model = args.hidden_dim # 256 - dropout = args.dropout # 0.1 - nhead = args.nheads # 8 - dim_feedforward = args.dim_feedforward # 2048 - num_encoder_layers = args.enc_layers # 4 # TODO shared with VAE decoder - normalize_before = args.pre_norm # False - activation = "relu" - - encoder_layer = TransformerEncoderLayer( - d_model, nhead, dim_feedforward, dropout, activation, normalize_before - ) - encoder_norm = nn.LayerNorm(d_model) if normalize_before else None - encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) - - return encoder + encoder_pos=pos_embed, + decoder_pos=self.decoder_pos_embed.weight.unsqueeze(1), + ).transpose(0, 1) # back to (B, S, C) + actions = self.action_head(transformer_output) + return actions, [mu, logvar] def build(args): @@ -203,9 +191,26 @@ def build(args): backbone = build_backbone(args) backbones.append(backbone) - transformer = build_transformer(args) + transformer = Transformer( + d_model=args.hidden_dim, + dropout=args.dropout, + nhead=args.nheads, + dim_feedforward=args.dim_feedforward, + num_encoder_layers=args.enc_layers, + num_decoder_layers=args.dec_layers, + normalize_before=args.pre_norm, + ) - vae_encoder = build_vae_encoder(args) + # TODO(now): args.enc_layers shouldn't be shared with the transformer decoder + vae_encoder = TransformerEncoder( + num_layers=args.enc_layers, + d_model=args.hidden_dim, + nhead=args.nheads, + dim_feedforward=args.dim_feedforward, + dropout=args.dropout, + activation="relu", + normalize_before=args.pre_norm, + ) model = ActionChunkingTransformer( backbones, diff --git a/lerobot/common/policies/act/transformer.py b/lerobot/common/policies/act/transformer.py index 11d5a013..7e71f3ea 100644 --- a/lerobot/common/policies/act/transformer.py +++ b/lerobot/common/policies/act/transformer.py @@ -1,13 +1,7 @@ """ -DETR Transformer class. 
- -Copy-paste from torch.nn.Transformer with modifications: - * positional encodings are passed in MHattention - * extra LN at the end of encoder is removed - * decoder returns a stack of activations from all decoding layers +TODO(now) """ -import copy from typing import Optional import torch @@ -28,117 +22,68 @@ class Transformer(nn.Module): normalize_before=False, ): super().__init__() - encoder_layer = TransformerEncoderLayer( - d_model, nhead, dim_feedforward, dropout, activation, normalize_before + self.encoder = TransformerEncoder( + num_encoder_layers, d_model, nhead, dim_feedforward, dropout, activation, normalize_before ) - encoder_norm = nn.LayerNorm(d_model) if normalize_before else None - self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) - - decoder_layer = TransformerDecoderLayer( - d_model, nhead, dim_feedforward, dropout, activation, normalize_before + self.decoder = TransformerDecoder( + num_decoder_layers, d_model, nhead, dim_feedforward, dropout, activation, normalize_before ) - decoder_norm = nn.LayerNorm(d_model) - self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm) - - self._reset_parameters() - self.d_model = d_model self.nhead = nhead + self._init_params() # TODO(now): move to somewhere common - def _reset_parameters(self): + def _init_params(self): for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) - def forward( - self, - src, - query_embed, - pos_embed, - latent_input=None, - proprio_input=None, - additional_pos_embed=None, - ): + def forward(self, x, encoder_pos, decoder_pos): + """ + Args: + x: ((E)ncoder (S)equence, (B)atch, (C)hannels) + decoder_pos: (Decoder Sequence, C) tensor for the decoder's positional embedding. + encoder_pos: (ES, C) tenso + """ # TODO flatten only when input has H and W - if len(src.shape) == 4: # has H and W - # flatten NxCxHxW to HWxNxC - bs, c, h, w = src.shape - # Each "pixel" on the feature maps will form a token. 
- src = src.flatten(2).permute(2, 0, 1) - pos_embed = pos_embed.flatten(2).permute(2, 0, 1).repeat(1, bs, 1) - query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) + bs = x.shape[1] - additional_pos_embed = additional_pos_embed.unsqueeze(1).repeat(1, bs, 1) # seq, bs, dim - pos_embed = torch.cat([additional_pos_embed, pos_embed], axis=0) - - addition_input = torch.stack([latent_input, proprio_input], axis=0) - src = torch.cat([addition_input, src], axis=0) - else: - assert len(src.shape) == 3 - # flatten NxHWxC to HWxNxC - bs, hw, c = src.shape - src = src.permute(1, 0, 2) - pos_embed = pos_embed.unsqueeze(1).repeat(1, bs, 1) - query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) - - tgt = torch.zeros_like(query_embed) - memory = self.encoder(src, pos=pos_embed) - hs = self.decoder(tgt, memory, pos=pos_embed, query_pos=query_embed) - hs = hs.transpose(0, 1) - return hs + encoder_out = self.encoder(x, pos=encoder_pos) + decoder_in = torch.zeros( + (decoder_pos.shape[0], bs, decoder_pos.shape[2]), + dtype=decoder_pos.dtype, + device=decoder_pos.device, + ) + decoder_out = self.decoder(decoder_in, encoder_out, encoder_pos=encoder_pos, decoder_pos=decoder_pos) + return decoder_out class TransformerEncoder(nn.Module): - def __init__(self, encoder_layer, num_layers, norm=None): - super().__init__() - self.layers = _get_clones(encoder_layer, num_layers) - self.num_layers = num_layers - self.norm = norm - - def forward( + def __init__( self, - src, - pos: Optional[Tensor] = None, + num_layers, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, ): - output = src - - for layer in self.layers: - output = layer(output, pos=pos) - - if self.norm is not None: - output = self.norm(output) - - return output - - -class TransformerDecoder(nn.Module): - def __init__(self, decoder_layer, num_layers, norm=None): super().__init__() - self.layers = _get_clones(decoder_layer, num_layers) - self.num_layers = num_layers - self.norm = norm - - def forward( - self, - tgt, - memory, - pos: Optional[Tensor] = None, - query_pos: Optional[Tensor] = None, - ): - output = tgt + self.layers = nn.ModuleList( + [ + TransformerEncoderLayer( + d_model, nhead, dim_feedforward, dropout, activation, normalize_before + ) + for _ in range(num_layers) + ] + ) + self.norm = nn.LayerNorm(d_model) if normalize_before else nn.Identity() + def forward(self, x, pos: Optional[Tensor] = None): for layer in self.layers: - output = layer( - output, - memory, - pos=pos, - query_pos=query_pos, - ) - - if self.norm is not None: - output = self.norm(output) - - return output + x = layer(x, pos=pos) + x = self.norm(x) + return x class TransformerEncoderLayer(nn.Module): @@ -160,45 +105,55 @@ class TransformerEncoderLayer(nn.Module): self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before - def with_pos_embed(self, tensor, pos: Optional[Tensor]): - return tensor if pos is None else tensor + pos - - def forward_post( - self, - src, - pos: Optional[Tensor] = None, - ): - q = k = self.with_pos_embed(src, pos) - src2 = self.self_attn(q, k, value=src)[0] - src = src + self.dropout1(src2) - src = self.norm1(src) - src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) - src = src + self.dropout2(src2) - src = self.norm2(src) - return src - - def forward_pre( - self, - src, - pos: Optional[Tensor] = None, - ): - src2 = self.norm1(src) - q = k = self.with_pos_embed(src2, pos) - src2 = self.self_attn(q, k, value=src2)[0] - src = src + 
self.dropout1(src2) - src2 = self.norm2(src) - src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) - src = src + self.dropout2(src2) - return src - - def forward( - self, - src, - pos: Optional[Tensor] = None, - ): + def forward(self, x, pos: Optional[Tensor] = None): + skip = x if self.normalize_before: - return self.forward_pre(src, pos) - return self.forward_post(src, pos) + x = self.norm1(x) + q = k = x if pos is None else x + pos + x = self.self_attn(q, k, value=x)[0] + x = skip + self.dropout1(x) + if self.normalize_before: + skip = x + x = self.norm2(x) + else: + x = self.norm1(x) + skip = x + x = self.linear2(self.dropout(self.activation(self.linear1(x)))) + x = skip + self.dropout2(x) + if not self.normalize_before: + x = self.norm2(x) + return x + + +class TransformerDecoder(nn.Module): + def __init__( + self, + num_layers, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + ): + super().__init__() + self.layers = nn.ModuleList( + [ + TransformerDecoderLayer( + d_model, nhead, dim_feedforward, dropout, activation, normalize_before + ) + for _ in range(num_layers) + ] + ) + self.num_layers = num_layers + self.norm = nn.LayerNorm(d_model) + + def forward(self, x, encoder_out, decoder_pos: Tensor | None = None, encoder_pos: Tensor | None = None): + for layer in self.layers: + x = layer(x, encoder_out, decoder_pos=decoder_pos, encoder_pos=encoder_pos) + if self.norm is not None: + x = self.norm(x) + return x class TransformerDecoderLayer(nn.Module): @@ -223,86 +178,55 @@ class TransformerDecoderLayer(nn.Module): self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before - def with_pos_embed(self, tensor, pos: Optional[Tensor]): + def maybe_add_pos_embed(self, tensor: Tensor, pos: Tensor | None) -> Tensor: return tensor if pos is None else tensor + pos - def forward_post( - self, - tgt, - memory, - pos: Optional[Tensor] = None, - query_pos: Optional[Tensor] = None, - ): - q = k = self.with_pos_embed(tgt, query_pos) - tgt2 = self.self_attn(q, k, value=tgt)[0] - tgt = tgt + self.dropout1(tgt2) - tgt = self.norm1(tgt) - tgt2 = self.multihead_attn( - query=self.with_pos_embed(tgt, query_pos), - key=self.with_pos_embed(memory, pos), - value=memory, - )[0] - tgt = tgt + self.dropout2(tgt2) - tgt = self.norm2(tgt) - tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) - tgt = tgt + self.dropout3(tgt2) - tgt = self.norm3(tgt) - return tgt - - def forward_pre( - self, - tgt, - memory, - pos: Optional[Tensor] = None, - query_pos: Optional[Tensor] = None, - ): - tgt2 = self.norm1(tgt) - q = k = self.with_pos_embed(tgt2, query_pos) - tgt2 = self.self_attn(q, k, value=tgt2)[0] - tgt = tgt + self.dropout1(tgt2) - tgt2 = self.norm2(tgt) - tgt2 = self.multihead_attn( - query=self.with_pos_embed(tgt2, query_pos), - key=self.with_pos_embed(memory, pos), - value=memory, - )[0] - tgt = tgt + self.dropout2(tgt2) - tgt2 = self.norm3(tgt) - tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) - tgt = tgt + self.dropout3(tgt2) - return tgt - def forward( self, - tgt, - memory, - pos: Optional[Tensor] = None, - query_pos: Optional[Tensor] = None, - ): + x: Tensor, + encoder_out: Tensor, + decoder_pos: Tensor | None = None, + encoder_pos: Tensor | None = None, + ) -> Tensor: + """ + Args: + x: (Decoder Sequence, Batch, Channel) tensor of input tokens. + encoder_out: (Encoder Sequence, B, C) output features from the last layer of the encoder we are + cross-attending with. 
+ decoder_pos: (ES, 1, C) positional embedding for keys (from the encoder). + encoder_pos: (DS, 1, C) Positional_embedding for the queries (from the decoder). + Returns: + (DS, B, C) tensor of decoder output features. + """ + skip = x if self.normalize_before: - return self.forward_pre( - tgt, - memory, - pos, - query_pos, - ) - return self.forward_post(tgt, memory, pos, query_pos) - - -def _get_clones(module, n): - return nn.ModuleList([copy.deepcopy(module) for _ in range(n)]) - - -def build_transformer(args): - return Transformer( - d_model=args.hidden_dim, - dropout=args.dropout, - nhead=args.nheads, - dim_feedforward=args.dim_feedforward, - num_encoder_layers=args.enc_layers, - num_decoder_layers=args.dec_layers, - normalize_before=args.pre_norm, - ) + x = self.norm1(x) + q = k = self.maybe_add_pos_embed(x, decoder_pos) + x = self.self_attn(q, k, value=x)[0] + x = skip + self.dropout1(x) + if self.normalize_before: + skip = x + x = self.norm2(x) + else: + x = self.norm1(x) + skip = x + x = self.multihead_attn( + query=self.maybe_add_pos_embed(x, decoder_pos), + key=self.maybe_add_pos_embed(encoder_out, encoder_pos), + value=encoder_out, + )[0] + x = skip + self.dropout2(x) + if self.normalize_before: + skip = x + x = self.norm3(x) + else: + x = self.norm2(x) + skip = x + x = self.linear2(self.dropout(self.activation(self.linear1(x)))) + x = skip + self.dropout3(x) + if not self.normalize_before: + x = self.norm3(x) + return x def _get_activation_fn(activation): @@ -313,4 +237,4 @@ def _get_activation_fn(activation): return F.gelu if activation == "glu": return F.glu - raise RuntimeError(f"activation should be relu/gelu, not {activation}.") + raise RuntimeError(f"activation should be relu/gelu/glu, not {activation}.") diff --git a/pyproject.toml b/pyproject.toml index b2526e5c..6d76cffc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,3 +101,6 @@ enable = true [build-system] requires = ["poetry-core>=1.0.0", "poetry-dynamic-versioning>=1.0.0,<2.0.0"] build-backend = "poetry_dynamic_versioning.backend" + +[tool.black] +line-length = 110 From 3a4dfa82fe8393e0a401f28d97efa3fd2cac9a05 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Thu, 4 Apr 2024 18:34:41 +0100 Subject: [PATCH 05/25] backup wip --- lerobot/common/policies/act/backbone.py | 115 ---- lerobot/common/policies/act/detr_vae.py | 229 ------- lerobot/common/policies/act/policy.py | 570 ++++++++++++++++-- .../common/policies/act/position_encoding.py | 102 ---- lerobot/common/policies/act/transformer.py | 240 -------- lerobot/common/policies/act/utils.py | 478 --------------- lerobot/configs/policy/act.yaml | 3 +- scripts/convert_act_weights.py | 28 +- 8 files changed, 538 insertions(+), 1227 deletions(-) delete mode 100644 lerobot/common/policies/act/backbone.py delete mode 100644 lerobot/common/policies/act/detr_vae.py delete mode 100644 lerobot/common/policies/act/position_encoding.py delete mode 100644 lerobot/common/policies/act/transformer.py delete mode 100644 lerobot/common/policies/act/utils.py diff --git a/lerobot/common/policies/act/backbone.py b/lerobot/common/policies/act/backbone.py deleted file mode 100644 index 6399d339..00000000 --- a/lerobot/common/policies/act/backbone.py +++ /dev/null @@ -1,115 +0,0 @@ -from typing import List - -import torch -import torchvision -from torch import nn -from torchvision.models._utils import IntermediateLayerGetter - -from .position_encoding import build_position_encoding -from .utils import NestedTensor, is_main_process - - -class FrozenBatchNorm2d(torch.nn.Module): - 
""" - BatchNorm2d where the batch statistics and the affine parameters are fixed. - - Copy-paste from torchvision.misc.ops with added eps before rqsrt, - without which any other policy_models than torchvision.policy_models.resnet[18,34,50,101] - produce nans. - """ - - def __init__(self, n): - super().__init__() - self.register_buffer("weight", torch.ones(n)) - self.register_buffer("bias", torch.zeros(n)) - self.register_buffer("running_mean", torch.zeros(n)) - self.register_buffer("running_var", torch.ones(n)) - - def _load_from_state_dict( - self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs - ): - num_batches_tracked_key = prefix + "num_batches_tracked" - if num_batches_tracked_key in state_dict: - del state_dict[num_batches_tracked_key] - - super()._load_from_state_dict( - state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs - ) - - def forward(self, x): - # move reshapes to the beginning - # to make it fuser-friendly - w = self.weight.reshape(1, -1, 1, 1) - b = self.bias.reshape(1, -1, 1, 1) - rv = self.running_var.reshape(1, -1, 1, 1) - rm = self.running_mean.reshape(1, -1, 1, 1) - eps = 1e-5 - scale = w * (rv + eps).rsqrt() - bias = b - rm * scale - return x * scale + bias - - -class BackboneBase(nn.Module): - def __init__( - self, backbone: nn.Module, train_backbone: bool, num_channels: int, return_interm_layers: bool - ): - super().__init__() - # for name, parameter in backbone.named_parameters(): # only train later layers # TODO do we want this? - # if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: - # parameter.requires_grad_(False) - if return_interm_layers: - return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"} - else: - return_layers = {"layer4": "0"} - self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) - self.num_channels = num_channels - - def forward(self, tensor): - xs = self.body(tensor) - return xs - # out: Dict[str, NestedTensor] = {} - # for name, x in xs.items(): - # m = tensor_list.mask - # assert m is not None - # mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] - # out[name] = NestedTensor(x, mask) - # return out - - -class Backbone(BackboneBase): - """ResNet backbone with frozen BatchNorm.""" - - def __init__(self, name: str, train_backbone: bool, return_interm_layers: bool, dilation: bool): - backbone = getattr(torchvision.models, name)( - replace_stride_with_dilation=[False, False, dilation], - pretrained=is_main_process(), - norm_layer=FrozenBatchNorm2d, - ) # pretrained # TODO do we want frozen batch_norm?? 
- num_channels = 512 if name in ("resnet18", "resnet34") else 2048 - super().__init__(backbone, train_backbone, num_channels, return_interm_layers) - - -class Joiner(nn.Sequential): - def __init__(self, backbone, position_embedding): - super().__init__(backbone, position_embedding) - - def forward(self, tensor_list: NestedTensor): - xs = self[0](tensor_list) - out: List[NestedTensor] = [] - pos = [] - for _, x in xs.items(): - out.append(x) - # position encoding - pos.append(self[1](x).to(x.dtype)) - - return out, pos - - -def build_backbone(args): - position_embedding = build_position_encoding(args) - train_backbone = args.lr_backbone > 0 - return_interm_layers = args.masks - backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation) - model = Joiner(backbone, position_embedding) - model.num_channels = backbone.num_channels - return model diff --git a/lerobot/common/policies/act/detr_vae.py b/lerobot/common/policies/act/detr_vae.py deleted file mode 100644 index aaf4d098..00000000 --- a/lerobot/common/policies/act/detr_vae.py +++ /dev/null @@ -1,229 +0,0 @@ -import einops -import numpy as np -import torch -from torch import nn - -from .backbone import build_backbone -from .transformer import Transformer, TransformerEncoder - - -def get_sinusoid_encoding_table(n_position, d_hid): - def get_position_angle_vec(position): - return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)] - - sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)]) - sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i - sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 - - return torch.FloatTensor(sinusoid_table).unsqueeze(0) - - -class ActionChunkingTransformer(nn.Module): - """ - Action Chunking Transformer as per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware - (paper: https://arxiv.org/abs/2304.13705, code: https://github.com/tonyzhaozh/act) - - Note: In this code we use the symbols `vae_encoder`, 'encoder', `decoder`. The meanings are as follows. - - The `vae_encoder` is, as per the literature around conditional variational auto-encoders (cVAE), the - part of the model that encodes the target data (here, a sequence of actions), and the condition - (here, we include the robot joint-space state as an input to the encoder). - - The `transformer` is the cVAE's decoder. But since we have an option to train this model without the - variational objective (in which case we drop the `vae_encoder` altogether), we don't call it the - `vae_decoder`. - # TODO(now): remove the following - - The `encoder` is actually a component of the cVAE's "decoder". But we refer to it as an "encoder" - because, in terms of the transformer with cross-attention that forms the cVAE's decoder, it is the - "encoder" part. We drop the `vae_` prefix because we have an option to train this model without the - variational objective (in which case we drop the `vae_encoder` altogether), and nothing about this - model has anything to do with a VAE). - - The `decoder` is a building block of the VAE decoder, and is just the "decoder" part of a - transformer with cross-attention. For the same reasoning behind the naming of `encoder`, we make - this term agnostic to the option to use a variational objective for training. - - """ - - def __init__( - self, backbones, transformer, vae_encoder, state_dim, action_dim, horizon, camera_names, use_vae - ): - """Initializes the model. 
- Parameters: - backbones: torch module of the backbone to be used. See backbone.py - transformer: torch module of the transformer architecture. See transformer.py - state_dim: robot state dimension of the environment - horizon: number of object queries, ie detection slot. This is the maximal number of objects - DETR can detect in a single image. For COCO, we recommend 100 queries. - - Args: - state_dim: Robot positional state dimension. - action_dim: Action dimension. - horizon: The number of actions to generate in one forward pass. - use_vae: Whether to use the variational objective. TODO(now): Give more details. - """ - super().__init__() - - self.camera_names = camera_names - self.transformer = transformer - self.vae_encoder = vae_encoder - self.use_vae = use_vae - hidden_dim = transformer.d_model - - # BERT style VAE encoder with input [cls, *joint_space_configuration, *action_sequence]. - # The cls token forms parameters of the latent's distribution (like this [*means, *log_variances]). - if use_vae: - self.cls_embed = nn.Embedding(1, hidden_dim) - # Projection layer for joint-space configuration to hidden dimension. - self.vae_encoder_robot_state_input_proj = nn.Linear(state_dim, hidden_dim) - # Projection layer for action (joint-space target) to hidden dimension. - self.vae_encoder_action_input_proj = nn.Linear(state_dim, hidden_dim) - # Final size of latent z. TODO(now): Add to hyperparams. - self.latent_dim = 32 - # Projection layer from the VAE encoder's output to the latent distribution's parameter space. - self.vae_encoder_latent_output_proj = nn.Linear(hidden_dim, self.latent_dim * 2) - # Fixed sinusoidal positional embedding the whole input to the VAE encoder. - self.register_buffer( - "vae_encoder_pos_enc", get_sinusoid_encoding_table(1 + 1 + horizon, hidden_dim) - ) - - # Transformer encoder input projections. The tokens will be structured like - # [latent, robot_state, image_feature_map_pixels]. - self.backbones = nn.ModuleList(backbones) - self.encoder_img_feat_input_proj = nn.Conv2d(backbones[0].num_channels, hidden_dim, kernel_size=1) - self.encoder_robot_state_input_proj = nn.Linear(state_dim, hidden_dim) - self.encoder_latent_input_proj = nn.Linear(self.latent_dim, hidden_dim) - # TODO(now): Fix this nonsense. One positional embedding is needed. We should extract the image - # feature dimension with a dry run. - self.additional_pos_embed = nn.Embedding( - 2, hidden_dim - ) # learned position embedding for proprio and latent - - # Transformer decoder. - # Learnable positional embedding for the transformer's decoder (in the style of DETR object queries). - self.decoder_pos_embed = nn.Embedding(horizon, hidden_dim) - # Final action regression head on the output of the transformer's decoder. - self.action_head = nn.Linear(hidden_dim, action_dim) - - def forward(self, robot_state, image, actions=None): - """ - Args: - robot_state: (B, J) batch of robot joint configurations. - image: (B, N, C, H, W) batch of N camera frames. - actions: (B, S, A) batch of actions from the target dataset which must be provided if the - VAE is enabled and the model is in training mode. - """ - if self.use_vae and self.training: - assert ( - actions is not None - ), "actions must be provided when using the variational objective in training mode." - - batch_size, _ = robot_state.shape - - # Prepare the latent for input to the transformer. - if self.use_vae and actions is not None: - # Prepare the input to the VAE encoder: [cls, *joint_space_configuration, *action_sequence]. 
- cls_embed = einops.repeat(self.cls_embed.weight, "1 d -> b 1 d", b=batch_size) # (B, 1, D) - robot_state_embed = self.vae_encoder_robot_state_input_proj(robot_state).unsqueeze(1) # (B, 1, D) - action_embed = self.vae_encoder_action_input_proj(actions) # (B, S, D) - vae_encoder_input = torch.cat([cls_embed, robot_state_embed, action_embed], axis=1) # (B, S+2, D) - # Note: detach() shouldn't be necessary but leaving it the same as the original code just in case. - # Prepare fixed positional embedding. - pos_embed = self.vae_encoder_pos_enc.clone().detach() # (1, S+2, D) - # Forward pass through VAE encoder and sample the latent with the reparameterization trick. - cls_token_out = self.vae_encoder( - vae_encoder_input.permute(1, 0, 2), pos=pos_embed.permute(1, 0, 2) - )[0] # (B, D) - latent_pdf_params = self.vae_encoder_latent_output_proj(cls_token_out) - mu = latent_pdf_params[:, : self.latent_dim] - logvar = latent_pdf_params[:, self.latent_dim :] - # Use reparameterization trick to sample from the latent's PDF. - latent_sample = mu + logvar.div(2).exp() * torch.randn_like(mu) - else: - # When not using the VAE encoder, we set the latent to be all zeros. - mu = logvar = None - latent_sample = torch.zeros([batch_size, self.latent_dim], dtype=robot_state.dtype).to( - robot_state.device - ) - - # Prepare all other transformer inputs. - # Image observation features and position embeddings. - all_cam_features = [] - all_cam_pos = [] - for cam_id, _ in enumerate(self.camera_names): - # TODO(now): remove the positional embedding from the backbones. - cam_features, pos = self.backbones[0](image[:, cam_id]) # HARDCODED - cam_features = cam_features[0] # take the last layer feature - pos = pos[0] - cam_features = self.encoder_img_feat_input_proj(cam_features) # (B, C, h, w) - all_cam_features.append(cam_features) - all_cam_pos.append(pos) - # Concatenate image observation feature maps along the width dimension. - transformer_input = torch.cat(all_cam_features, axis=3) - # TODO(now): remove the positional embedding from the backbones. - pos = torch.cat(all_cam_pos, axis=3) - robot_state_embed = self.encoder_robot_state_input_proj(robot_state) - latent_embed = self.encoder_latent_input_proj(latent_sample) - - # TODO(now): Explain all of this madness. - transformer_input = torch.cat( - [ - torch.stack([latent_embed, robot_state_embed], axis=0), - transformer_input.flatten(2).permute(2, 0, 1), - ] - ) - pos_embed = torch.cat( - [self.additional_pos_embed.weight.unsqueeze(1), pos.flatten(2).permute(2, 0, 1)], axis=0 - ) - - # Run the transformer and project the outputs to the action space. 
- transformer_output = self.transformer( - transformer_input, - encoder_pos=pos_embed, - decoder_pos=self.decoder_pos_embed.weight.unsqueeze(1), - ).transpose(0, 1) # back to (B, S, C) - actions = self.action_head(transformer_output) - return actions, [mu, logvar] - - -def build(args): - # From state - # backbone = None # from state for now, no need for conv nets - # From image - backbones = [] - backbone = build_backbone(args) - backbones.append(backbone) - - transformer = Transformer( - d_model=args.hidden_dim, - dropout=args.dropout, - nhead=args.nheads, - dim_feedforward=args.dim_feedforward, - num_encoder_layers=args.enc_layers, - num_decoder_layers=args.dec_layers, - normalize_before=args.pre_norm, - ) - - # TODO(now): args.enc_layers shouldn't be shared with the transformer decoder - vae_encoder = TransformerEncoder( - num_layers=args.enc_layers, - d_model=args.hidden_dim, - nhead=args.nheads, - dim_feedforward=args.dim_feedforward, - dropout=args.dropout, - activation="relu", - normalize_before=args.pre_norm, - ) - - model = ActionChunkingTransformer( - backbones, - transformer, - vae_encoder, - state_dim=args.state_dim, - action_dim=args.action_dim, - horizon=args.num_queries, - camera_names=args.camera_names, - use_vae=args.vae, - ) - - n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) - print("number of parameters: {:.2f}M".format(n_parameters / 1e6)) - - return model diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index 7d24620a..906ea0cd 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -1,50 +1,32 @@ -import logging -import time +"""Action Chunking Transformer Policy +As per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware (https://arxiv.org/abs/2304.13705). +""" + +import logging +import math +import time +from itertools import chain +from typing import Callable, Optional + +import einops +import numpy as np import torch import torch.nn.functional as F # noqa: N812 +import torchvision import torchvision.transforms as transforms +from torch import Tensor, nn +from torchvision.models._utils import IntermediateLayerGetter +from torchvision.ops.misc import FrozenBatchNorm2d from lerobot.common.policies.abstract import AbstractPolicy -from lerobot.common.policies.act.detr_vae import build from lerobot.common.utils import get_safe_torch_device -def build_act_model_and_optimizer(cfg): - model = build(cfg) - - param_dicts = [ - {"params": [p for n, p in model.named_parameters() if "backbone" not in n and p.requires_grad]}, - { - "params": [p for n, p in model.named_parameters() if "backbone" in n and p.requires_grad], - "lr": cfg.lr_backbone, - }, - ] - optimizer = torch.optim.AdamW(param_dicts, lr=cfg.lr, weight_decay=cfg.weight_decay) - - return model, optimizer - - -def kl_divergence(mu, logvar): - batch_size = mu.size(0) - assert batch_size != 0 - if mu.data.ndimension() == 4: - mu = mu.view(mu.size(0), mu.size(1)) - if logvar.data.ndimension() == 4: - logvar = logvar.view(logvar.size(0), logvar.size(1)) - - klds = -0.5 * (1 + logvar - mu.pow(2) - logvar.exp()) - total_kld = klds.sum(1).mean(0, True) - dimension_wise_kld = klds.mean(0) - mean_kld = klds.mean(1).mean(0, True) - - return total_kld, dimension_wise_kld, mean_kld - - class ActionChunkingTransformerPolicy(AbstractPolicy): """ - Action Chunking Transformer as per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware - (https://arxiv.org/abs/2304.13705). 
+ Action Chunking Transformer Policy as per Learning Fine-Grained Bimanual Manipulation with Low-Cost + Hardware (https://arxiv.org/abs/2304.13705). """ name = "act" @@ -68,7 +50,35 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): self.cfg = cfg self.n_action_steps = n_action_steps self.device = get_safe_torch_device(device) - self.model, self.optimizer = build_act_model_and_optimizer(cfg) + + self.model = ActionChunkingTransformer( + cfg, + state_dim=cfg.state_dim, + action_dim=cfg.action_dim, + horizon=cfg.horizon, + camera_names=cfg.camera_names, + use_vae=cfg.vae, + ) + + optimizer_params_dicts = [ + { + "params": [ + p + for n, p in self.model.named_parameters() + if not n.startswith("backbone") and p.requires_grad + ] + }, + { + "params": [ + p + for n, p in self.model.named_parameters() + if n.startswith("backbone") and p.requires_grad + ], + "lr": cfg.lr_backbone, + }, + ] + self.optimizer = torch.optim.AdamW(optimizer_params_dicts, lr=cfg.lr, weight_decay=cfg.weight_decay) + self.kl_weight = self.cfg.kl_weight logging.info(f"KL Weight {self.kl_weight}") self.to(self.device) @@ -140,12 +150,10 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): self.optimizer.step() self.optimizer.zero_grad() - # self.lr_scheduler.step() info = { "loss": loss.item(), "grad_norm": float(grad_norm), - # "lr": self.lr_scheduler.get_last_lr()[0], "lr": self.cfg.lr, "data_s": data_s, "update_s": time.time() - start_time, @@ -213,31 +221,495 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): action = action[: self.n_action_steps] return action - def _forward(self, qpos, image, actions=None, is_pad=None): - env_state = None + def _forward(self, qpos, image, actions=None): normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) image = normalize(image) is_training = actions is not None if is_training: # training time - actions = actions[:, : self.model.num_queries] - if is_pad is not None: - is_pad = is_pad[:, : self.model.num_queries] + actions = actions[:, : self.model.horizon] - a_hat, (mu, logvar) = self.model(qpos, image, env_state, actions, is_pad) + a_hat, (mu, log_sigma_x2) = self.model(qpos, image, actions) all_l1 = F.l1_loss(actions, a_hat, reduction="none") - l1 = all_l1.mean() if is_pad is None else (all_l1 * ~is_pad.unsqueeze(-1)).mean() + l1 = all_l1.mean() loss_dict = {} loss_dict["l1"] = l1 if self.cfg.vae: - total_kld, dim_wise_kld, mean_kld = kl_divergence(mu, logvar) - loss_dict["kl"] = total_kld[0] + # Calculate Dₖₗ(latent_pdf || standard_normal). Note: After computing the KL-divergence for + # each dimension independently, we sum over the latent dimension to get the total + # KL-divergence per batch element, then take the mean over the batch. + # (See App. B of https://arxiv.org/abs/1312.6114 for more details). 
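            # Closed form being computed below (sketch, not part of the patch): for a diagonal
            # Gaussian q = N(mu, sigma^2) against the standard normal p = N(0, I), per latent
            # dimension,
            #     KL(q || p) = -0.5 * (1 + log(sigma^2) - mu^2 - sigma^2)
            # and with log_sigma_x2 = log(sigma^2) this is exactly the expression on the next line,
            # summed over the latent dimension and averaged over the batch.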
+ mean_kld = (-0.5 * (1 + log_sigma_x2 - mu.pow(2) - (log_sigma_x2).exp())).sum(-1).mean() + loss_dict["kl"] = mean_kld loss_dict["loss"] = loss_dict["l1"] + loss_dict["kl"] * self.kl_weight else: loss_dict["loss"] = loss_dict["l1"] return loss_dict else: - action, _ = self.model(qpos, image, env_state) # no action, sample from prior + action, _ = self.model(qpos, image) # no action, sample from prior return action + + +def create_sinusoidal_position_embedding(n_position, d_hid): + def get_position_angle_vec(position): + return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)] + + sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)]) + sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i + sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 + + return torch.FloatTensor(sinusoid_table).unsqueeze(0) + + +# TODO(alexander-soare) move all this code into the policy when we have the policy API established. +class ActionChunkingTransformer(nn.Module): + """ + Action Chunking Transformer as per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware + (paper: https://arxiv.org/abs/2304.13705, code: https://github.com/tonyzhaozh/act) + + Note: In this code we use the terms `vae_encoder`, 'encoder', `decoder`. The meanings are as follows. + - The `vae_encoder` is, as per the literature around variational auto-encoders (VAE), the part of the + model that encodes the target data (a sequence of actions), and the condition (the robot + joint-space). + - A transformer with an `encoder` (not the VAE encoder) and `decoder` (not the VAE decoder) with + cross-attention is used as the VAE decoder. For these terms, we drop the `vae_` prefix because we + have an option to train this model without the variational objective (in which case we drop the + `vae_encoder` altogether, and nothing about this model has anything to do with a VAE). + + Transformer + Used alone for inference + (acts as VAE decoder + during training) + ┌───────────────────────┐ + │ Outputs │ + │ ▲ │ + │ ┌─────►┌───────┐ │ + ┌──────┐ │ │ │Transf.│ │ + │ │ │ ├─────►│decoder│ │ + ┌────┴────┐ │ │ │ │ │ │ + │ │ │ │ ┌───┴───┬─►│ │ │ + │ VAE │ │ │ │ │ └───────┘ │ + │ encoder │ │ │ │Transf.│ │ + │ │ │ │ │encoder│ │ + └───▲─────┘ │ │ │ │ │ + │ │ │ └───▲───┘ │ + │ │ │ │ │ + inputs └─────┼─────┘ │ + │ │ + └───────────────────────┘ + """ + + def __init__(self, args, state_dim, action_dim, horizon, camera_names, use_vae): + """Initializes the model. + Parameters: + state_dim: robot state dimension of the environment + horizon: number of object queries, ie detection slot. This is the maximal number of objects + DETR can detect in a single image. For COCO, we recommend 100 queries. + + Args: + state_dim: Robot positional state dimension. + action_dim: Action dimension. + horizon: The number of actions to generate in one forward pass. + use_vae: Whether to use the variational objective. TODO(now): Give more details. + """ + super().__init__() + + self.camera_names = camera_names + self.use_vae = use_vae + self.horizon = horizon + self.hidden_dim = args.hidden_dim + + transformer_common_kwargs = dict( # noqa: C408 + d_model=self.hidden_dim, + nhead=args.nheads, + dim_feedforward=args.dim_feedforward, + dropout=args.dropout, + activation=args.activation, + normalize_before=args.pre_norm, + ) + + # BERT style VAE encoder with input [cls, *joint_space_configuration, *action_sequence]. 
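        # For example (illustrative only): with horizon=100 the VAE encoder sequence is
        # 1 (cls) + 1 (robot state) + 100 (actions) = 102 tokens of dim hidden_dim, matching the
        # (1 + 1 + horizon)-long sinusoidal table registered below as `vae_encoder_pos_enc`.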
+ # The cls token forms parameters of the latent's distribution (like this [*means, *log_variances]). + if use_vae: + # TODO(now): args.enc_layers shouldn't be shared with the transformer decoder + self.vae_encoder = TransformerEncoder(num_layers=args.enc_layers, **transformer_common_kwargs) + self.cls_embed = nn.Embedding(1, self.hidden_dim) + # Projection layer for joint-space configuration to hidden dimension. + self.vae_encoder_robot_state_input_proj = nn.Linear(state_dim, self.hidden_dim) + # Projection layer for action (joint-space target) to hidden dimension. + self.vae_encoder_action_input_proj = nn.Linear(state_dim, self.hidden_dim) + # Final size of latent z. TODO(now): Add to hyperparams. + self.latent_dim = 32 + # Projection layer from the VAE encoder's output to the latent distribution's parameter space. + self.vae_encoder_latent_output_proj = nn.Linear(self.hidden_dim, self.latent_dim * 2) + # Fixed sinusoidal positional embedding the whole input to the VAE encoder. + self.register_buffer( + "vae_encoder_pos_enc", create_sinusoidal_position_embedding(1 + 1 + horizon, self.hidden_dim) + ) + + # Backbone for image feature extraction. + self.backbone_position_embedding = SinusoidalPositionEmbedding2D(self.hidden_dim // 2) + backbone_model = getattr(torchvision.models, args.backbone)( + replace_stride_with_dilation=[False, False, args.dilation], + pretrained=True, # TODO(now): Add pretrained option + norm_layer=FrozenBatchNorm2d, + ) + # Note: The forward method of this returns a dict: {"feature_map": output}. + self.backbone = IntermediateLayerGetter(backbone_model, return_layers={"layer4": "feature_map"}) + + # Transformer (acts as VAE decoder when training with the variational objective). + self.encoder = TransformerEncoder(num_layers=args.enc_layers, **transformer_common_kwargs) + self.decoder = TransformerDecoder(num_layers=args.dec_layers, **transformer_common_kwargs) + + # Transformer encoder input projections. The tokens will be structured like + # [latent, robot_state, image_feature_map_pixels]. + self.encoder_img_feat_input_proj = nn.Conv2d( + backbone_model.fc.in_features, self.hidden_dim, kernel_size=1 + ) + self.encoder_robot_state_input_proj = nn.Linear(state_dim, self.hidden_dim) + self.encoder_latent_input_proj = nn.Linear(self.latent_dim, self.hidden_dim) + # TODO(now): Fix this nonsense. One positional embedding is needed. We should extract the image + # feature dimension with a dry run. + self.additional_pos_embed = nn.Embedding( + 2, self.hidden_dim + ) # learned position embedding for proprio and latent + + # Transformer decoder. + # Learnable positional embedding for the transformer's decoder (in the style of DETR object queries). + self.decoder_pos_embed_embed = nn.Embedding(horizon, self.hidden_dim) + # Final action regression head on the output of the transformer's decoder. + self.action_head = nn.Linear(self.hidden_dim, action_dim) + + self._reset_parameters() + + def _reset_parameters(self): + """Xavier-uniform initialization of the transformer parameters as in the original code.""" + for p in chain(self.encoder.parameters(), self.decoder.parameters()): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, robot_state, image, actions=None): + """ + Args: + robot_state: (B, J) batch of robot joint configurations. + image: (B, N, C, H, W) batch of N camera frames. + actions: (B, S, A) batch of actions from the target dataset which must be provided if the + VAE is enabled and the model is in training mode. 
+ """ + if self.use_vae and self.training: + assert ( + actions is not None + ), "actions must be provided when using the variational objective in training mode." + + batch_size, _ = robot_state.shape + + # Prepare the latent for input to the transformer. + if self.use_vae and actions is not None: + # Prepare the input to the VAE encoder: [cls, *joint_space_configuration, *action_sequence]. + cls_embed = einops.repeat(self.cls_embed.weight, "1 d -> b 1 d", b=batch_size) # (B, 1, D) + robot_state_embed = self.vae_encoder_robot_state_input_proj(robot_state).unsqueeze(1) # (B, 1, D) + action_embed = self.vae_encoder_action_input_proj(actions) # (B, S, D) + vae_encoder_input = torch.cat([cls_embed, robot_state_embed, action_embed], axis=1) # (B, S+2, D) + # Note: detach() shouldn't be necessary but leaving it the same as the original code just in case. + # Prepare fixed positional embedding. + pos_embed = self.vae_encoder_pos_enc.clone().detach() # (1, S+2, D) + # Forward pass through VAE encoder and sample the latent with the reparameterization trick. + cls_token_out = self.vae_encoder( + vae_encoder_input.permute(1, 0, 2), pos=pos_embed.permute(1, 0, 2) + )[0] # (B, D) + latent_pdf_params = self.vae_encoder_latent_output_proj(cls_token_out) + mu = latent_pdf_params[:, : self.latent_dim] + # This is 2log(sigma). Done this way to match the original implementation. + log_sigma_x2 = latent_pdf_params[:, self.latent_dim :] + # Use reparameterization trick to sample from the latent's PDF. + latent_sample = mu + log_sigma_x2.div(2).exp() * torch.randn_like(mu) + else: + # When not using the VAE encoder, we set the latent to be all zeros. + mu = log_sigma_x2 = None + latent_sample = torch.zeros([batch_size, self.latent_dim], dtype=torch.float32).to( + robot_state.device + ) + + # Prepare all other transformer inputs. + # Image observation features and position embeddings. + all_cam_features = [] + all_cam_pos = [] + for cam_id, _ in enumerate(self.camera_names): + cam_features = self.backbone(image[:, cam_id])["feature_map"] + pos = self.backbone_position_embedding(cam_features).to(dtype=cam_features.dtype) + cam_features = self.encoder_img_feat_input_proj(cam_features) # (B, C, h, w) + all_cam_features.append(cam_features) + all_cam_pos.append(pos) + # Concatenate image observation feature maps along the width dimension. + encoder_in = torch.cat(all_cam_features, axis=3) + pos = torch.cat(all_cam_pos, axis=3) + robot_state_embed = self.encoder_robot_state_input_proj(robot_state) + latent_embed = self.encoder_latent_input_proj(latent_sample) + + # TODO(now): Explain all of this madness. 
+ encoder_in = torch.cat( + [ + torch.stack([latent_embed, robot_state_embed], axis=0), + encoder_in.flatten(2).permute(2, 0, 1), + ] + ) + pos_embed = torch.cat( + [self.additional_pos_embed.weight.unsqueeze(1), pos.flatten(2).permute(2, 0, 1)], axis=0 + ) + + encoder_out = self.encoder(encoder_in, pos=pos_embed) + decoder_in = torch.zeros( + (self.horizon, batch_size, self.hidden_dim), dtype=pos_embed.dtype, device=pos_embed.device + ) + decoder_out = self.decoder( + decoder_in, + encoder_out, + encoder_pos_embed=pos_embed, + decoder_pos_embed=self.decoder_pos_embed_embed.weight.unsqueeze(1), + ).transpose(0, 1) # back to (B, S, C) + + actions = self.action_head(decoder_out) + return actions, [mu, log_sigma_x2] + + +class TransformerEncoder(nn.Module): + def __init__( + self, + num_layers, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + ): + super().__init__() + self.layers = nn.ModuleList( + [ + TransformerEncoderLayer( + d_model, nhead, dim_feedforward, dropout, activation, normalize_before + ) + for _ in range(num_layers) + ] + ) + self.norm = nn.LayerNorm(d_model) if normalize_before else nn.Identity() + + def forward(self, x, pos: Optional[Tensor] = None): + for layer in self.layers: + x = layer(x, pos=pos) + x = self.norm(x) + return x + + +class TransformerEncoderLayer(nn.Module): + def __init__( + self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False + ): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def forward(self, x, pos_embed: Optional[Tensor] = None): + skip = x + if self.normalize_before: + x = self.norm1(x) + q = k = x if pos_embed is None else x + pos_embed + x = self.self_attn(q, k, value=x)[0] + x = skip + self.dropout1(x) + if self.normalize_before: + skip = x + x = self.norm2(x) + else: + x = self.norm1(x) + skip = x + x = self.linear2(self.dropout(self.activation(self.linear1(x)))) + x = skip + self.dropout2(x) + if not self.normalize_before: + x = self.norm2(x) + return x + + +class TransformerDecoder(nn.Module): + def __init__( + self, + num_layers, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + ): + super().__init__() + self.layers = nn.ModuleList( + [ + TransformerDecoderLayer( + d_model, nhead, dim_feedforward, dropout, activation, normalize_before + ) + for _ in range(num_layers) + ] + ) + self.num_layers = num_layers + self.norm = nn.LayerNorm(d_model) + + def forward( + self, x, encoder_out, decoder_pos_embed: Tensor | None = None, encoder_pos_embed: Tensor | None = None + ): + for layer in self.layers: + x = layer( + x, encoder_out, decoder_pos_embed=decoder_pos_embed, encoder_pos_embed=encoder_pos_embed + ) + if self.norm is not None: + x = self.norm(x) + return x + + +class TransformerDecoderLayer(nn.Module): + def __init__( + self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False + ): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, 
dropout=dropout)
+        self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+        # Implementation of Feedforward model
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.norm3 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout)
+        self.dropout3 = nn.Dropout(dropout)
+
+        self.activation = _get_activation_fn(activation)
+        self.normalize_before = normalize_before
+
+    def maybe_add_pos_embed(self, tensor: Tensor, pos_embed: Tensor | None) -> Tensor:
+        return tensor if pos_embed is None else tensor + pos_embed
+
+    def forward(
+        self,
+        x: Tensor,
+        encoder_out: Tensor,
+        decoder_pos_embed: Tensor | None = None,
+        encoder_pos_embed: Tensor | None = None,
+    ) -> Tensor:
+        """
+        Args:
+            x: (Decoder Sequence, Batch, Channel) tensor of input tokens.
+            encoder_out: (Encoder Sequence, B, C) output features from the last layer of the encoder we are
+                cross-attending with.
+            decoder_pos_embed: (DS, 1, C) positional embedding for the queries (from the decoder).
+            encoder_pos_embed: (ES, 1, C) positional embedding for the keys (from the encoder).
+        Returns:
+            (DS, B, C) tensor of decoder output features.
+        """
+        skip = x
+        if self.normalize_before:
+            x = self.norm1(x)
+        q = k = self.maybe_add_pos_embed(x, decoder_pos_embed)
+        x = self.self_attn(q, k, value=x)[0]
+        x = skip + self.dropout1(x)
+        if self.normalize_before:
+            skip = x
+            x = self.norm2(x)
+        else:
+            x = self.norm1(x)
+            skip = x
+        x = self.multihead_attn(
+            query=self.maybe_add_pos_embed(x, decoder_pos_embed),
+            key=self.maybe_add_pos_embed(encoder_out, encoder_pos_embed),
+            value=encoder_out,
+        )[0]
+        x = skip + self.dropout2(x)
+        if self.normalize_before:
+            skip = x
+            x = self.norm3(x)
+        else:
+            x = self.norm2(x)
+            skip = x
+        x = self.linear2(self.dropout(self.activation(self.linear1(x))))
+        x = skip + self.dropout3(x)
+        if not self.normalize_before:
+            x = self.norm3(x)
+        return x
+
+
+class SinusoidalPositionEmbedding2D(nn.Module):
+    """Sinusoidal positional embeddings similar to what's presented in Attention Is All You Need.
+
+    The variation is that the position indices are normalized in [0, 2π] (not quite: the lower bound is 1/H
+    for the vertical direction, and 1/W for the horizontal direction.
+    """
+
+    def __init__(self, dimension: int):
+        """
+        Args:
+            dimension: The desired dimension of the embeddings.
+        """
+        super().__init__()
+        self.dimension = dimension
+        self._two_pi = 2 * math.pi
+        self._eps = 1e-6
+        # Inverse "common ratio" for the geometric progression in sinusoid frequencies.
+        self._temperature = 10000
+
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x: A (B, C, H, W) batch of 2D feature maps to generate the embeddings for.
+        Returns:
+            A (1, C, H, W) batch of corresponding sinusoidal positional embeddings.
+        """
+        not_mask = torch.ones_like(x[0, [0]])  # (1, H, W)
+        # Note: These are like range(1, H+1) and range(1, W+1) respectively, but in most implementations
+        # they would be range(0, H) and range(0, W). Keeping it this way to match the original code.
+        y_range = not_mask.cumsum(1, dtype=torch.float32)
+        x_range = not_mask.cumsum(2, dtype=torch.float32)
+
+        # "Normalize" the position index such that it ranges in [0, 2π].
+        # Note: Adding epsilon on the denominator should not be needed as all values of y_range and x_range
+        # are non-zero by construction. 
This is an artifact of the original code. + y_range = y_range / (y_range[:, -1:, :] + self._eps) * self._two_pi + x_range = x_range / (x_range[:, :, -1:] + self._eps) * self._two_pi + + inverse_frequency = self._temperature ** ( + 2 * (torch.arange(self.dimension, dtype=torch.float32, device=x.device) // 2) / self.dimension + ) + + x_range = x_range.unsqueeze(-1) / inverse_frequency # (1, H, W, 1) + y_range = y_range.unsqueeze(-1) / inverse_frequency # (1, H, W, 1) + + # Note: this stack then flatten operation results in interleaved sine and cosine terms. + # pos_embed_x and pos_embed are (1, H, W, C // 2). + pos_embed_x = torch.stack((x_range[..., 0::2].sin(), x_range[..., 1::2].cos()), dim=-1).flatten(3) + pos_embed_y = torch.stack((y_range[..., 0::2].sin(), y_range[..., 1::2].cos()), dim=-1).flatten(3) + pos_embed = torch.cat((pos_embed_y, pos_embed_x), dim=3).permute(0, 3, 1, 2) # (1, C, H, W) + + return pos_embed + + +def _get_activation_fn(activation: str) -> Callable: + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + raise RuntimeError(f"activation should be relu/gelu/glu, not {activation}.") diff --git a/lerobot/common/policies/act/position_encoding.py b/lerobot/common/policies/act/position_encoding.py deleted file mode 100644 index 63bb4840..00000000 --- a/lerobot/common/policies/act/position_encoding.py +++ /dev/null @@ -1,102 +0,0 @@ -""" -Various positional encodings for the transformer. -""" - -import math - -import torch -from torch import nn - -from .utils import NestedTensor - - -class PositionEmbeddingSine(nn.Module): - """ - This is a more standard version of the position embedding, very similar to the one - used by the Attention is all you need paper, generalized to work on images. - """ - - def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): - super().__init__() - self.num_pos_feats = num_pos_feats - self.temperature = temperature - self.normalize = normalize - if scale is not None and normalize is False: - raise ValueError("normalize should be True if scale is passed") - if scale is None: - scale = 2 * math.pi - self.scale = scale - - def forward(self, tensor): - x = tensor - # mask = tensor_list.mask - # assert mask is not None - # not_mask = ~mask - - not_mask = torch.ones_like(x[0, [0]]) - y_embed = not_mask.cumsum(1, dtype=torch.float32) - x_embed = not_mask.cumsum(2, dtype=torch.float32) - if self.normalize: - eps = 1e-6 - y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale - x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale - - dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) - dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) - - pos_x = x_embed[:, :, :, None] / dim_t - pos_y = y_embed[:, :, :, None] / dim_t - pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) - pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) - pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) - return pos - - -class PositionEmbeddingLearned(nn.Module): - """ - Absolute pos embedding, learned. 
- """ - - def __init__(self, num_pos_feats=256): - super().__init__() - self.row_embed = nn.Embedding(50, num_pos_feats) - self.col_embed = nn.Embedding(50, num_pos_feats) - self.reset_parameters() - - def reset_parameters(self): - nn.init.uniform_(self.row_embed.weight) - nn.init.uniform_(self.col_embed.weight) - - def forward(self, tensor_list: NestedTensor): - x = tensor_list.tensors - h, w = x.shape[-2:] - i = torch.arange(w, device=x.device) - j = torch.arange(h, device=x.device) - x_emb = self.col_embed(i) - y_emb = self.row_embed(j) - pos = ( - torch.cat( - [ - x_emb.unsqueeze(0).repeat(h, 1, 1), - y_emb.unsqueeze(1).repeat(1, w, 1), - ], - dim=-1, - ) - .permute(2, 0, 1) - .unsqueeze(0) - .repeat(x.shape[0], 1, 1, 1) - ) - return pos - - -def build_position_encoding(args): - n_steps = args.hidden_dim // 2 - if args.position_embedding in ("v2", "sine"): - # TODO find a better way of exposing other arguments - position_embedding = PositionEmbeddingSine(n_steps, normalize=True) - elif args.position_embedding in ("v3", "learned"): - position_embedding = PositionEmbeddingLearned(n_steps) - else: - raise ValueError(f"not supported {args.position_embedding}") - - return position_embedding diff --git a/lerobot/common/policies/act/transformer.py b/lerobot/common/policies/act/transformer.py deleted file mode 100644 index 7e71f3ea..00000000 --- a/lerobot/common/policies/act/transformer.py +++ /dev/null @@ -1,240 +0,0 @@ -""" -TODO(now) -""" - -from typing import Optional - -import torch -import torch.nn.functional as F # noqa: N812 -from torch import Tensor, nn - - -class Transformer(nn.Module): - def __init__( - self, - d_model=512, - nhead=8, - num_encoder_layers=6, - num_decoder_layers=6, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - normalize_before=False, - ): - super().__init__() - self.encoder = TransformerEncoder( - num_encoder_layers, d_model, nhead, dim_feedforward, dropout, activation, normalize_before - ) - self.decoder = TransformerDecoder( - num_decoder_layers, d_model, nhead, dim_feedforward, dropout, activation, normalize_before - ) - self.d_model = d_model - self.nhead = nhead - self._init_params() # TODO(now): move to somewhere common - - def _init_params(self): - for p in self.parameters(): - if p.dim() > 1: - nn.init.xavier_uniform_(p) - - def forward(self, x, encoder_pos, decoder_pos): - """ - Args: - x: ((E)ncoder (S)equence, (B)atch, (C)hannels) - decoder_pos: (Decoder Sequence, C) tensor for the decoder's positional embedding. 
- encoder_pos: (ES, C) tenso - """ - # TODO flatten only when input has H and W - bs = x.shape[1] - - encoder_out = self.encoder(x, pos=encoder_pos) - decoder_in = torch.zeros( - (decoder_pos.shape[0], bs, decoder_pos.shape[2]), - dtype=decoder_pos.dtype, - device=decoder_pos.device, - ) - decoder_out = self.decoder(decoder_in, encoder_out, encoder_pos=encoder_pos, decoder_pos=decoder_pos) - return decoder_out - - -class TransformerEncoder(nn.Module): - def __init__( - self, - num_layers, - d_model, - nhead, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - normalize_before=False, - ): - super().__init__() - self.layers = nn.ModuleList( - [ - TransformerEncoderLayer( - d_model, nhead, dim_feedforward, dropout, activation, normalize_before - ) - for _ in range(num_layers) - ] - ) - self.norm = nn.LayerNorm(d_model) if normalize_before else nn.Identity() - - def forward(self, x, pos: Optional[Tensor] = None): - for layer in self.layers: - x = layer(x, pos=pos) - x = self.norm(x) - return x - - -class TransformerEncoderLayer(nn.Module): - def __init__( - self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False - ): - super().__init__() - self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) - # Implementation of Feedforward model - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.dropout = nn.Dropout(dropout) - self.linear2 = nn.Linear(dim_feedforward, d_model) - - self.norm1 = nn.LayerNorm(d_model) - self.norm2 = nn.LayerNorm(d_model) - self.dropout1 = nn.Dropout(dropout) - self.dropout2 = nn.Dropout(dropout) - - self.activation = _get_activation_fn(activation) - self.normalize_before = normalize_before - - def forward(self, x, pos: Optional[Tensor] = None): - skip = x - if self.normalize_before: - x = self.norm1(x) - q = k = x if pos is None else x + pos - x = self.self_attn(q, k, value=x)[0] - x = skip + self.dropout1(x) - if self.normalize_before: - skip = x - x = self.norm2(x) - else: - x = self.norm1(x) - skip = x - x = self.linear2(self.dropout(self.activation(self.linear1(x)))) - x = skip + self.dropout2(x) - if not self.normalize_before: - x = self.norm2(x) - return x - - -class TransformerDecoder(nn.Module): - def __init__( - self, - num_layers, - d_model, - nhead, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - normalize_before=False, - ): - super().__init__() - self.layers = nn.ModuleList( - [ - TransformerDecoderLayer( - d_model, nhead, dim_feedforward, dropout, activation, normalize_before - ) - for _ in range(num_layers) - ] - ) - self.num_layers = num_layers - self.norm = nn.LayerNorm(d_model) - - def forward(self, x, encoder_out, decoder_pos: Tensor | None = None, encoder_pos: Tensor | None = None): - for layer in self.layers: - x = layer(x, encoder_out, decoder_pos=decoder_pos, encoder_pos=encoder_pos) - if self.norm is not None: - x = self.norm(x) - return x - - -class TransformerDecoderLayer(nn.Module): - def __init__( - self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False - ): - super().__init__() - self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) - self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) - # Implementation of Feedforward model - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.dropout = nn.Dropout(dropout) - self.linear2 = nn.Linear(dim_feedforward, d_model) - - self.norm1 = nn.LayerNorm(d_model) - self.norm2 = nn.LayerNorm(d_model) - self.norm3 = 
nn.LayerNorm(d_model) - self.dropout1 = nn.Dropout(dropout) - self.dropout2 = nn.Dropout(dropout) - self.dropout3 = nn.Dropout(dropout) - - self.activation = _get_activation_fn(activation) - self.normalize_before = normalize_before - - def maybe_add_pos_embed(self, tensor: Tensor, pos: Tensor | None) -> Tensor: - return tensor if pos is None else tensor + pos - - def forward( - self, - x: Tensor, - encoder_out: Tensor, - decoder_pos: Tensor | None = None, - encoder_pos: Tensor | None = None, - ) -> Tensor: - """ - Args: - x: (Decoder Sequence, Batch, Channel) tensor of input tokens. - encoder_out: (Encoder Sequence, B, C) output features from the last layer of the encoder we are - cross-attending with. - decoder_pos: (ES, 1, C) positional embedding for keys (from the encoder). - encoder_pos: (DS, 1, C) Positional_embedding for the queries (from the decoder). - Returns: - (DS, B, C) tensor of decoder output features. - """ - skip = x - if self.normalize_before: - x = self.norm1(x) - q = k = self.maybe_add_pos_embed(x, decoder_pos) - x = self.self_attn(q, k, value=x)[0] - x = skip + self.dropout1(x) - if self.normalize_before: - skip = x - x = self.norm2(x) - else: - x = self.norm1(x) - skip = x - x = self.multihead_attn( - query=self.maybe_add_pos_embed(x, decoder_pos), - key=self.maybe_add_pos_embed(encoder_out, encoder_pos), - value=encoder_out, - )[0] - x = skip + self.dropout2(x) - if self.normalize_before: - skip = x - x = self.norm3(x) - else: - x = self.norm2(x) - skip = x - x = self.linear2(self.dropout(self.activation(self.linear1(x)))) - x = skip + self.dropout3(x) - if not self.normalize_before: - x = self.norm3(x) - return x - - -def _get_activation_fn(activation): - """Return an activation function given a string""" - if activation == "relu": - return F.relu - if activation == "gelu": - return F.gelu - if activation == "glu": - return F.glu - raise RuntimeError(f"activation should be relu/gelu/glu, not {activation}.") diff --git a/lerobot/common/policies/act/utils.py b/lerobot/common/policies/act/utils.py deleted file mode 100644 index 0d935839..00000000 --- a/lerobot/common/policies/act/utils.py +++ /dev/null @@ -1,478 +0,0 @@ -""" -Misc functions, including distributed helpers. - -Mostly copy-paste from torchvision references. -""" - -import datetime -import os -import pickle -import subprocess -import time -from collections import defaultdict, deque -from typing import List, Optional - -import torch -import torch.distributed as dist - -# needed due to empty tensor bug in pytorch and torchvision 0.5 -import torchvision -from packaging import version -from torch import Tensor - -if version.parse(torchvision.__version__) < version.parse("0.7"): - from torchvision.ops import _new_empty_tensor - from torchvision.ops.misc import _output_size - - -class SmoothedValue: - """Track a series of values and provide access to smoothed values over a - window or the global series average. - """ - - def __init__(self, window_size=20, fmt=None): - if fmt is None: - fmt = "{median:.4f} ({global_avg:.4f})" - self.deque = deque(maxlen=window_size) - self.total = 0.0 - self.count = 0 - self.fmt = fmt - - def update(self, value, n=1): - self.deque.append(value) - self.count += n - self.total += value * n - - def synchronize_between_processes(self): - """ - Warning: does not synchronize the deque! 
- """ - if not is_dist_avail_and_initialized(): - return - t = torch.tensor([self.count, self.total], dtype=torch.float64, device="cuda") - dist.barrier() - dist.all_reduce(t) - t = t.tolist() - self.count = int(t[0]) - self.total = t[1] - - @property - def median(self): - d = torch.tensor(list(self.deque)) - return d.median().item() - - @property - def avg(self): - d = torch.tensor(list(self.deque), dtype=torch.float32) - return d.mean().item() - - @property - def global_avg(self): - return self.total / self.count - - @property - def max(self): - return max(self.deque) - - @property - def value(self): - return self.deque[-1] - - def __str__(self): - return self.fmt.format( - median=self.median, avg=self.avg, global_avg=self.global_avg, max=self.max, value=self.value - ) - - -def all_gather(data): - """ - Run all_gather on arbitrary picklable data (not necessarily tensors) - Args: - data: any picklable object - Returns: - list[data]: list of data gathered from each rank - """ - world_size = get_world_size() - if world_size == 1: - return [data] - - # serialized to a Tensor - buffer = pickle.dumps(data) - storage = torch.ByteStorage.from_buffer(buffer) - tensor = torch.ByteTensor(storage).to("cuda") - - # obtain Tensor size of each rank - local_size = torch.tensor([tensor.numel()], device="cuda") - size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)] - dist.all_gather(size_list, local_size) - size_list = [int(size.item()) for size in size_list] - max_size = max(size_list) - - # receiving Tensor from all ranks - # we pad the tensor because torch all_gather does not support - # gathering tensors of different shapes - tensor_list = [] - for _ in size_list: - tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda")) - if local_size != max_size: - padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda") - tensor = torch.cat((tensor, padding), dim=0) - dist.all_gather(tensor_list, tensor) - - data_list = [] - for size, tensor in zip(size_list, tensor_list, strict=False): - buffer = tensor.cpu().numpy().tobytes()[:size] - data_list.append(pickle.loads(buffer)) - - return data_list - - -def reduce_dict(input_dict, average=True): - """ - Args: - input_dict (dict): all the values will be reduced - average (bool): whether to do average or sum - Reduce the values in the dictionary from all processes so that all processes - have the averaged results. Returns a dict with the same fields as - input_dict, after reduction. 
- """ - world_size = get_world_size() - if world_size < 2: - return input_dict - with torch.no_grad(): - names = [] - values = [] - # sort the keys so that they are consistent across processes - for k in sorted(input_dict.keys()): - names.append(k) - values.append(input_dict[k]) - values = torch.stack(values, dim=0) - dist.all_reduce(values) - if average: - values /= world_size - reduced_dict = {k: v for k, v in zip(names, values, strict=False)} # noqa: C416 - return reduced_dict - - -class MetricLogger: - def __init__(self, delimiter="\t"): - self.meters = defaultdict(SmoothedValue) - self.delimiter = delimiter - - def update(self, **kwargs): - for k, v in kwargs.items(): - if isinstance(v, torch.Tensor): - v = v.item() - assert isinstance(v, (float, int)) - self.meters[k].update(v) - - def __getattr__(self, attr): - if attr in self.meters: - return self.meters[attr] - if attr in self.__dict__: - return self.__dict__[attr] - raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, attr)) - - def __str__(self): - loss_str = [] - for name, meter in self.meters.items(): - loss_str.append("{}: {}".format(name, str(meter))) - return self.delimiter.join(loss_str) - - def synchronize_between_processes(self): - for meter in self.meters.values(): - meter.synchronize_between_processes() - - def add_meter(self, name, meter): - self.meters[name] = meter - - def log_every(self, iterable, print_freq, header=None): - if not header: - header = "" - start_time = time.time() - end = time.time() - iter_time = SmoothedValue(fmt="{avg:.4f}") - data_time = SmoothedValue(fmt="{avg:.4f}") - space_fmt = ":" + str(len(str(len(iterable)))) + "d" - if torch.cuda.is_available(): - log_msg = self.delimiter.join( - [ - header, - "[{0" + space_fmt + "}/{1}]", - "eta: {eta}", - "{meters}", - "time: {time}", - "data: {data}", - "max mem: {memory:.0f}", - ] - ) - else: - log_msg = self.delimiter.join( - [ - header, - "[{0" + space_fmt + "}/{1}]", - "eta: {eta}", - "{meters}", - "time: {time}", - "data: {data}", - ] - ) - mega_b = 1024.0 * 1024.0 - for i, obj in enumerate(iterable): - data_time.update(time.time() - end) - yield obj - iter_time.update(time.time() - end) - if i % print_freq == 0 or i == len(iterable) - 1: - eta_seconds = iter_time.global_avg * (len(iterable) - i) - eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) - if torch.cuda.is_available(): - print( - log_msg.format( - i, - len(iterable), - eta=eta_string, - meters=str(self), - time=str(iter_time), - data=str(data_time), - memory=torch.cuda.max_memory_allocated() / mega_b, - ) - ) - else: - print( - log_msg.format( - i, - len(iterable), - eta=eta_string, - meters=str(self), - time=str(iter_time), - data=str(data_time), - ) - ) - end = time.time() - total_time = time.time() - start_time - total_time_str = str(datetime.timedelta(seconds=int(total_time))) - print("{} Total time: {} ({:.4f} s / it)".format(header, total_time_str, total_time / len(iterable))) - - -def get_sha(): - cwd = os.path.dirname(os.path.abspath(__file__)) - - def _run(command): - return subprocess.check_output(command, cwd=cwd).decode("ascii").strip() - - sha = "N/A" - diff = "clean" - branch = "N/A" - try: - sha = _run(["git", "rev-parse", "HEAD"]) - subprocess.check_output(["git", "diff"], cwd=cwd) - diff = _run(["git", "diff-index", "HEAD"]) - diff = "has uncommited changes" if diff else "clean" - branch = _run(["git", "rev-parse", "--abbrev-ref", "HEAD"]) - except Exception: - pass - message = f"sha: {sha}, status: {diff}, branch: {branch}" 
- return message - - -def collate_fn(batch): - batch = list(zip(*batch, strict=False)) - batch[0] = nested_tensor_from_tensor_list(batch[0]) - return tuple(batch) - - -def _max_by_axis(the_list): - # type: (List[List[int]]) -> List[int] - maxes = the_list[0] - for sublist in the_list[1:]: - for index, item in enumerate(sublist): - maxes[index] = max(maxes[index], item) - return maxes - - -class NestedTensor: - def __init__(self, tensors, mask: Optional[Tensor]): - self.tensors = tensors - self.mask = mask - - def to(self, device): - # type: (Device) -> NestedTensor # noqa - cast_tensor = self.tensors.to(device) - mask = self.mask - if mask is not None: - assert mask is not None - cast_mask = mask.to(device) - else: - cast_mask = None - return NestedTensor(cast_tensor, cast_mask) - - def decompose(self): - return self.tensors, self.mask - - def __repr__(self): - return str(self.tensors) - - -def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): - # TODO make this more general - if tensor_list[0].ndim == 3: - if torchvision._is_tracing(): - # nested_tensor_from_tensor_list() does not export well to ONNX - # call _onnx_nested_tensor_from_tensor_list() instead - return _onnx_nested_tensor_from_tensor_list(tensor_list) - - # TODO make it support different-sized images - max_size = _max_by_axis([list(img.shape) for img in tensor_list]) - # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) - batch_shape = [len(tensor_list)] + max_size - b, c, h, w = batch_shape - dtype = tensor_list[0].dtype - device = tensor_list[0].device - tensor = torch.zeros(batch_shape, dtype=dtype, device=device) - mask = torch.ones((b, h, w), dtype=torch.bool, device=device) - for img, pad_img, m in zip(tensor_list, tensor, mask, strict=False): - pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) - m[: img.shape[1], : img.shape[2]] = False - else: - raise ValueError("not supported") - return NestedTensor(tensor, mask) - - -# _onnx_nested_tensor_from_tensor_list() is an implementation of -# nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
-@torch.jit.unused -def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: - max_size = [] - for i in range(tensor_list[0].dim()): - max_size_i = torch.max(torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32)).to( - torch.int64 - ) - max_size.append(max_size_i) - max_size = tuple(max_size) - - # work around for - # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) - # m[: img.shape[1], :img.shape[2]] = False - # which is not yet supported in onnx - padded_imgs = [] - padded_masks = [] - for img in tensor_list: - padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape), strict=False)] - padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) - padded_imgs.append(padded_img) - - m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) - padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) - padded_masks.append(padded_mask.to(torch.bool)) - - tensor = torch.stack(padded_imgs) - mask = torch.stack(padded_masks) - - return NestedTensor(tensor, mask=mask) - - -def setup_for_distributed(is_master): - """ - This function disables printing when not in master process - """ - import builtins as __builtin__ - - builtin_print = __builtin__.print - - def print(*args, **kwargs): - force = kwargs.pop("force", False) - if is_master or force: - builtin_print(*args, **kwargs) - - __builtin__.print = print - - -def is_dist_avail_and_initialized(): - if not dist.is_available(): - return False - if not dist.is_initialized(): - return False - return True - - -def get_world_size(): - if not is_dist_avail_and_initialized(): - return 1 - return dist.get_world_size() - - -def get_rank(): - if not is_dist_avail_and_initialized(): - return 0 - return dist.get_rank() - - -def is_main_process(): - return get_rank() == 0 - - -def save_on_master(*args, **kwargs): - if is_main_process(): - torch.save(*args, **kwargs) - - -def init_distributed_mode(args): - if "RANK" in os.environ and "WORLD_SIZE" in os.environ: - args.rank = int(os.environ["RANK"]) - args.world_size = int(os.environ["WORLD_SIZE"]) - args.gpu = int(os.environ["LOCAL_RANK"]) - elif "SLURM_PROCID" in os.environ: - args.rank = int(os.environ["SLURM_PROCID"]) - args.gpu = args.rank % torch.cuda.device_count() - else: - print("Not using distributed mode") - args.distributed = False - return - - args.distributed = True - - torch.cuda.set_device(args.gpu) - args.dist_backend = "nccl" - print("| distributed init (rank {}): {}".format(args.rank, args.dist_url), flush=True) - torch.distributed.init_process_group( - backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank - ) - torch.distributed.barrier() - setup_for_distributed(args.rank == 0) - - -@torch.no_grad() -def accuracy(output, target, topk=(1,)): - """Computes the precision@k for the specified values of k""" - if target.numel() == 0: - return [torch.zeros([], device=output.device)] - maxk = max(topk) - batch_size = target.size(0) - - _, pred = output.topk(maxk, 1, True, True) - pred = pred.t() - correct = pred.eq(target.view(1, -1).expand_as(pred)) - - res = [] - for k in topk: - correct_k = correct[:k].view(-1).float().sum(0) - res.append(correct_k.mul_(100.0 / batch_size)) - return res - - -def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None): - # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor - """ - Equivalent to nn.functional.interpolate, but 
with support for empty batch sizes. - This will eventually be supported natively by PyTorch, and this - class can go away. - """ - if version.parse(torchvision.__version__) < version.parse("0.7"): - if input.numel() > 0: - return torch.nn.functional.interpolate(input, size, scale_factor, mode, align_corners) - - output_shape = _output_size(2, input, size, scale_factor) - output_shape = list(input.shape[:-2]) + list(output_shape) - return _new_empty_tensor(input, output_shape) - else: - return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners) diff --git a/lerobot/configs/policy/act.yaml b/lerobot/configs/policy/act.yaml index 1086b595..22b6cd6f 100644 --- a/lerobot/configs/policy/act.yaml +++ b/lerobot/configs/policy/act.yaml @@ -33,11 +33,10 @@ policy: nheads: 8 #camera_names: [top, front_close, left_pillar, right_pillar] camera_names: [top] - position_embedding: sine - masks: false dilation: false dropout: 0.1 pre_norm: false + activation: relu vae: true diff --git a/scripts/convert_act_weights.py b/scripts/convert_act_weights.py index d0c0c3e7..c8f83422 100644 --- a/scripts/convert_act_weights.py +++ b/scripts/convert_act_weights.py @@ -11,6 +11,19 @@ policy = make_policy(cfg) state_dict = torch.load("/home/alexander/Projects/act/outputs/sim_transfer_cube_human_vae/policy_last.ckpt") +# Remove keys based on what they start with. + +start_removals = [ + # There is a bug that means the pretrained model doesn't even use the final decoder layers. + *[f"model.transformer.decoder.layers.{i}" for i in range(1, 7)], + "model.is_pad_head.", +] + +for to_remove in start_removals: + for k in list(state_dict.keys()): + if k.startswith(to_remove): + del state_dict[k] + # Replace keys based on what they start with. @@ -26,6 +39,9 @@ start_replacements = [ ("model.input_proj.", "model.encoder_img_feat_input_proj."), ("model.input_proj_robot_state", "model.encoder_robot_state_input_proj"), ("model.latent_out_proj.", "model.encoder_latent_input_proj."), + ("model.transformer.encoder.", "model.encoder."), + ("model.transformer.decoder.", "model.decoder."), + ("model.backbones.0.0.body.", "model.backbone."), ] for to_replace, replace_with in start_replacements: @@ -35,18 +51,6 @@ for to_replace, replace_with in start_replacements: state_dict[k_] = state_dict[k] del state_dict[k] -# Remove keys based on what they start with. - -start_removals = [ - # There is a bug that means the pretrained model doesn't even use the final decoder layers. - *[f"model.transformer.decoder.layers.{i}" for i in range(1, 7)], - "model.is_pad_head.", -] - -for to_remove in start_removals: - for k in list(state_dict.keys()): - if k.startswith(to_remove): - del state_dict[k] missing_keys, unexpected_keys = policy.load_state_dict(state_dict, strict=False) From edb125b35116a044574f7d406de19ee368d63583 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Fri, 5 Apr 2024 11:03:28 +0100 Subject: [PATCH 06/25] backup wip --- lerobot/common/policies/act/policy.py | 390 ++++++++++++-------------- lerobot/configs/policy/act.yaml | 9 +- scripts/convert_act_weights.py | 2 + 3 files changed, 188 insertions(+), 213 deletions(-) diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index 906ea0cd..5071c09a 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -1,13 +1,13 @@ """Action Chunking Transformer Policy As per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware (https://arxiv.org/abs/2304.13705). 
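+The original implementation is available at https://github.com/tonyzhaozh/act.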
+The majority of changes here involve removing unused code, unifying naming, and adding helpful comments. """ -import logging import math import time from itertools import chain -from typing import Callable, Optional +from typing import Callable import einops import numpy as np @@ -26,40 +26,56 @@ from lerobot.common.utils import get_safe_torch_device class ActionChunkingTransformerPolicy(AbstractPolicy): """ Action Chunking Transformer Policy as per Learning Fine-Grained Bimanual Manipulation with Low-Cost - Hardware (https://arxiv.org/abs/2304.13705). + Hardware (paper: https://arxiv.org/abs/2304.13705, code: https://github.com/tonyzhaozh/act) + + Note: In this code we use the terms `vae_encoder`, 'encoder', `decoder`. The meanings are as follows. + - The `vae_encoder` is, as per the literature around variational auto-encoders (VAE), the part of the + model that encodes the target data (a sequence of actions), and the condition (the robot + joint-space). + - A transformer with an `encoder` (not the VAE encoder) and `decoder` (not the VAE decoder) with + cross-attention is used as the VAE decoder. For these terms, we drop the `vae_` prefix because we + have an option to train this model without the variational objective (in which case we drop the + `vae_encoder` altogether, and nothing about this model has anything to do with a VAE). + + Transformer + Used alone for inference + (acts as VAE decoder + during training) + ┌───────────────────────┐ + │ Outputs │ + │ ▲ │ + │ ┌─────►┌───────┐ │ + ┌──────┐ │ │ │Transf.│ │ + │ │ │ ├─────►│decoder│ │ + ┌────┴────┐ │ │ │ │ │ │ + │ │ │ │ ┌───┴───┬─►│ │ │ + │ VAE │ │ │ │ │ └───────┘ │ + │ encoder │ │ │ │Transf.│ │ + │ │ │ │ │encoder│ │ + └───▲─────┘ │ │ │ │ │ + │ │ │ └───▲───┘ │ + │ │ │ │ │ + inputs └─────┼─────┘ │ + │ │ + └───────────────────────┘ """ name = "act" def __init__(self, cfg, device, n_action_steps=1): """ - Args: - vae: Whether to use the variational objective. TODO(now): Give more details. - temporal_agg: Whether to do temporal aggregation. For each timestep during rollout, the action - returned as an exponential moving average of previously generated actions for that timestep. - n_obs_steps: Number of time steps worth of observation to use as input. - horizon: The number of actions to generate in one forward pass. - kl_weight: Weight for KL divergence. Defaults to None. Only applicable when using the variational - objective. - batch_size: Training batch size. - grad_clip_norm: Optionally clip the gradients to have this value as the norm at most. Defaults to - None meaning gradient clipping is not applied. - lr: Learning rate. + TODO(alexander-soare): Add documentation for all parameters. 
""" super().__init__(n_action_steps) self.cfg = cfg self.n_action_steps = n_action_steps self.device = get_safe_torch_device(device) - self.model = ActionChunkingTransformer( - cfg, - state_dim=cfg.state_dim, - action_dim=cfg.action_dim, - horizon=cfg.horizon, - camera_names=cfg.camera_names, - use_vae=cfg.vae, - ) + self.model = _ActionChunkingTransformer(cfg) + self._create_optimizer() + self.to(self.device) + def _create_optimizer(self): optimizer_params_dicts = [ { "params": [ @@ -74,14 +90,12 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): for n, p in self.model.named_parameters() if n.startswith("backbone") and p.requires_grad ], - "lr": cfg.lr_backbone, + "lr": self.cfg.lr_backbone, }, ] - self.optimizer = torch.optim.AdamW(optimizer_params_dicts, lr=cfg.lr, weight_decay=cfg.weight_decay) - - self.kl_weight = self.cfg.kl_weight - logging.info(f"KL Weight {self.kl_weight}") - self.to(self.device) + self.optimizer = torch.optim.AdamW( + optimizer_params_dicts, lr=self.cfg.lr, weight_decay=self.cfg.weight_decay + ) def update(self, replay_buffer, step): del step @@ -137,7 +151,6 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): batch = process_batch(batch, self.cfg.horizon, num_slices) data_s = time.time() - start_time - print(data_s) loss = self.compute_loss(batch) loss.backward() @@ -192,16 +205,6 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): "image": observation["image", "top"], "agent_pos": observation["state"], } - # qpos = obs_dict["agent_pos"] - # img = obs_dict["image"] - # qpos_ = torch.load('/tmp/qpos.pth') - # img_ = torch.load('/tmp/curr_image.pth') - # out_ = torch.load('/tmp/out.pth') - # import cv2, numpy as np - # cv2.imwrite("ours.png", (obs_dict["image"][0, 0].permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)) - # cv2.imwrite("theirs.png", (img_[0, 0].permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)) - # out = self._forward(qpos_, img_) - # breakpoint() action = self._forward(qpos=obs_dict["agent_pos"] * 0.182, image=obs_dict["image"]) if self.cfg.temporal_agg: @@ -236,14 +239,14 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): loss_dict = {} loss_dict["l1"] = l1 - if self.cfg.vae: + if self.cfg.use_vae: # Calculate Dₖₗ(latent_pdf || standard_normal). Note: After computing the KL-divergence for # each dimension independently, we sum over the latent dimension to get the total # KL-divergence per batch element, then take the mean over the batch. # (See App. B of https://arxiv.org/abs/1312.6114 for more details). mean_kld = (-0.5 * (1 + log_sigma_x2 - mu.pow(2) - (log_sigma_x2).exp())).sum(-1).mean() loss_dict["kl"] = mean_kld - loss_dict["loss"] = loss_dict["l1"] + loss_dict["kl"] * self.kl_weight + loss_dict["loss"] = loss_dict["l1"] + loss_dict["kl"] * self.cfg.kl_weight else: loss_dict["loss"] = loss_dict["l1"] return loss_dict @@ -252,135 +255,74 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): return action -def create_sinusoidal_position_embedding(n_position, d_hid): - def get_position_angle_vec(position): - return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)] - - sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(n_position)]) - sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i - sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 - - return torch.FloatTensor(sinusoid_table).unsqueeze(0) - - # TODO(alexander-soare) move all this code into the policy when we have the policy API established. 
-class ActionChunkingTransformer(nn.Module): - """ - Action Chunking Transformer as per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware - (paper: https://arxiv.org/abs/2304.13705, code: https://github.com/tonyzhaozh/act) - - Note: In this code we use the terms `vae_encoder`, 'encoder', `decoder`. The meanings are as follows. - - The `vae_encoder` is, as per the literature around variational auto-encoders (VAE), the part of the - model that encodes the target data (a sequence of actions), and the condition (the robot - joint-space). - - A transformer with an `encoder` (not the VAE encoder) and `decoder` (not the VAE decoder) with - cross-attention is used as the VAE decoder. For these terms, we drop the `vae_` prefix because we - have an option to train this model without the variational objective (in which case we drop the - `vae_encoder` altogether, and nothing about this model has anything to do with a VAE). - - Transformer - Used alone for inference - (acts as VAE decoder - during training) - ┌───────────────────────┐ - │ Outputs │ - │ ▲ │ - │ ┌─────►┌───────┐ │ - ┌──────┐ │ │ │Transf.│ │ - │ │ │ ├─────►│decoder│ │ - ┌────┴────┐ │ │ │ │ │ │ - │ │ │ │ ┌───┴───┬─►│ │ │ - │ VAE │ │ │ │ │ └───────┘ │ - │ encoder │ │ │ │Transf.│ │ - │ │ │ │ │encoder│ │ - └───▲─────┘ │ │ │ │ │ - │ │ │ └───▲───┘ │ - │ │ │ │ │ - inputs └─────┼─────┘ │ - │ │ - └───────────────────────┘ - """ - - def __init__(self, args, state_dim, action_dim, horizon, camera_names, use_vae): - """Initializes the model. - Parameters: - state_dim: robot state dimension of the environment - horizon: number of object queries, ie detection slot. This is the maximal number of objects - DETR can detect in a single image. For COCO, we recommend 100 queries. - - Args: - state_dim: Robot positional state dimension. - action_dim: Action dimension. - horizon: The number of actions to generate in one forward pass. - use_vae: Whether to use the variational objective. TODO(now): Give more details. - """ +class _ActionChunkingTransformer(nn.Module): + def __init__(self, cfg): super().__init__() - self.camera_names = camera_names - self.use_vae = use_vae - self.horizon = horizon - self.hidden_dim = args.hidden_dim + self.camera_names = cfg.camera_names + self.use_vae = cfg.use_vae + self.horizon = cfg.horizon + self.d_model = cfg.d_model transformer_common_kwargs = dict( # noqa: C408 - d_model=self.hidden_dim, - nhead=args.nheads, - dim_feedforward=args.dim_feedforward, - dropout=args.dropout, - activation=args.activation, - normalize_before=args.pre_norm, + d_model=self.d_model, + num_heads=cfg.num_heads, + dim_feedforward=cfg.dim_feedforward, + dropout=cfg.dropout, + activation=cfg.activation, + normalize_before=cfg.pre_norm, ) # BERT style VAE encoder with input [cls, *joint_space_configuration, *action_sequence]. # The cls token forms parameters of the latent's distribution (like this [*means, *log_variances]). - if use_vae: - # TODO(now): args.enc_layers shouldn't be shared with the transformer decoder - self.vae_encoder = TransformerEncoder(num_layers=args.enc_layers, **transformer_common_kwargs) - self.cls_embed = nn.Embedding(1, self.hidden_dim) + if self.use_vae: + self.vae_encoder = _TransformerEncoder(num_layers=cfg.vae_enc_layers, **transformer_common_kwargs) + self.vae_encoder_cls_embed = nn.Embedding(1, self.d_model) # Projection layer for joint-space configuration to hidden dimension. 
- self.vae_encoder_robot_state_input_proj = nn.Linear(state_dim, self.hidden_dim) + self.vae_encoder_robot_state_input_proj = nn.Linear(cfg.state_dim, self.d_model) # Projection layer for action (joint-space target) to hidden dimension. - self.vae_encoder_action_input_proj = nn.Linear(state_dim, self.hidden_dim) - # Final size of latent z. TODO(now): Add to hyperparams. - self.latent_dim = 32 + self.vae_encoder_action_input_proj = nn.Linear(cfg.state_dim, self.d_model) + self.latent_dim = cfg.latent_dim # Projection layer from the VAE encoder's output to the latent distribution's parameter space. - self.vae_encoder_latent_output_proj = nn.Linear(self.hidden_dim, self.latent_dim * 2) - # Fixed sinusoidal positional embedding the whole input to the VAE encoder. + self.vae_encoder_latent_output_proj = nn.Linear(self.d_model, self.latent_dim * 2) + # Fixed sinusoidal positional embedding the whole input to the VAE encoder. Unsqueeze for batch + # dimension. self.register_buffer( - "vae_encoder_pos_enc", create_sinusoidal_position_embedding(1 + 1 + horizon, self.hidden_dim) + "vae_encoder_pos_enc", + _create_sinusoidal_position_embedding(1 + 1 + self.horizon, self.d_model).unsqueeze(0), ) # Backbone for image feature extraction. - self.backbone_position_embedding = SinusoidalPositionEmbedding2D(self.hidden_dim // 2) - backbone_model = getattr(torchvision.models, args.backbone)( - replace_stride_with_dilation=[False, False, args.dilation], - pretrained=True, # TODO(now): Add pretrained option + backbone_model = getattr(torchvision.models, cfg.backbone)( + replace_stride_with_dilation=[False, False, cfg.dilation], + pretrained=cfg.pretrained_backbone, norm_layer=FrozenBatchNorm2d, ) # Note: The forward method of this returns a dict: {"feature_map": output}. self.backbone = IntermediateLayerGetter(backbone_model, return_layers={"layer4": "feature_map"}) # Transformer (acts as VAE decoder when training with the variational objective). - self.encoder = TransformerEncoder(num_layers=args.enc_layers, **transformer_common_kwargs) - self.decoder = TransformerDecoder(num_layers=args.dec_layers, **transformer_common_kwargs) + self.encoder = _TransformerEncoder(num_layers=cfg.enc_layers, **transformer_common_kwargs) + self.decoder = _TransformerDecoder(num_layers=cfg.dec_layers, **transformer_common_kwargs) # Transformer encoder input projections. The tokens will be structured like # [latent, robot_state, image_feature_map_pixels]. + self.encoder_robot_state_input_proj = nn.Linear(cfg.state_dim, self.d_model) + self.encoder_latent_input_proj = nn.Linear(self.latent_dim, self.d_model) self.encoder_img_feat_input_proj = nn.Conv2d( - backbone_model.fc.in_features, self.hidden_dim, kernel_size=1 + backbone_model.fc.in_features, self.d_model, kernel_size=1 ) - self.encoder_robot_state_input_proj = nn.Linear(state_dim, self.hidden_dim) - self.encoder_latent_input_proj = nn.Linear(self.latent_dim, self.hidden_dim) - # TODO(now): Fix this nonsense. One positional embedding is needed. We should extract the image - # feature dimension with a dry run. - self.additional_pos_embed = nn.Embedding( - 2, self.hidden_dim - ) # learned position embedding for proprio and latent + # Transformer encoder positional embeddings. + self.encoder_robot_and_latent_pos_embed = nn.Embedding(2, self.d_model) + self.encoder_cam_feat_pos_embed = _SinusoidalPositionEmbedding2D(self.d_model // 2) # Transformer decoder. # Learnable positional embedding for the transformer's decoder (in the style of DETR object queries). 
- self.decoder_pos_embed_embed = nn.Embedding(horizon, self.hidden_dim) + self.decoder_pos_embed = nn.Embedding(self.horizon, self.d_model) + # Final action regression head on the output of the transformer's decoder. - self.action_head = nn.Linear(self.hidden_dim, action_dim) + self.action_head = nn.Linear(self.d_model, cfg.action_dim) self._reset_parameters() @@ -390,7 +332,7 @@ class ActionChunkingTransformer(nn.Module): if p.dim() > 1: nn.init.xavier_uniform_(p) - def forward(self, robot_state, image, actions=None): + def forward(self, robot_state: Tensor, image: Tensor, actions: Tensor | None = None): """ Args: robot_state: (B, J) batch of robot joint configurations. @@ -405,10 +347,12 @@ class ActionChunkingTransformer(nn.Module): batch_size, _ = robot_state.shape - # Prepare the latent for input to the transformer. + # Prepare the latent for input to the transformer encoder. if self.use_vae and actions is not None: # Prepare the input to the VAE encoder: [cls, *joint_space_configuration, *action_sequence]. - cls_embed = einops.repeat(self.cls_embed.weight, "1 d -> b 1 d", b=batch_size) # (B, 1, D) + cls_embed = einops.repeat( + self.vae_encoder_cls_embed.weight, "1 d -> b 1 d", b=batch_size + ) # (B, 1, D) robot_state_embed = self.vae_encoder_robot_state_input_proj(robot_state).unsqueeze(1) # (B, 1, D) action_embed = self.vae_encoder_action_input_proj(actions) # (B, S, D) vae_encoder_input = torch.cat([cls_embed, robot_state_embed, action_embed], axis=1) # (B, S+2, D) @@ -417,7 +361,7 @@ class ActionChunkingTransformer(nn.Module): pos_embed = self.vae_encoder_pos_enc.clone().detach() # (1, S+2, D) # Forward pass through VAE encoder and sample the latent with the reparameterization trick. cls_token_out = self.vae_encoder( - vae_encoder_input.permute(1, 0, 2), pos=pos_embed.permute(1, 0, 2) + vae_encoder_input.permute(1, 0, 2), pos_embed=pos_embed.permute(1, 0, 2) )[0] # (B, D) latent_pdf_params = self.vae_encoder_latent_output_proj(cls_token_out) mu = latent_pdf_params[:, : self.latent_dim] @@ -432,23 +376,25 @@ class ActionChunkingTransformer(nn.Module): robot_state.device ) - # Prepare all other transformer inputs. - # Image observation features and position embeddings. + # Prepare all other transformer encoder inputs. + # Camera observation features and positional embeddings. all_cam_features = [] - all_cam_pos = [] + all_cam_pos_embeds = [] for cam_id, _ in enumerate(self.camera_names): cam_features = self.backbone(image[:, cam_id])["feature_map"] - pos = self.backbone_position_embedding(cam_features).to(dtype=cam_features.dtype) + cam_pos_embed = self.encoder_cam_feat_pos_embed(cam_features).to(dtype=cam_features.dtype) cam_features = self.encoder_img_feat_input_proj(cam_features) # (B, C, h, w) all_cam_features.append(cam_features) - all_cam_pos.append(pos) - # Concatenate image observation feature maps along the width dimension. + all_cam_pos_embeds.append(cam_pos_embed) + # Concatenate camera observation feature maps and positional embeddings along the width dimension. encoder_in = torch.cat(all_cam_features, axis=3) - pos = torch.cat(all_cam_pos, axis=3) + cam_pos_embed = torch.cat(all_cam_pos_embeds, axis=3) + + # Get positional embeddings for robot state and latent. robot_state_embed = self.encoder_robot_state_input_proj(robot_state) latent_embed = self.encoder_latent_input_proj(latent_sample) - # TODO(now): Explain all of this madness. + # Stack encoder input and positional embeddings moving to (S, B, C). 
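+        # The token sequence is [latent, robot_state, *image_feature_map_pixels], so its length is
+        # 2 + (num_cameras * feature_map_height * feature_map_width).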
encoder_in = torch.cat( [ torch.stack([latent_embed, robot_state_embed], axis=0), @@ -456,60 +402,68 @@ class ActionChunkingTransformer(nn.Module): ] ) pos_embed = torch.cat( - [self.additional_pos_embed.weight.unsqueeze(1), pos.flatten(2).permute(2, 0, 1)], axis=0 + [ + self.encoder_robot_and_latent_pos_embed.weight.unsqueeze(1), + cam_pos_embed.flatten(2).permute(2, 0, 1), + ], + axis=0, ) - encoder_out = self.encoder(encoder_in, pos=pos_embed) + # Forward pass through the transformer modules. + encoder_out = self.encoder(encoder_in, pos_embed=pos_embed) decoder_in = torch.zeros( - (self.horizon, batch_size, self.hidden_dim), dtype=pos_embed.dtype, device=pos_embed.device + (self.horizon, batch_size, self.d_model), dtype=pos_embed.dtype, device=pos_embed.device ) decoder_out = self.decoder( decoder_in, encoder_out, encoder_pos_embed=pos_embed, - decoder_pos_embed=self.decoder_pos_embed_embed.weight.unsqueeze(1), - ).transpose(0, 1) # back to (B, S, C) + decoder_pos_embed=self.decoder_pos_embed.weight.unsqueeze(1), + ) + + # Move back to (B, S, C). + decoder_out = decoder_out.transpose(0, 1) actions = self.action_head(decoder_out) + return actions, [mu, log_sigma_x2] -class TransformerEncoder(nn.Module): - def __init__( - self, - num_layers, - d_model, - nhead, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - normalize_before=False, - ): +class _TransformerEncoder(nn.Module): + """Convenience module for running multiple encoder layers, maybe followed by normalization.""" + + def __init__(self, num_layers: int, **encoder_layer_kwargs: dict): super().__init__() self.layers = nn.ModuleList( - [ - TransformerEncoderLayer( - d_model, nhead, dim_feedforward, dropout, activation, normalize_before - ) - for _ in range(num_layers) - ] + [_TransformerEncoderLayer(**encoder_layer_kwargs) for _ in range(num_layers)] + ) + self.norm = ( + nn.LayerNorm(encoder_layer_kwargs["d_model"]) + if encoder_layer_kwargs["normalize_before"] + else nn.Identity() ) - self.norm = nn.LayerNorm(d_model) if normalize_before else nn.Identity() - def forward(self, x, pos: Optional[Tensor] = None): + def forward(self, x: Tensor, pos_embed: Tensor | None = None) -> Tensor: for layer in self.layers: - x = layer(x, pos=pos) + x = layer(x, pos_embed=pos_embed) x = self.norm(x) return x -class TransformerEncoderLayer(nn.Module): +class _TransformerEncoderLayer(nn.Module): def __init__( - self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False + self, + d_model: int, + num_heads: int, + dim_feedforward: int, + dropout: float, + activation: str, + normalize_before: bool, ): super().__init__() - self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) - # Implementation of Feedforward model + self.self_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout) + + # Feed forward layers. 
self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) @@ -522,7 +476,7 @@ class TransformerEncoderLayer(nn.Module): self.activation = _get_activation_fn(activation) self.normalize_before = normalize_before - def forward(self, x, pos_embed: Optional[Tensor] = None): + def forward(self, x, pos_embed: Tensor | None = None) -> Tensor: skip = x if self.normalize_before: x = self.norm1(x) @@ -542,32 +496,23 @@ class TransformerEncoderLayer(nn.Module): return x -class TransformerDecoder(nn.Module): - def __init__( - self, - num_layers, - d_model, - nhead, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - normalize_before=False, - ): +class _TransformerDecoder(nn.Module): + def __init__(self, num_layers: int, **decoder_layer_kwargs): + """Convenience module for running multiple decoder layers followed by normalization.""" super().__init__() self.layers = nn.ModuleList( - [ - TransformerDecoderLayer( - d_model, nhead, dim_feedforward, dropout, activation, normalize_before - ) - for _ in range(num_layers) - ] + [_TransformerDecoderLayer(**decoder_layer_kwargs) for _ in range(num_layers)] ) self.num_layers = num_layers - self.norm = nn.LayerNorm(d_model) + self.norm = nn.LayerNorm(decoder_layer_kwargs["d_model"]) def forward( - self, x, encoder_out, decoder_pos_embed: Tensor | None = None, encoder_pos_embed: Tensor | None = None - ): + self, + x: Tensor, + encoder_out: Tensor, + decoder_pos_embed: Tensor | None = None, + encoder_pos_embed: Tensor | None = None, + ) -> Tensor: for layer in self.layers: x = layer( x, encoder_out, decoder_pos_embed=decoder_pos_embed, encoder_pos_embed=encoder_pos_embed @@ -577,14 +522,21 @@ class TransformerDecoder(nn.Module): return x -class TransformerDecoderLayer(nn.Module): +class _TransformerDecoderLayer(nn.Module): def __init__( - self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu", normalize_before=False + self, + d_model: int, + num_heads: int, + dim_feedforward: int, + dropout: float, + activation: str, + normalize_before: bool, ): super().__init__() - self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) - self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) - # Implementation of Feedforward model + self.self_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout) + self.multihead_attn = nn.MultiheadAttention(d_model, num_heads, dropout=dropout) + + # Feed forward layers. self.linear1 = nn.Linear(d_model, dim_feedforward) self.dropout = nn.Dropout(dropout) self.linear2 = nn.Linear(dim_feedforward, d_model) @@ -650,8 +602,26 @@ class TransformerDecoderLayer(nn.Module): return x -class SinusoidalPositionEmbedding2D(nn.Module): - """Sinusoidal positional embeddings similar to what's presented in Attention Is All You Need. +def _create_sinusoidal_position_embedding(num_positions: int, dimension: int) -> Tensor: + """1D sinusoidal positional embeddings as in Attention is All You Need. + + Args: + num_positions: Number of token positions required. + Returns: (num_positions, dimension) position embeddings (the first dimension is the batch dimension). 
+ + """ + + def get_position_angle_vec(position): + return [position / np.power(10000, 2 * (hid_j // 2) / dimension) for hid_j in range(dimension)] + + sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(num_positions)]) + sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i + sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 + return torch.from_numpy(sinusoid_table).float() + + +class _SinusoidalPositionEmbedding2D(nn.Module): + """2D sinusoidal positional embeddings similar to what's presented in Attention Is All You Need. The variation is that the position indices are normalized in [0, 2π] (not quite: the lower bound is 1/H for the vertical direction, and 1/W for the horizontal direction. @@ -705,7 +675,7 @@ class SinusoidalPositionEmbedding2D(nn.Module): def _get_activation_fn(activation: str) -> Callable: - """Return an activation function given a string""" + """Return an activation function given a string.""" if activation == "relu": return F.relu if activation == "gelu": diff --git a/lerobot/configs/policy/act.yaml b/lerobot/configs/policy/act.yaml index 22b6cd6f..3551768c 100644 --- a/lerobot/configs/policy/act.yaml +++ b/lerobot/configs/policy/act.yaml @@ -21,24 +21,27 @@ policy: lr: 1e-5 lr_backbone: 1e-5 + pretrained_backbone: true weight_decay: 1e-4 grad_clip_norm: 10 backbone: resnet18 horizon: ${horizon} # chunk_size kl_weight: 10 - hidden_dim: 512 + d_model: 512 dim_feedforward: 3200 + vae_enc_layers: 4 enc_layers: 4 dec_layers: 1 - nheads: 8 + num_heads: 8 #camera_names: [top, front_close, left_pillar, right_pillar] camera_names: [top] dilation: false dropout: 0.1 pre_norm: false activation: relu + latent_dim: 32 - vae: true + use_vae: true batch_size: 8 diff --git a/scripts/convert_act_weights.py b/scripts/convert_act_weights.py index c8f83422..b1492009 100644 --- a/scripts/convert_act_weights.py +++ b/scripts/convert_act_weights.py @@ -42,6 +42,8 @@ start_replacements = [ ("model.transformer.encoder.", "model.encoder."), ("model.transformer.decoder.", "model.decoder."), ("model.backbones.0.0.body.", "model.backbone."), + ("model.additional_pos_embed.weight", "model.encoder_robot_and_latent_pos_embed.weight"), + ("model.cls_embed.weight", "model.vae_encoder_cls_embed.weight"), ] for to_replace, replace_with in start_replacements: From 9c28ac8aa424d5e3b51004883e88cd35954329f4 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Fri, 5 Apr 2024 15:25:11 +0100 Subject: [PATCH 07/25] re-add pre-commit check --- .pre-commit-config.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index da78b677..765b678a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -23,3 +23,11 @@ repos: - id: ruff args: [--fix] - id: ruff-format + - repo: https://github.com/python-poetry/poetry + rev: 1.8.0 + hooks: + - id: poetry-check + - id: poetry-lock + args: + - "--check" + - "--no-update" From 1e71196fe3d45ab973d0f612f6b9aa3800af40fb Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Fri, 5 Apr 2024 17:38:29 +0100 Subject: [PATCH 08/25] backup wip --- lerobot/common/datasets/aloha.py | 8 +- lerobot/common/datasets/factory.py | 179 +++++----- lerobot/common/policies/act/policy.py | 362 +++++++++----------- lerobot/common/policies/diffusion/policy.py | 1 - lerobot/scripts/train.py | 4 +- poetry.lock | 49 ++- pyproject.toml | 1 + 7 files changed, 306 insertions(+), 298 deletions(-) diff --git a/lerobot/common/datasets/aloha.py b/lerobot/common/datasets/aloha.py 
index 102de08e..4c0795dd 100644 --- a/lerobot/common/datasets/aloha.py +++ b/lerobot/common/datasets/aloha.py @@ -158,7 +158,7 @@ class AlohaDataset(torch.utils.data.Dataset): self.data_ids_per_episode = {} ep_dicts = [] - logging.info("Initialize and feed offline buffer") + frame_idx = 0 for ep_id in tqdm.tqdm(range(NUM_EPISODES[self.dataset_id])): ep_path = raw_dir / f"episode_{ep_id}.hdf5" with h5py.File(ep_path, "r") as ep: @@ -190,8 +190,14 @@ class AlohaDataset(torch.utils.data.Dataset): ep_dict[f"observation.images.{cam}"] = image[:-1] # ep_dict[f"next.observation.images.{cam}"] = image[1:] + assert isinstance(ep_id, int) + self.data_ids_per_episode[ep_id] = torch.arange(frame_idx, frame_idx + num_frames, 1) + assert len(self.data_ids_per_episode[ep_id]) == num_frames + ep_dicts.append(ep_dict) + frame_idx += num_frames + self.data_dict = {} keys = ep_dicts[0].keys() diff --git a/lerobot/common/datasets/factory.py b/lerobot/common/datasets/factory.py index 49170098..0217583a 100644 --- a/lerobot/common/datasets/factory.py +++ b/lerobot/common/datasets/factory.py @@ -59,96 +59,95 @@ def make_dataset( transform=Prod(in_keys=clsfunc.image_keys, prod=1 / 255.0), ) stats = compute_or_load_stats(stats_dataset) - # TODO(rcadene): remove this and put it in config. Ideally we want to reproduce SOTA results just with mean_std normalization_mode = "mean_std" if cfg.env.name == "aloha" else "min_max" - # TODO(now): These stats are needed to use their pretrained model for sim_transfer_cube_human. - # (Pdb) stats['observation']['state']['mean'] - # tensor([-0.0071, -0.6293, 1.0351, -0.0517, -0.4642, -0.0754, 0.4751, -0.0373, - # -0.3324, 0.9034, -0.2258, -0.3127, -0.2412, 0.6866]) - stats["observation", "state", "mean"] = torch.tensor( - [ - -0.00740268, - -0.63187766, - 1.0356655, - -0.05027218, - -0.46199223, - -0.07467502, - 0.47467607, - -0.03615446, - -0.33203387, - 0.9038929, - -0.22060776, - -0.31011587, - -0.23484458, - 0.6842416, - ] - ) - # (Pdb) stats['observation']['state']['std'] - # tensor([0.0022, 0.0520, 0.0291, 0.0092, 0.0267, 0.0145, 0.0563, 0.0179, 0.0494, - # 0.0326, 0.0476, 0.0535, 0.0956, 0.0513]) - stats["observation", "state", "std"] = torch.tensor( - [ - 0.01219023, - 0.2975381, - 0.16728032, - 0.04733803, - 0.1486037, - 0.08788499, - 0.31752336, - 0.1049916, - 0.27933604, - 0.18094037, - 0.26604933, - 0.30466506, - 0.5298686, - 0.25505227, - ] - ) - # (Pdb) stats['action']['mean'] - # tensor([-0.0075, -0.6346, 1.0353, -0.0465, -0.4686, -0.0738, 0.3723, -0.0396, - # -0.3184, 0.8991, -0.2065, -0.3182, -0.2338, 0.5593]) - stats["action"]["mean"] = torch.tensor( - [ - -0.00756444, - -0.6281845, - 1.0312834, - -0.04664314, - -0.47211358, - -0.074527, - 0.37389806, - -0.03718753, - -0.3261143, - 0.8997205, - -0.21371077, - -0.31840396, - -0.23360962, - 0.551947, - ] - ) - # (Pdb) stats['action']['std'] - # tensor([0.0023, 0.0514, 0.0290, 0.0086, 0.0263, 0.0143, 0.0593, 0.0185, 0.0510, - # 0.0328, 0.0478, 0.0531, 0.0945, 0.0794]) - stats["action"]["std"] = torch.tensor( - [ - 0.01252818, - 0.2957442, - 0.16701928, - 0.04584508, - 0.14833844, - 0.08763024, - 0.30665937, - 0.10600077, - 0.27572668, - 0.1805853, - 0.26304692, - 0.30708534, - 0.5305411, - 0.38381037, - ] - ) - transforms.append(NormalizeTransform(stats, in_keys, mode=normalization_mode)) # noqa: F821 + # # TODO(now): These stats are needed to use their pretrained model for sim_transfer_cube_human. 
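A toy illustration of the data_ids_per_episode bookkeeping added to AlohaDataset above: consecutive episodes get contiguous global frame indices. The episode lengths here are made up.

import torch

data_ids_per_episode = {}
frame_idx = 0
for ep_id, num_frames in enumerate([3, 2]):
    data_ids_per_episode[ep_id] = torch.arange(frame_idx, frame_idx + num_frames)
    frame_idx += num_frames
# -> {0: tensor([0, 1, 2]), 1: tensor([3, 4])}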
+ # # (Pdb) stats['observation']['state']['mean'] + # # tensor([-0.0071, -0.6293, 1.0351, -0.0517, -0.4642, -0.0754, 0.4751, -0.0373, + # # -0.3324, 0.9034, -0.2258, -0.3127, -0.2412, 0.6866]) + # stats["observation", "state", "mean"] = torch.tensor( + # [ + # -0.00740268, + # -0.63187766, + # 1.0356655, + # -0.05027218, + # -0.46199223, + # -0.07467502, + # 0.47467607, + # -0.03615446, + # -0.33203387, + # 0.9038929, + # -0.22060776, + # -0.31011587, + # -0.23484458, + # 0.6842416, + # ] + # ) + # # (Pdb) stats['observation']['state']['std'] + # # tensor([0.0022, 0.0520, 0.0291, 0.0092, 0.0267, 0.0145, 0.0563, 0.0179, 0.0494, + # # 0.0326, 0.0476, 0.0535, 0.0956, 0.0513]) + # stats["observation", "state", "std"] = torch.tensor( + # [ + # 0.01219023, + # 0.2975381, + # 0.16728032, + # 0.04733803, + # 0.1486037, + # 0.08788499, + # 0.31752336, + # 0.1049916, + # 0.27933604, + # 0.18094037, + # 0.26604933, + # 0.30466506, + # 0.5298686, + # 0.25505227, + # ] + # ) + # # (Pdb) stats['action']['mean'] + # # tensor([-0.0075, -0.6346, 1.0353, -0.0465, -0.4686, -0.0738, 0.3723, -0.0396, + # # -0.3184, 0.8991, -0.2065, -0.3182, -0.2338, 0.5593]) + # stats["action"]["mean"] = torch.tensor( + # [ + # -0.00756444, + # -0.6281845, + # 1.0312834, + # -0.04664314, + # -0.47211358, + # -0.074527, + # 0.37389806, + # -0.03718753, + # -0.3261143, + # 0.8997205, + # -0.21371077, + # -0.31840396, + # -0.23360962, + # 0.551947, + # ] + # ) + # # (Pdb) stats['action']['std'] + # # tensor([0.0023, 0.0514, 0.0290, 0.0086, 0.0263, 0.0143, 0.0593, 0.0185, 0.0510, + # # 0.0328, 0.0478, 0.0531, 0.0945, 0.0794]) + # stats["action"]["std"] = torch.tensor( + # [ + # 0.01252818, + # 0.2957442, + # 0.16701928, + # 0.04584508, + # 0.14833844, + # 0.08763024, + # 0.30665937, + # 0.10600077, + # 0.27572668, + # 0.1805853, + # 0.26304692, + # 0.30708534, + # 0.5305411, + # 0.38381037, + # ] + # ) + # transforms.append(NormalizeTransform(stats, in_keys, mode=normalization_mode)) # noqa: F821 transforms = v2.Compose( [ @@ -173,7 +172,11 @@ def make_dataset( "action": [-0.1] + [i / clsfunc.fps for i in range(15)], } else: - delta_timestamps = None + delta_timestamps = { + "observation.images.top": [0], + "observation.state": [0], + "action": [i / clsfunc.fps for i in range(cfg.policy.horizon)], + } dataset = clsfunc( dataset_id=cfg.dataset_id, diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index 5071c09a..1aacc41d 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -19,11 +19,10 @@ from torch import Tensor, nn from torchvision.models._utils import IntermediateLayerGetter from torchvision.ops.misc import FrozenBatchNorm2d -from lerobot.common.policies.abstract import AbstractPolicy from lerobot.common.utils import get_safe_torch_device -class ActionChunkingTransformerPolicy(AbstractPolicy): +class ActionChunkingTransformerPolicy(nn.Module): """ Action Chunking Transformer Policy as per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware (paper: https://arxiv.org/abs/2304.13705, code: https://github.com/tonyzhaozh/act) @@ -61,205 +60,20 @@ class ActionChunkingTransformerPolicy(AbstractPolicy): """ name = "act" + _multiple_obs_steps_not_handled_msg = ( + "ActionChunkingTransformerPolicy does not handle multiple observation steps." + ) def __init__(self, cfg, device, n_action_steps=1): """ TODO(alexander-soare): Add documentation for all parameters. 
""" - super().__init__(n_action_steps) + super().__init__() + if getattr(cfg, "n_obs_steps", 1) != 1: + raise ValueError(self._multiple_obs_steps_not_handled_msg) self.cfg = cfg self.n_action_steps = n_action_steps self.device = get_safe_torch_device(device) - - self.model = _ActionChunkingTransformer(cfg) - self._create_optimizer() - self.to(self.device) - - def _create_optimizer(self): - optimizer_params_dicts = [ - { - "params": [ - p - for n, p in self.model.named_parameters() - if not n.startswith("backbone") and p.requires_grad - ] - }, - { - "params": [ - p - for n, p in self.model.named_parameters() - if n.startswith("backbone") and p.requires_grad - ], - "lr": self.cfg.lr_backbone, - }, - ] - self.optimizer = torch.optim.AdamW( - optimizer_params_dicts, lr=self.cfg.lr, weight_decay=self.cfg.weight_decay - ) - - def update(self, replay_buffer, step): - del step - - self.train() - - num_slices = self.cfg.batch_size - batch_size = self.cfg.horizon * num_slices - - assert batch_size % self.cfg.horizon == 0 - assert batch_size % num_slices == 0 - - def process_batch(batch, horizon, num_slices): - # trajectory t = 64, horizon h = 16 - # (t h) ... -> t h ... - batch = batch.reshape(num_slices, horizon) - - image = batch["observation", "image", "top"] - image = image[:, 0] # first observation t=0 - # batch, num_cam, channel, height, width - image = image.unsqueeze(1) - assert image.ndim == 5 - image = image.float() - - state = batch["observation", "state"] - state = state[:, 0] # first observation t=0 - # batch, qpos_dim - assert state.ndim == 2 - - action = batch["action"] - # batch, seq, action_dim - assert action.ndim == 3 - assert action.shape[1] == horizon - - if self.cfg.n_obs_steps > 1: - raise NotImplementedError() - # # keep first n observations of the slice corresponding to t=[-1,0] - # image = image[:, : self.cfg.n_obs_steps] - # state = state[:, : self.cfg.n_obs_steps] - - out = { - "obs": { - "image": image.to(self.device, non_blocking=True), - "agent_pos": state.to(self.device, non_blocking=True), - }, - "action": action.to(self.device, non_blocking=True), - } - return out - - start_time = time.time() - - batch = replay_buffer.sample(batch_size) - batch = process_batch(batch, self.cfg.horizon, num_slices) - - data_s = time.time() - start_time - - loss = self.compute_loss(batch) - loss.backward() - - grad_norm = torch.nn.utils.clip_grad_norm_( - self.model.parameters(), - self.cfg.grad_clip_norm, - error_if_nonfinite=False, - ) - - self.optimizer.step() - self.optimizer.zero_grad() - - info = { - "loss": loss.item(), - "grad_norm": float(grad_norm), - "lr": self.cfg.lr, - "data_s": data_s, - "update_s": time.time() - start_time, - } - - return info - - def save(self, fp): - torch.save(self.state_dict(), fp) - - def load(self, fp): - d = torch.load(fp) - self.load_state_dict(d) - - def compute_loss(self, batch): - loss_dict = self._forward( - qpos=batch["obs"]["agent_pos"], - image=batch["obs"]["image"], - actions=batch["action"], - ) - loss = loss_dict["loss"] - return loss - - @torch.no_grad() - def select_actions(self, observation, step_count): - # TODO(rcadene): remove unused step_count - del step_count - - self.eval() - - # TODO(rcadene): remove hack - # add 1 camera dimension - observation["image", "top"] = observation["image", "top"].unsqueeze(1) - - obs_dict = { - "image": observation["image", "top"], - "agent_pos": observation["state"], - } - action = self._forward(qpos=obs_dict["agent_pos"] * 0.182, image=obs_dict["image"]) - - if self.cfg.temporal_agg: - # 
TODO(rcadene): implement temporal aggregation - raise NotImplementedError() - # all_time_actions[[t], t:t+num_queries] = action - # actions_for_curr_step = all_time_actions[:, t] - # actions_populated = torch.all(actions_for_curr_step != 0, axis=1) - # actions_for_curr_step = actions_for_curr_step[actions_populated] - # k = 0.01 - # exp_weights = np.exp(-k * np.arange(len(actions_for_curr_step))) - # exp_weights = exp_weights / exp_weights.sum() - # exp_weights = torch.from_numpy(exp_weights).cuda().unsqueeze(dim=1) - # raw_action = (actions_for_curr_step * exp_weights).sum(dim=0, keepdim=True) - - # take first predicted action or n first actions - action = action[: self.n_action_steps] - return action - - def _forward(self, qpos, image, actions=None): - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - image = normalize(image) - - is_training = actions is not None - if is_training: # training time - actions = actions[:, : self.model.horizon] - - a_hat, (mu, log_sigma_x2) = self.model(qpos, image, actions) - - all_l1 = F.l1_loss(actions, a_hat, reduction="none") - l1 = all_l1.mean() - - loss_dict = {} - loss_dict["l1"] = l1 - if self.cfg.use_vae: - # Calculate Dₖₗ(latent_pdf || standard_normal). Note: After computing the KL-divergence for - # each dimension independently, we sum over the latent dimension to get the total - # KL-divergence per batch element, then take the mean over the batch. - # (See App. B of https://arxiv.org/abs/1312.6114 for more details). - mean_kld = (-0.5 * (1 + log_sigma_x2 - mu.pow(2) - (log_sigma_x2).exp())).sum(-1).mean() - loss_dict["kl"] = mean_kld - loss_dict["loss"] = loss_dict["l1"] + loss_dict["kl"] * self.cfg.kl_weight - else: - loss_dict["loss"] = loss_dict["l1"] - return loss_dict - else: - action, _ = self.model(qpos, image) # no action, sample from prior - return action - - -# TODO(alexander-soare) move all this code into the policy when we have the policy API established. 
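For reference, a self-contained sketch of what the commented-out temporal aggregation above appears to compute: an exponentially weighted average over the stack of chunk predictions available for the current timestep. The standalone function and its name are hypothetical.

import torch

def temporal_ensemble(actions_for_curr_step: torch.Tensor, k: float = 0.01) -> torch.Tensor:
    """actions_for_curr_step: (num_predictions, action_dim) stack of predictions for one timestep."""
    exp_weights = torch.exp(-k * torch.arange(len(actions_for_curr_step), dtype=torch.float32))
    exp_weights = exp_weights / exp_weights.sum()
    return (actions_for_curr_step * exp_weights.unsqueeze(-1)).sum(dim=0)

As in the pseudocode's w_i proportional to exp(-k * i), index 0 of the stack receives the largest weight.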
-class _ActionChunkingTransformer(nn.Module): - def __init__(self, cfg): - super().__init__() - self.camera_names = cfg.camera_names self.use_vae = cfg.use_vae self.horizon = cfg.horizon @@ -326,26 +140,179 @@ class _ActionChunkingTransformer(nn.Module): self._reset_parameters() + self._create_optimizer() + self.to(self.device) + + def _create_optimizer(self): + optimizer_params_dicts = [ + { + "params": [ + p for n, p in self.named_parameters() if not n.startswith("backbone") and p.requires_grad + ] + }, + { + "params": [ + p for n, p in self.named_parameters() if n.startswith("backbone") and p.requires_grad + ], + "lr": self.cfg.lr_backbone, + }, + ] + self.optimizer = torch.optim.AdamW( + optimizer_params_dicts, lr=self.cfg.lr, weight_decay=self.cfg.weight_decay + ) + def _reset_parameters(self): """Xavier-uniform initialization of the transformer parameters as in the original code.""" for p in chain(self.encoder.parameters(), self.decoder.parameters()): if p.dim() > 1: nn.init.xavier_uniform_(p) + @torch.no_grad() + def select_actions(self, observation, step_count): + # TODO(rcadene): remove unused step_count + del step_count + + self.eval() + + # TODO(rcadene): remove hack + # add 1 camera dimension + observation["image", "top"] = observation["image", "top"].unsqueeze(1) + + obs_dict = { + "image": observation["image", "top"], + "agent_pos": observation["state"], + } + action = self._forward(qpos=obs_dict["agent_pos"] * 0.182, image=obs_dict["image"]) + + if self.cfg.temporal_agg: + # TODO(rcadene): implement temporal aggregation + raise NotImplementedError() + # all_time_actions[[t], t:t+num_queries] = action + # actions_for_curr_step = all_time_actions[:, t] + # actions_populated = torch.all(actions_for_curr_step != 0, axis=1) + # actions_for_curr_step = actions_for_curr_step[actions_populated] + # k = 0.01 + # exp_weights = np.exp(-k * np.arange(len(actions_for_curr_step))) + # exp_weights = exp_weights / exp_weights.sum() + # exp_weights = torch.from_numpy(exp_weights).cuda().unsqueeze(dim=1) + # raw_action = (actions_for_curr_step * exp_weights).sum(dim=0, keepdim=True) + + # take first predicted action or n first actions + action = action[: self.n_action_steps] + return action + + def __call__(self, *args, **kwargs): + # TODO(now): Temporary bridge. + return self.update(*args, **kwargs) + + def _preprocess_batch(self, batch: dict[str, Tensor]) -> dict[str, Tensor]: + """ + Expects batch to have (at least): + { + "observation.state": (B, 1, J) tensor of robot states (joint configuration) + + "observation.images.top": (B, 1, C, H, W) tensor of images. + "action": (B, H, J) tensor of actions (positional target for robot joint configuration) + "action_is_pad": (B, H) mask for whether the actions are padding outside of the episode bounds. + } + """ + if batch["observation.state"].shape[1] != 1: + raise ValueError(self._multiple_obs_steps_not_handled_msg) + batch["observation.state"] = batch["observation.state"].squeeze(1) + # TODO(alexander-soare): generalize this to multiple images. Note: no squeeze is required for + # "observation.images.top" because then we'd have to unsqueeze to get get the image index dimension. 
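To make the batch contract documented in _preprocess_batch above concrete, here is a dummy batch matching those keys and shapes. The sizes (batch of 2, horizon 100, 14 joints, 480x640 images) are illustrative only.

import torch

batch = {
    "observation.state": torch.zeros(2, 1, 14),                 # (B, 1, J)
    "observation.images.top": torch.zeros(2, 1, 3, 480, 640),   # (B, 1, C, H, W)
    "action": torch.zeros(2, 100, 14),                          # (B, H, J)
    "action_is_pad": torch.zeros(2, 100, dtype=torch.bool),     # (B, H)
}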
+ + def update(self, batch, *_): + start_time = time.time() + self._preprocess_batch(batch) + + self.train() + + num_slices = self.cfg.batch_size + batch_size = self.cfg.horizon * num_slices + + assert batch_size % self.cfg.horizon == 0 + assert batch_size % num_slices == 0 + + loss = self.compute_loss(batch) + loss.backward() + + grad_norm = torch.nn.utils.clip_grad_norm_( + self.parameters(), + self.cfg.grad_clip_norm, + error_if_nonfinite=False, + ) + + self.optimizer.step() + self.optimizer.zero_grad() + + info = { + "loss": loss.item(), + "grad_norm": float(grad_norm), + "lr": self.cfg.lr, + "update_s": time.time() - start_time, + } + + return info + + def compute_loss(self, batch): + loss_dict = self.forward( + robot_state=batch["observation.state"], + image=batch["observation.images.top"], + actions=batch["action"], + ) + loss = loss_dict["loss"] + return loss + def forward(self, robot_state: Tensor, image: Tensor, actions: Tensor | None = None): + # TODO(now): Maybe this shouldn't be here? + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + image = normalize(image) + + is_training = actions is not None + if is_training: # training time + actions = actions[:, : self.horizon] + + a_hat, (mu, log_sigma_x2) = self._forward(robot_state, image, actions) + + all_l1 = F.l1_loss(actions, a_hat, reduction="none") + l1 = all_l1.mean() + + loss_dict = {} + loss_dict["l1"] = l1 + if self.cfg.use_vae: + # Calculate Dₖₗ(latent_pdf || standard_normal). Note: After computing the KL-divergence for + # each dimension independently, we sum over the latent dimension to get the total + # KL-divergence per batch element, then take the mean over the batch. + # (See App. B of https://arxiv.org/abs/1312.6114 for more details). + mean_kld = (-0.5 * (1 + log_sigma_x2 - mu.pow(2) - (log_sigma_x2).exp())).sum(-1).mean() + loss_dict["kl"] = mean_kld + loss_dict["loss"] = loss_dict["l1"] + loss_dict["kl"] * self.cfg.kl_weight + else: + loss_dict["loss"] = loss_dict["l1"] + return loss_dict + else: + action, _ = self._forward(robot_state, image) # no action, sample from prior + return action + + def _forward(self, robot_state: Tensor, image: Tensor, actions: Tensor | None = None): """ Args: robot_state: (B, J) batch of robot joint configurations. image: (B, N, C, H, W) batch of N camera frames. actions: (B, S, A) batch of actions from the target dataset which must be provided if the VAE is enabled and the model is in training mode. + Returns: + (B, S, A) batch of action sequences + Tuple containing the latent PDF's parameters (mean, log(σ²)) both as (B, L) tensors where L is the + latent dimension. """ if self.use_vae and self.training: assert ( actions is not None ), "actions must be provided when using the variational objective in training mode." - batch_size, _ = robot_state.shape + batch_size = robot_state.shape[0] # Prepare the latent for input to the transformer encoder. 
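As a reminder of what "sample the latent with the reparameterization trick" means in the VAE branch that follows, here is a generic sketch of the standard trick. It is illustrative only and not necessarily the exact line used in this file; the latent dimension of 32 mirrors act.yaml.

import torch

mu = torch.zeros(8, 32)            # (B, latent_dim), from the VAE encoder output head
log_sigma_x2 = torch.zeros(8, 32)  # log(sigma^2), same head
eps = torch.randn_like(mu)
# z = mu + sigma * eps keeps the sample differentiable w.r.t. mu and log_sigma_x2.
latent_sample = mu + (0.5 * log_sigma_x2).exp() * eps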
if self.use_vae and actions is not None: @@ -428,6 +395,13 @@ class _ActionChunkingTransformer(nn.Module): return actions, [mu, log_sigma_x2] + def save(self, fp): + torch.save(self.state_dict(), fp) + + def load(self, fp): + d = torch.load(fp) + self.load_state_dict(d) + class _TransformerEncoder(nn.Module): """Convenience module for running multiple encoder layers, maybe followed by normalization.""" diff --git a/lerobot/common/policies/diffusion/policy.py b/lerobot/common/policies/diffusion/policy.py index a4f4a450..93e5ba5d 100644 --- a/lerobot/common/policies/diffusion/policy.py +++ b/lerobot/common/policies/diffusion/policy.py @@ -152,7 +152,6 @@ class DiffusionPolicy(nn.Module): self.diffusion.train() data_s = time.time() - start_time - loss = self.diffusion.compute_loss(batch) loss.backward() diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py index 631ecc93..d49dfff8 100644 --- a/lerobot/scripts/train.py +++ b/lerobot/scripts/train.py @@ -41,7 +41,6 @@ def log_train_info(logger, info, step, cfg, dataset, is_offline): loss = info["loss"] grad_norm = info["grad_norm"] lr = info["lr"] - data_s = info["data_s"] update_s = info["update_s"] # A sample is an (observation,action) pair, where observation and action @@ -62,7 +61,6 @@ def log_train_info(logger, info, step, cfg, dataset, is_offline): f"grdn:{grad_norm:.3f}", f"lr:{lr:0.1e}", # in seconds - f"data_s:{data_s:.3f}", f"updt_s:{update_s:.3f}", ] logging.info(" ".join(log_items)) @@ -200,7 +198,7 @@ def train(cfg: dict, out_dir=None, job_name=None): is_offline = True dataloader = torch.utils.data.DataLoader( dataset, - num_workers=4, + num_workers=0, batch_size=cfg.policy.batch_size, shuffle=True, pin_memory=cfg.device != "cpu", diff --git a/poetry.lock b/poetry.lock index 0cbf9318..b8c6c638 100644 --- a/poetry.lock +++ b/poetry.lock @@ -880,6 +880,29 @@ files = [ [package.extras] protobuf = ["grpcio-tools (>=1.62.1)"] +[[package]] +name = "gym-pusht" +version = "0.1.0" +description = "PushT environment for LeRobot" +optional = true +python-versions = "^3.10" +files = [] +develop = false + +[package.dependencies] +gymnasium = "^0.29.1" +opencv-python = "^4.9.0.80" +pygame = "^2.5.2" +pymunk = "^6.6.0" +scikit-image = "^0.22.0" +shapely = "^2.0.3" + +[package.source] +type = "git" +url = "git@github.com:huggingface/gym-pusht.git" +reference = "HEAD" +resolved_reference = "0fe4449cca5a2b08f529f7a07fbf5b9df24962ec" + [[package]] name = "gymnasium" version = "0.29.1" @@ -1261,17 +1284,21 @@ setuptools = "!=50.0.0" [[package]] name = "lazy-loader" -version = "0.3" -description = "lazy_loader" +version = "0.4" +description = "Makes it easy to load subpackages and functions on demand." 
optional = false python-versions = ">=3.7" files = [ - {file = "lazy_loader-0.3-py3-none-any.whl", hash = "sha256:1e9e76ee8631e264c62ce10006718e80b2cfc74340d17d1031e0f84af7478554"}, - {file = "lazy_loader-0.3.tar.gz", hash = "sha256:3b68898e34f5b2a29daaaac172c6555512d0f32074f147e2254e4a6d9d838f37"}, + {file = "lazy_loader-0.4-py3-none-any.whl", hash = "sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc"}, + {file = "lazy_loader-0.4.tar.gz", hash = "sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1"}, ] +[package.dependencies] +packaging = "*" + [package.extras] -lint = ["pre-commit (>=3.3)"] +dev = ["changelist (==0.5)"] +lint = ["pre-commit (==3.7.0)"] test = ["pytest (>=7.4)", "pytest-cov (>=4.1)"] [[package]] @@ -3274,7 +3301,7 @@ protobuf = ">=3.20" [[package]] name = "tensordict" -version = "0.4.0+b4c91e8" +version = "0.4.0+f622b2f" description = "" optional = false python-versions = "*" @@ -3518,13 +3545,13 @@ tutorials = ["matplotlib", "pandas", "tabulate", "torch"] [[package]] name = "typing-extensions" -version = "4.10.0" +version = "4.11.0" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" files = [ - {file = "typing_extensions-4.10.0-py3-none-any.whl", hash = "sha256:69b1a937c3a517342112fb4c6df7e72fc39a38e7891a5730ed4985b5214b5475"}, - {file = "typing_extensions-4.10.0.tar.gz", hash = "sha256:b0abd7c89e8fb96f98db18d86106ff1d90ab692004eb746cf6eda2682f91b3cb"}, + {file = "typing_extensions-4.11.0-py3-none-any.whl", hash = "sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a"}, + {file = "typing_extensions-4.11.0.tar.gz", hash = "sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0"}, ] [[package]] @@ -3667,9 +3694,9 @@ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.link testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] [extras] -pusht = [] +pusht = ["gym_pusht"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "04b17fa57f189ad63181611d2e724d7fbdfb3485bc1a587b259d0a3751db918d" +content-hash = "3eee17e4bf2b7a570f41ef9c400ec5a24a3113f62a13162229cf43504ca0d005" diff --git a/pyproject.toml b/pyproject.toml index f0869158..a7d2dd65 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,7 @@ robomimic = "0.2.0" gymnasium-robotics = "^1.2.4" gymnasium = "^0.29.1" cmake = "^3.29.0.1" +gym_pusht = { git = "git@github.com:huggingface/gym-pusht.git", optional = true} [tool.poetry.extras] pusht = ["gym_pusht"] From 8d2463f45b4cd22f5ce6e38b7beade9231e52f37 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Fri, 5 Apr 2024 18:46:30 +0100 Subject: [PATCH 09/25] backup wip --- lerobot/common/policies/abstract.py | 76 ------------------ lerobot/common/policies/act/policy.py | 111 ++++++++++++++++++++------ lerobot/scripts/eval.py | 2 +- lerobot/scripts/train.py | 3 +- scripts/convert_act_weights.py | 33 ++++---- 5 files changed, 105 insertions(+), 120 deletions(-) diff --git a/lerobot/common/policies/abstract.py b/lerobot/common/policies/abstract.py index 6dc72bef..beebd8ac 100644 --- a/lerobot/common/policies/abstract.py +++ b/lerobot/common/policies/abstract.py @@ -4,79 +4,3 @@ import torch from torch import Tensor, nn -class AbstractPolicy(nn.Module): - """Base policy which all policies should be derived from. 
- - The forward method should generally not be overriden as it plays the role of handling multi-step policies. See its - documentation for more information. - - Note: - When implementing a concrete class (e.g. `AlohaDataset`, `PushtEnv`, `DiffusionPolicy`), you need to: - 1. set the required class attributes: - - for classes inheriting from `AbstractDataset`: `available_datasets` - - for classes inheriting from `AbstractEnv`: `name`, `available_tasks` - - for classes inheriting from `AbstractPolicy`: `name` - 2. update variables in `lerobot/__init__.py` (e.g. `available_envs`, `available_datasets_per_envs`, `available_policies`) - 3. update variables in `tests/test_available.py` by importing your new class - """ - - name: str | None = None # same name should be used to instantiate the policy in factory.py - - def __init__(self, n_action_steps: int | None): - """ - n_action_steps: Sets the cache size for storing action trajectories. If None, it is assumed that a single - action is returned by `select_actions` and that doesn't have a horizon dimension. The `forward` method then - adds that dimension. - """ - super().__init__() - assert self.name is not None, "Subclasses of `AbstractPolicy` should set the `name` class attribute." - self.n_action_steps = n_action_steps - self.clear_action_queue() - - def update(self, replay_buffer, step): - """One step of the policy's learning algorithm.""" - raise NotImplementedError("Abstract method") - - def save(self, fp): - torch.save(self.state_dict(), fp) - - def load(self, fp): - d = torch.load(fp) - self.load_state_dict(d) - - def select_actions(self, observation) -> Tensor: - """Select an action (or trajectory of actions) based on an observation during rollout. - - If n_action_steps was provided at initialization, this should return a (batch_size, n_action_steps, *) tensor of - actions. Otherwise if n_actions_steps is None, this should return a (batch_size, *) tensor of actions. - """ - raise NotImplementedError("Abstract method") - - def clear_action_queue(self): - """This should be called whenever the environment is reset.""" - if self.n_action_steps is not None: - self._action_queue = deque([], maxlen=self.n_action_steps) - - def forward(self, *args, **kwargs) -> Tensor: - """Inference step that makes multi-step policies compatible with their single-step environments. - - WARNING: In general, this should not be overriden. - - Consider a "policy" that observes the environment then charts a course of N actions to take. To make this fit - into the formalism of a TorchRL environment, we view it as being effectively a policy that (1) makes an - observation and prepares a queue of actions, (2) consumes that queue when queried, regardless of the environment - observation, (3) repopulates the action queue when empty. This method handles the aforementioned logic so that - the subclass doesn't have to. - - This method effectively wraps the `select_actions` method of the subclass. The following assumptions are made: - 1. The `select_actions` method returns a Tensor of actions with shape (B, H, *) where B is the batch size, H is - the action trajectory horizon and * is the action dimensions. - 2. Prior to the `select_actions` method being called, theres is an `n_action_steps` instance attribute defined. 
- """ - if self.n_action_steps is None: - return self.select_actions(*args, **kwargs) - if len(self._action_queue) == 0: - # `select_actions` returns a (batch_size, n_action_steps, *) tensor, but the queue effectively has shape - # (n_action_steps, batch_size, *), hence the transpose. - self._action_queue.extend(self.select_actions(*args, **kwargs).transpose(0, 1)) - return self._action_queue.popleft() diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index f42c6a3c..a9a5ac06 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -3,7 +3,7 @@ As per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware (https://arxiv.org/abs/2304.13705). The majority of changes here involve removing unused code, unifying naming, and adding helpful comments. """ - +from collections import deque import math import time from itertools import chain @@ -22,6 +22,67 @@ from torchvision.ops.misc import FrozenBatchNorm2d from lerobot.common.utils import get_safe_torch_device +# class AbstractPolicy(nn.Module): +# """Base policy which all policies should be derived from. + +# The forward method should generally not be overriden as it plays the role of handling multi-step policies. See its +# documentation for more information. + +# Note: +# When implementing a concrete class (e.g. `AlohaDataset`, `PushtEnv`, `DiffusionPolicy`), you need to: +# 1. set the required class attributes: +# - for classes inheriting from `AbstractDataset`: `available_datasets` +# - for classes inheriting from `AbstractEnv`: `name`, `available_tasks` +# - for classes inheriting from `AbstractPolicy`: `name` +# 2. update variables in `lerobot/__init__.py` (e.g. `available_envs`, `available_datasets_per_envs`, `available_policies`) +# 3. update variables in `tests/test_available.py` by importing your new class +# """ + +# name: str | None = None # same name should be used to instantiate the policy in factory.py + +# def __init__(self, n_action_steps: int | None): +# """ +# n_action_steps: Sets the cache size for storing action trajectories. If None, it is assumed that a single +# action is returned by `select_actions` and that doesn't have a horizon dimension. The `forward` method then +# adds that dimension. +# """ +# super().__init__() +# assert self.name is not None, "Subclasses of `AbstractPolicy` should set the `name` class attribute." +# self.n_action_steps = n_action_steps +# self.clear_action_queue() + +# def clear_action_queue(self): +# """This should be called whenever the environment is reset.""" +# if self.n_action_steps is not None: +# self._action_queue = deque([], maxlen=self.n_action_steps) + +# def forward(self, fn) -> Tensor: +# """Inference step that makes multi-step policies compatible with their single-step environments. + +# WARNING: In general, this should not be overriden. + +# Consider a "policy" that observes the environment then charts a course of N actions to take. To make this fit +# into the formalism of a TorchRL environment, we view it as being effectively a policy that (1) makes an +# observation and prepares a queue of actions, (2) consumes that queue when queried, regardless of the environment +# observation, (3) repopulates the action queue when empty. This method handles the aforementioned logic so that +# the subclass doesn't have to. + +# This method effectively wraps the `select_actions` method of the subclass. The following assumptions are made: +# 1. 
The `select_actions` method returns a Tensor of actions with shape (B, H, *) where B is the batch size, H is +# the action trajectory horizon and * is the action dimensions. +# 2. Prior to the `select_actions` method being called, theres is an `n_action_steps` instance attribute defined. +# """ +# if self.n_action_steps is None: +# return self.select_actions(*args, **kwargs) +# if len(self._action_queue) == 0: +# # `select_actions` returns a (batch_size, n_action_steps, *) tensor, but the queue effectively has shape +# # (n_action_steps, batch_size, *), hence the transpose. +# self._action_queue.extend(self.select_actions(*args, **kwargs).transpose(0, 1)) +# return self._action_queue.popleft() + + + + class ActionChunkingTransformerPolicy(nn.Module): """ Action Chunking Transformer Policy as per Learning Fine-Grained Bimanual Manipulation with Low-Cost @@ -168,14 +229,16 @@ class ActionChunkingTransformerPolicy(nn.Module): nn.init.xavier_uniform_(p) @torch.no_grad() - def select_actions(self, batch, *_): + def select_action(self, batch, *_): # TODO(now): Implement queueing mechanism. self.eval() self._preprocess_batch(batch) # TODO(now): What's up with this 0.182? action = self.forward( - robot_state=batch["observation.state"] * 0.182, image=batch["observation.images.top"] + robot_state=batch["observation.state"] * 0.182, + image=batch["observation.images.top"], + return_loss=False, ) if self.cfg.temporal_agg: @@ -226,7 +289,7 @@ class ActionChunkingTransformerPolicy(nn.Module): assert batch_size % self.cfg.horizon == 0 assert batch_size % num_slices == 0 - loss = self.compute_loss(batch) + loss = self.forward(batch, return_loss=True)["loss"] loss.backward() grad_norm = torch.nn.utils.clip_grad_norm_( @@ -247,44 +310,38 @@ class ActionChunkingTransformerPolicy(nn.Module): return info - def compute_loss(self, batch): - loss_dict = self.forward( - robot_state=batch["observation.state"], - image=batch["observation.images.top"], - actions=batch["action"], - ) - loss = loss_dict["loss"] - return loss - - def forward(self, robot_state: Tensor, image: Tensor, actions: Tensor | None = None): + def forward(self, batch: dict[str, Tensor], return_loss: bool = False): # TODO(now): Maybe this shouldn't be here? normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - image = normalize(image) + images = normalize(batch["observation.images.top"]) - is_training = actions is not None - if is_training: # training time - actions = actions[:, : self.horizon] + if return_loss: # training time + actions_hat, (mu_hat, log_sigma_x2_hat) = self._forward( + batch["observation.state"], images, batch["action"] + ) - a_hat, (mu, log_sigma_x2) = self._forward(robot_state, image, actions) - - all_l1 = F.l1_loss(actions, a_hat, reduction="none") - l1 = all_l1.mean() + l1_loss = ( + F.l1_loss(batch["action"], actions_hat, reduction="none") + * ~batch["action_is_pad"].unsqueeze(-1) + ).mean() loss_dict = {} - loss_dict["l1"] = l1 + loss_dict["l1"] = l1_loss if self.cfg.use_vae: # Calculate Dₖₗ(latent_pdf || standard_normal). Note: After computing the KL-divergence for # each dimension independently, we sum over the latent dimension to get the total # KL-divergence per batch element, then take the mean over the batch. # (See App. B of https://arxiv.org/abs/1312.6114 for more details). 
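A quick numerical sanity check of the closed-form KL term used below against torch.distributions. The shapes are illustrative; this snippet is an aside, not part of the patch.

import torch
from torch.distributions import Normal, kl_divergence

mu = torch.randn(8, 32)
log_sigma_x2 = torch.randn(8, 32)  # log(sigma^2)
closed_form = (-0.5 * (1 + log_sigma_x2 - mu.pow(2) - log_sigma_x2.exp())).sum(-1)
reference = kl_divergence(Normal(mu, (0.5 * log_sigma_x2).exp()), Normal(0.0, 1.0)).sum(-1)
assert torch.allclose(closed_form, reference, atol=1e-4)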
- mean_kld = (-0.5 * (1 + log_sigma_x2 - mu.pow(2) - (log_sigma_x2).exp())).sum(-1).mean() + mean_kld = ( + (-0.5 * (1 + log_sigma_x2_hat - mu_hat.pow(2) - (log_sigma_x2_hat).exp())).sum(-1).mean() + ) loss_dict["kl"] = mean_kld loss_dict["loss"] = loss_dict["l1"] + loss_dict["kl"] * self.cfg.kl_weight else: loss_dict["loss"] = loss_dict["l1"] return loss_dict else: - action, _ = self._forward(robot_state, image) # no action, sample from prior + action, _ = self._forward(batch["observation.state"], images) return action def _forward(self, robot_state: Tensor, image: Tensor, actions: Tensor | None = None): @@ -321,7 +378,9 @@ class ActionChunkingTransformerPolicy(nn.Module): # Forward pass through VAE encoder and sample the latent with the reparameterization trick. cls_token_out = self.vae_encoder( vae_encoder_input.permute(1, 0, 2), pos_embed=pos_embed.permute(1, 0, 2) - )[0] # (B, D) + )[ + 0 + ] # (B, D) latent_pdf_params = self.vae_encoder_latent_output_proj(cls_token_out) mu = latent_pdf_params[:, : self.latent_dim] # This is 2log(sigma). Done this way to match the original implementation. diff --git a/lerobot/scripts/eval.py b/lerobot/scripts/eval.py index e7ba53fc..b05f9704 100644 --- a/lerobot/scripts/eval.py +++ b/lerobot/scripts/eval.py @@ -251,7 +251,7 @@ def eval(cfg: dict, out_dir=None, stats_path=None): dataset = make_dataset(cfg, stats_path=stats_path) logging.info("Making environment.") - env = make_env(cfg, num_parallel_envs=cfg.eval_episodes) + env = make_env(cfg, num_parallel_envs=cfg.rollout_batch_size) # when policy is None, rollout a random policy policy = make_policy(cfg) if cfg.policy.pretrained_model_path else None diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py index d49dfff8..81f3cdbc 100644 --- a/lerobot/scripts/train.py +++ b/lerobot/scripts/train.py @@ -148,7 +148,8 @@ def train(cfg: dict, out_dir=None, job_name=None): # ) logging.info("make_env") - env = make_env(cfg, num_parallel_envs=cfg.eval_episodes) + # TODO(now): uncomment + #env = make_env(cfg, num_parallel_envs=cfg.eval_episodes) logging.info("make_policy") policy = make_policy(cfg) diff --git a/scripts/convert_act_weights.py b/scripts/convert_act_weights.py index b1492009..d5e38796 100644 --- a/scripts/convert_act_weights.py +++ b/scripts/convert_act_weights.py @@ -28,22 +28,23 @@ for to_remove in start_removals: # Replace keys based on what they start with. 
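The list that follows maps old checkpoint key prefixes to their new names. The loop body that applies them is not shown in this excerpt, so the sketch below is an assumption about how such a prefix rename is typically done (rules applied in order, so a key stripped of "model." can be renamed again by a later rule).

def rename_state_dict_keys(state_dict: dict, start_replacements: list[tuple[str, str]]) -> dict:
    """Sketch only: rename keys by replacing matching prefixes, applying the rules in order."""
    out = {}
    for key, value in state_dict.items():
        for to_replace, replace_with in start_replacements:
            if key.startswith(to_replace):
                key = replace_with + key[len(to_replace):]
        out[key] = value
    return out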
start_replacements = [ - ("model.query_embed.weight", "model.pos_embed.weight"), - ("model.pos_table", "model.vae_encoder_pos_enc"), - ("model.pos_embed.weight", "model.decoder_pos_embed.weight"), - ("model.encoder.", "model.vae_encoder."), - ("model.encoder_action_proj.", "model.vae_encoder_action_input_proj."), - ("model.encoder_joint_proj.", "model.vae_encoder_robot_state_input_proj."), - ("model.latent_proj.", "model.vae_encoder_latent_output_proj."), - ("model.latent_proj.", "model.vae_encoder_latent_output_proj."), - ("model.input_proj.", "model.encoder_img_feat_input_proj."), - ("model.input_proj_robot_state", "model.encoder_robot_state_input_proj"), - ("model.latent_out_proj.", "model.encoder_latent_input_proj."), - ("model.transformer.encoder.", "model.encoder."), - ("model.transformer.decoder.", "model.decoder."), - ("model.backbones.0.0.body.", "model.backbone."), - ("model.additional_pos_embed.weight", "model.encoder_robot_and_latent_pos_embed.weight"), - ("model.cls_embed.weight", "model.vae_encoder_cls_embed.weight"), + ("model.", ""), + ("query_embed.weight", "pos_embed.weight"), + ("pos_table", "vae_encoder_pos_enc"), + ("pos_embed.weight", "decoder_pos_embed.weight"), + ("encoder.", "vae_encoder."), + ("encoder_action_proj.", "vae_encoder_action_input_proj."), + ("encoder_joint_proj.", "vae_encoder_robot_state_input_proj."), + ("latent_proj.", "vae_encoder_latent_output_proj."), + ("latent_proj.", "vae_encoder_latent_output_proj."), + ("input_proj.", "encoder_img_feat_input_proj."), + ("input_proj_robot_state", "encoder_robot_state_input_proj"), + ("latent_out_proj.", "encoder_latent_input_proj."), + ("transformer.encoder.", "encoder."), + ("transformer.decoder.", "decoder."), + ("backbones.0.0.body.", "backbone."), + ("additional_pos_embed.weight", "encoder_robot_and_latent_pos_embed.weight"), + ("cls_embed.weight", "vae_encoder_cls_embed.weight"), ] for to_replace, replace_with in start_replacements: From 1bab4a1dd5fab56f18306496077e7a9db9c9b2fc Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Mon, 8 Apr 2024 10:23:26 +0100 Subject: [PATCH 10/25] Eval reproduction works with gym_aloha --- lerobot/common/envs/factory.py | 2 +- lerobot/common/policies/act/policy.py | 130 +++++++++----------------- lerobot/common/policies/factory.py | 1 - lerobot/configs/policy/act.yaml | 2 +- lerobot/scripts/eval.py | 8 +- poetry.lock | 26 +++--- 6 files changed, 66 insertions(+), 103 deletions(-) diff --git a/lerobot/common/envs/factory.py b/lerobot/common/envs/factory.py index 971f4b63..749bb533 100644 --- a/lerobot/common/envs/factory.py +++ b/lerobot/common/envs/factory.py @@ -35,7 +35,7 @@ def make_env(cfg, num_parallel_envs=0) -> gym.Env | gym.vector.SyncVectorEnv: kwargs["task"] = cfg.env.task env_fn = lambda: gym.make( # noqa: E731 - "gym_aloha/AlohaInsertion-v0", + "gym_aloha/AlohaTransferCube-v0", **kwargs, ) else: diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index a9a5ac06..75d5ca0e 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -3,9 +3,10 @@ As per Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware (https://arxiv.org/abs/2304.13705). The majority of changes here involve removing unused code, unifying naming, and adding helpful comments. 
""" -from collections import deque + import math import time +from collections import deque from itertools import chain from typing import Callable @@ -22,67 +23,6 @@ from torchvision.ops.misc import FrozenBatchNorm2d from lerobot.common.utils import get_safe_torch_device -# class AbstractPolicy(nn.Module): -# """Base policy which all policies should be derived from. - -# The forward method should generally not be overriden as it plays the role of handling multi-step policies. See its -# documentation for more information. - -# Note: -# When implementing a concrete class (e.g. `AlohaDataset`, `PushtEnv`, `DiffusionPolicy`), you need to: -# 1. set the required class attributes: -# - for classes inheriting from `AbstractDataset`: `available_datasets` -# - for classes inheriting from `AbstractEnv`: `name`, `available_tasks` -# - for classes inheriting from `AbstractPolicy`: `name` -# 2. update variables in `lerobot/__init__.py` (e.g. `available_envs`, `available_datasets_per_envs`, `available_policies`) -# 3. update variables in `tests/test_available.py` by importing your new class -# """ - -# name: str | None = None # same name should be used to instantiate the policy in factory.py - -# def __init__(self, n_action_steps: int | None): -# """ -# n_action_steps: Sets the cache size for storing action trajectories. If None, it is assumed that a single -# action is returned by `select_actions` and that doesn't have a horizon dimension. The `forward` method then -# adds that dimension. -# """ -# super().__init__() -# assert self.name is not None, "Subclasses of `AbstractPolicy` should set the `name` class attribute." -# self.n_action_steps = n_action_steps -# self.clear_action_queue() - -# def clear_action_queue(self): -# """This should be called whenever the environment is reset.""" -# if self.n_action_steps is not None: -# self._action_queue = deque([], maxlen=self.n_action_steps) - -# def forward(self, fn) -> Tensor: -# """Inference step that makes multi-step policies compatible with their single-step environments. - -# WARNING: In general, this should not be overriden. - -# Consider a "policy" that observes the environment then charts a course of N actions to take. To make this fit -# into the formalism of a TorchRL environment, we view it as being effectively a policy that (1) makes an -# observation and prepares a queue of actions, (2) consumes that queue when queried, regardless of the environment -# observation, (3) repopulates the action queue when empty. This method handles the aforementioned logic so that -# the subclass doesn't have to. - -# This method effectively wraps the `select_actions` method of the subclass. The following assumptions are made: -# 1. The `select_actions` method returns a Tensor of actions with shape (B, H, *) where B is the batch size, H is -# the action trajectory horizon and * is the action dimensions. -# 2. Prior to the `select_actions` method being called, theres is an `n_action_steps` instance attribute defined. -# """ -# if self.n_action_steps is None: -# return self.select_actions(*args, **kwargs) -# if len(self._action_queue) == 0: -# # `select_actions` returns a (batch_size, n_action_steps, *) tensor, but the queue effectively has shape -# # (n_action_steps, batch_size, *), hence the transpose. 
-# self._action_queue.extend(self.select_actions(*args, **kwargs).transpose(0, 1)) -# return self._action_queue.popleft() - - - - class ActionChunkingTransformerPolicy(nn.Module): """ Action Chunking Transformer Policy as per Learning Fine-Grained Bimanual Manipulation with Low-Cost @@ -228,18 +168,30 @@ class ActionChunkingTransformerPolicy(nn.Module): if p.dim() > 1: nn.init.xavier_uniform_(p) - @torch.no_grad() - def select_action(self, batch, *_): - # TODO(now): Implement queueing mechanism. - self.eval() - self._preprocess_batch(batch) + def reset(self): + """This should be called whenever the environment is reset.""" + if self.n_action_steps is not None: + self._action_queue = deque([], maxlen=self.n_action_steps) - # TODO(now): What's up with this 0.182? - action = self.forward( - robot_state=batch["observation.state"] * 0.182, - image=batch["observation.images.top"], - return_loss=False, - ) + def select_action(self, batch: dict[str, Tensor], *_): + """ + This method wraps `select_actions` in order to return one action at a time for execution in the + environment. It works by managing the actions in a queue and only calling `select_actions` when the + queue is empty. + """ + if len(self._action_queue) == 0: + # `select_actions` returns a (batch_size, n_action_steps, *) tensor, but the queue effectively has shape + # (n_action_steps, batch_size, *), hence the transpose. + self._action_queue.extend(self.select_actions(batch).transpose(0, 1)) + return self._action_queue.popleft() + + @torch.no_grad() + def select_actions(self, batch: dict[str, Tensor]): + """Use the action chunking transformer to generate a sequence of actions.""" + self.eval() + self._preprocess_batch(batch, add_obs_steps_dim=True) + + action = self.forward(batch, return_loss=False) if self.cfg.temporal_agg: # TODO(rcadene): implement temporal aggregation @@ -257,25 +209,37 @@ class ActionChunkingTransformerPolicy(nn.Module): return action[: self.n_action_steps] def __call__(self, *args, **kwargs): - # TODO(now): Temporary bridge. + # TODO(now): Temporary bridge until we know what to do about the `update` method. return self.update(*args, **kwargs) - def _preprocess_batch(self, batch: dict[str, Tensor]) -> dict[str, Tensor]: + def _preprocess_batch( + self, batch: dict[str, Tensor], add_obs_steps_dim: bool = False + ) -> dict[str, Tensor]: """ - Expects batch to have (at least): + This function expects `batch` to have (at least): { - "observation.state": (B, 1, J) tensor of robot states (joint configuration) - - "observation.images.top": (B, 1, C, H, W) tensor of images. + "observation.state": (B, 1, J) OR (B, J) tensor of robot states (joint configuration). + "observation.images.top": (B, 1, C, H, W) OR (B, C, H, W) tensor of images. "action": (B, H, J) tensor of actions (positional target for robot joint configuration) "action_is_pad": (B, H) mask for whether the actions are padding outside of the episode bounds. } """ + if add_obs_steps_dim: + # Add a dimension for the observations steps. Since n_obs_steps > 1 is not supported right now, + # this just amounts to an unsqueeze. + for k in batch: + if k.startswith("observation."): + batch[k] = batch[k].unsqueeze(1) + if batch["observation.state"].shape[1] != 1: raise ValueError(self._multiple_obs_steps_not_handled_msg) batch["observation.state"] = batch["observation.state"].squeeze(1) - # TODO(alexander-soare): generalize this to multiple images. 
Note: no squeeze is required for - # "observation.images.top" because then we'd have to unsqueeze to get get the image index dimension. + # TODO(alexander-soare): generalize this to multiple images. + assert ( + sum(k.startswith("observation.images.") and not k.endswith("is_pad") for k in batch) == 1 + ), "ACT only handles one image for now." + # Note: no squeeze is required for "observation.images.top" because then we'd have to unsqueeze to get + # the image index dimension. def update(self, batch, *_): start_time = time.time() @@ -378,9 +342,7 @@ class ActionChunkingTransformerPolicy(nn.Module): # Forward pass through VAE encoder and sample the latent with the reparameterization trick. cls_token_out = self.vae_encoder( vae_encoder_input.permute(1, 0, 2), pos_embed=pos_embed.permute(1, 0, 2) - )[ - 0 - ] # (B, D) + )[0] # (B, D) latent_pdf_params = self.vae_encoder_latent_output_proj(cls_token_out) mu = latent_pdf_params[:, : self.latent_dim] # This is 2log(sigma). Done this way to match the original implementation. diff --git a/lerobot/common/policies/factory.py b/lerobot/common/policies/factory.py index 90e7ecc1..cc956014 100644 --- a/lerobot/common/policies/factory.py +++ b/lerobot/common/policies/factory.py @@ -26,7 +26,6 @@ def make_policy(cfg): policy = ActionChunkingTransformerPolicy( cfg.policy, cfg.device, - n_obs_steps=cfg.n_obs_steps, n_action_steps=cfg.n_action_steps, ) else: diff --git a/lerobot/configs/policy/act.yaml b/lerobot/configs/policy/act.yaml index c1d1801f..80f50003 100644 --- a/lerobot/configs/policy/act.yaml +++ b/lerobot/configs/policy/act.yaml @@ -58,6 +58,6 @@ policy: action_dim: ??? delta_timestamps: - observation.image: [0.0] + observation.images.top: [0.0] observation.state: [0.0] action: [0.0, 0.02, 0.04, 0.06, 0.08, 0.1, 0.12, 0.14, 0.16, 0.18, 0.2, 0.22, 0.24, 0.26, 0.28, 0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.70, 0.72, 0.74, 0.76, 0.78, 0.8, 0.82, 0.84, 0.86, 0.88, 0.9, 0.92, 0.94, 0.96, 0.98, 1.0, 1.02, 1.04, 1.06, 1.08, 1.1, 1.12, 1.14, 1.16, 1.18, 1.2, 1.22, 1.24, 1.26, 1.28, 1.3, 1.32, 1.34, 1.36, 1.38, 1.40, 1.42, 1.44, 1.46, 1.48, 1.5, 1.52, 1.54, 1.56, 1.58, 1.6, 1.62, 1.64, 1.66, 1.68, 1.7, 1.72, 1.74, 1.76, 1.78, 1.8, 1.82, 1.84, 1.86, 1.88, 1.90, 1.92, 1.94, 1.96, 1.98] diff --git a/lerobot/scripts/eval.py b/lerobot/scripts/eval.py index b05f9704..b43f4ed1 100644 --- a/lerobot/scripts/eval.py +++ b/lerobot/scripts/eval.py @@ -89,7 +89,9 @@ def eval_policy( visu = env.envs[0].render(mode="visualization") visu = visu[None, ...] # add batch dim else: - visu = np.stack([env.render(mode="visualization") for env in env.envs]) + # TODO(now): Put mode back in. + visu = np.stack([env.render() for env in env.envs]) + # visu = np.stack([env.render(mode="visualization") for env in env.envs]) ep_frames.append(visu) # noqa: B023 for _ in range(num_episodes): @@ -248,7 +250,7 @@ def eval(cfg: dict, out_dir=None, stats_path=None): logging.info("Making transforms.") # TODO(alexander-soare): Completely decouple datasets from evaluation. - dataset = make_dataset(cfg, stats_path=stats_path) + transform = make_dataset(cfg, stats_path=stats_path).transform logging.info("Making environment.") env = make_env(cfg, num_parallel_envs=cfg.rollout_batch_size) @@ -263,7 +265,7 @@ def eval(cfg: dict, out_dir=None, stats_path=None): video_dir=Path(out_dir) / "eval", fps=cfg.env.fps, # TODO(rcadene): what should we do with the transform? 
- transform=dataset.transform, + transform=transform, seed=cfg.seed, ) print(info["aggregated"]) diff --git a/poetry.lock b/poetry.lock index f96f66bc..60354b8a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -941,7 +941,7 @@ mujoco = "^2.3.7" type = "git" url = "git@github.com:huggingface/gym-xarm.git" reference = "HEAD" -resolved_reference = "2eb83fc4fc871b9d271c946d169e42f226ac3a7c" +resolved_reference = "08ddd5a9400783a6898bbf3c3014fc5da3961b9d" [[package]] name = "gymnasium" @@ -1709,20 +1709,20 @@ pyopengl = "*" [[package]] name = "networkx" -version = "3.2.1" +version = "3.3" description = "Python package for creating and manipulating graphs and networks" optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" files = [ - {file = "networkx-3.2.1-py3-none-any.whl", hash = "sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2"}, - {file = "networkx-3.2.1.tar.gz", hash = "sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6"}, + {file = "networkx-3.3-py3-none-any.whl", hash = "sha256:28575580c6ebdaf4505b22c6256a2b9de86b316dc63ba9e93abde3d78dfdbcf2"}, + {file = "networkx-3.3.tar.gz", hash = "sha256:0c127d8b2f4865f59ae9cb8aafcd60b5c70f3241ebd66f7defad7c4ab90126c9"}, ] [package.extras] -default = ["matplotlib (>=3.5)", "numpy (>=1.22)", "pandas (>=1.4)", "scipy (>=1.9,!=1.11.0,!=1.11.1)"] -developer = ["changelist (==0.4)", "mypy (>=1.1)", "pre-commit (>=3.2)", "rtoml"] -doc = ["nb2plots (>=0.7)", "nbconvert (<7.9)", "numpydoc (>=1.6)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.14)", "sphinx (>=7)", "sphinx-gallery (>=0.14)", "texext (>=0.6.7)"] -extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.11)", "sympy (>=1.10)"] +default = ["matplotlib (>=3.6)", "numpy (>=1.23)", "pandas (>=1.4)", "scipy (>=1.9,!=1.11.0,!=1.11.1)"] +developer = ["changelist (==0.5)", "mypy (>=1.1)", "pre-commit (>=3.2)", "rtoml"] +doc = ["myst-nb (>=1.0)", "numpydoc (>=1.7)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.14)", "sphinx (>=7)", "sphinx-gallery (>=0.14)", "texext (>=0.6.7)"] +extra = ["lxml (>=4.6)", "pydot (>=2.0)", "pygraphviz (>=1.12)", "sympy (>=1.10)"] test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"] [[package]] @@ -3699,20 +3699,20 @@ watchdog = ["watchdog (>=2.3)"] [[package]] name = "zarr" -version = "2.17.1" +version = "2.17.2" description = "An implementation of chunked, compressed, N-dimensional arrays for Python" optional = false python-versions = ">=3.9" files = [ - {file = "zarr-2.17.1-py3-none-any.whl", hash = "sha256:e25df2741a6e92645f3890f30f3136d5b57a0f8f831094b024bbcab5f2797bc7"}, - {file = "zarr-2.17.1.tar.gz", hash = "sha256:564b3aa072122546fe69a0fa21736f466b20fad41754334b62619f088ce46261"}, + {file = "zarr-2.17.2-py3-none-any.whl", hash = "sha256:70d7cc07c24280c380ef80644151d136b7503b0d83c9f214e8000ddc0f57f69b"}, + {file = "zarr-2.17.2.tar.gz", hash = "sha256:2cbaa6cb4e342d45152d4a7a4b2013c337fcd3a8e7bc98253560180de60552ce"}, ] [package.dependencies] asciitree = "*" fasteners = {version = "*", markers = "sys_platform != \"emscripten\""} numcodecs = ">=0.10.0" -numpy = ">=1.21.1" +numpy = ">=1.23" [package.extras] docs = ["numcodecs[msgpack]", "numpydoc", "pydata-sphinx-theme", "sphinx", "sphinx-automodapi", "sphinx-copybutton", "sphinx-design", "sphinx-issues"] From 863f28ffd8883cf0b21ebc4bd4f57c327ecb0cd2 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Mon, 8 Apr 2024 13:10:19 +0100 Subject: [PATCH 11/25] ready for review --- lerobot/common/datasets/factory.py | 88 +-------------------- 
lerobot/common/policies/abstract.py | 76 ++++++++++++++++++ lerobot/common/policies/act/policy.py | 9 ++- lerobot/common/policies/diffusion/policy.py | 2 - lerobot/configs/policy/act.yaml | 10 ++- lerobot/scripts/eval.py | 4 +- scripts/convert_act_weights.py | 71 ----------------- 7 files changed, 92 insertions(+), 168 deletions(-) delete mode 100644 scripts/convert_act_weights.py diff --git a/lerobot/common/datasets/factory.py b/lerobot/common/datasets/factory.py index ed7854ff..c22ae698 100644 --- a/lerobot/common/datasets/factory.py +++ b/lerobot/common/datasets/factory.py @@ -59,96 +59,10 @@ def make_dataset( transform=Prod(in_keys=clsfunc.image_keys, prod=1 / 255.0), ) stats = compute_or_load_stats(stats_dataset) + # TODO(rcadene): remove this and put it in config. Ideally we want to reproduce SOTA results just with mean_std normalization_mode = "mean_std" if cfg.env.name == "aloha" else "min_max" - # # TODO(now): These stats are needed to use their pretrained model for sim_transfer_cube_human. - # # (Pdb) stats['observation']['state']['mean'] - # # tensor([-0.0071, -0.6293, 1.0351, -0.0517, -0.4642, -0.0754, 0.4751, -0.0373, - # # -0.3324, 0.9034, -0.2258, -0.3127, -0.2412, 0.6866]) - # stats["observation", "state", "mean"] = torch.tensor( - # [ - # -0.00740268, - # -0.63187766, - # 1.0356655, - # -0.05027218, - # -0.46199223, - # -0.07467502, - # 0.47467607, - # -0.03615446, - # -0.33203387, - # 0.9038929, - # -0.22060776, - # -0.31011587, - # -0.23484458, - # 0.6842416, - # ] - # ) - # # (Pdb) stats['observation']['state']['std'] - # # tensor([0.0022, 0.0520, 0.0291, 0.0092, 0.0267, 0.0145, 0.0563, 0.0179, 0.0494, - # # 0.0326, 0.0476, 0.0535, 0.0956, 0.0513]) - # stats["observation", "state", "std"] = torch.tensor( - # [ - # 0.01219023, - # 0.2975381, - # 0.16728032, - # 0.04733803, - # 0.1486037, - # 0.08788499, - # 0.31752336, - # 0.1049916, - # 0.27933604, - # 0.18094037, - # 0.26604933, - # 0.30466506, - # 0.5298686, - # 0.25505227, - # ] - # ) - # # (Pdb) stats['action']['mean'] - # # tensor([-0.0075, -0.6346, 1.0353, -0.0465, -0.4686, -0.0738, 0.3723, -0.0396, - # # -0.3184, 0.8991, -0.2065, -0.3182, -0.2338, 0.5593]) - # stats["action"]["mean"] = torch.tensor( - # [ - # -0.00756444, - # -0.6281845, - # 1.0312834, - # -0.04664314, - # -0.47211358, - # -0.074527, - # 0.37389806, - # -0.03718753, - # -0.3261143, - # 0.8997205, - # -0.21371077, - # -0.31840396, - # -0.23360962, - # 0.551947, - # ] - # ) - # # (Pdb) stats['action']['std'] - # # tensor([0.0023, 0.0514, 0.0290, 0.0086, 0.0263, 0.0143, 0.0593, 0.0185, 0.0510, - # # 0.0328, 0.0478, 0.0531, 0.0945, 0.0794]) - # stats["action"]["std"] = torch.tensor( - # [ - # 0.01252818, - # 0.2957442, - # 0.16701928, - # 0.04584508, - # 0.14833844, - # 0.08763024, - # 0.30665937, - # 0.10600077, - # 0.27572668, - # 0.1805853, - # 0.26304692, - # 0.30708534, - # 0.5305411, - # 0.38381037, - # ] - # ) - # transforms.append(NormalizeTransform(stats, in_keys, mode=normalization_mode)) # noqa: F821 - transforms = v2.Compose( [ # TODO(rcadene): we need to do something about image_keys diff --git a/lerobot/common/policies/abstract.py b/lerobot/common/policies/abstract.py index beebd8ac..6dc72bef 100644 --- a/lerobot/common/policies/abstract.py +++ b/lerobot/common/policies/abstract.py @@ -4,3 +4,79 @@ import torch from torch import Tensor, nn +class AbstractPolicy(nn.Module): + """Base policy which all policies should be derived from. 
+ + The forward method should generally not be overriden as it plays the role of handling multi-step policies. See its + documentation for more information. + + Note: + When implementing a concrete class (e.g. `AlohaDataset`, `PushtEnv`, `DiffusionPolicy`), you need to: + 1. set the required class attributes: + - for classes inheriting from `AbstractDataset`: `available_datasets` + - for classes inheriting from `AbstractEnv`: `name`, `available_tasks` + - for classes inheriting from `AbstractPolicy`: `name` + 2. update variables in `lerobot/__init__.py` (e.g. `available_envs`, `available_datasets_per_envs`, `available_policies`) + 3. update variables in `tests/test_available.py` by importing your new class + """ + + name: str | None = None # same name should be used to instantiate the policy in factory.py + + def __init__(self, n_action_steps: int | None): + """ + n_action_steps: Sets the cache size for storing action trajectories. If None, it is assumed that a single + action is returned by `select_actions` and that doesn't have a horizon dimension. The `forward` method then + adds that dimension. + """ + super().__init__() + assert self.name is not None, "Subclasses of `AbstractPolicy` should set the `name` class attribute." + self.n_action_steps = n_action_steps + self.clear_action_queue() + + def update(self, replay_buffer, step): + """One step of the policy's learning algorithm.""" + raise NotImplementedError("Abstract method") + + def save(self, fp): + torch.save(self.state_dict(), fp) + + def load(self, fp): + d = torch.load(fp) + self.load_state_dict(d) + + def select_actions(self, observation) -> Tensor: + """Select an action (or trajectory of actions) based on an observation during rollout. + + If n_action_steps was provided at initialization, this should return a (batch_size, n_action_steps, *) tensor of + actions. Otherwise if n_actions_steps is None, this should return a (batch_size, *) tensor of actions. + """ + raise NotImplementedError("Abstract method") + + def clear_action_queue(self): + """This should be called whenever the environment is reset.""" + if self.n_action_steps is not None: + self._action_queue = deque([], maxlen=self.n_action_steps) + + def forward(self, *args, **kwargs) -> Tensor: + """Inference step that makes multi-step policies compatible with their single-step environments. + + WARNING: In general, this should not be overriden. + + Consider a "policy" that observes the environment then charts a course of N actions to take. To make this fit + into the formalism of a TorchRL environment, we view it as being effectively a policy that (1) makes an + observation and prepares a queue of actions, (2) consumes that queue when queried, regardless of the environment + observation, (3) repopulates the action queue when empty. This method handles the aforementioned logic so that + the subclass doesn't have to. + + This method effectively wraps the `select_actions` method of the subclass. The following assumptions are made: + 1. The `select_actions` method returns a Tensor of actions with shape (B, H, *) where B is the batch size, H is + the action trajectory horizon and * is the action dimensions. + 2. Prior to the `select_actions` method being called, theres is an `n_action_steps` instance attribute defined. 
+ """ + if self.n_action_steps is None: + return self.select_actions(*args, **kwargs) + if len(self._action_queue) == 0: + # `select_actions` returns a (batch_size, n_action_steps, *) tensor, but the queue effectively has shape + # (n_action_steps, batch_size, *), hence the transpose. + self._action_queue.extend(self.select_actions(*args, **kwargs).transpose(0, 1)) + return self._action_queue.popleft() diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index 75d5ca0e..834dd9b2 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -67,7 +67,7 @@ class ActionChunkingTransformerPolicy(nn.Module): def __init__(self, cfg, device, n_action_steps=1): """ - TODO(alexander-soare): Add documentation for all parameters. + TODO(alexander-soare): Add documentation for all parameters once we have model configs established. """ super().__init__() if getattr(cfg, "n_obs_steps", 1) != 1: @@ -109,6 +109,9 @@ class ActionChunkingTransformerPolicy(nn.Module): ) # Backbone for image feature extraction. + self.image_normalizer = transforms.Normalize( + mean=cfg.image_normalization.mean, std=cfg.image_normalization.std + ) backbone_model = getattr(torchvision.models, cfg.backbone)( replace_stride_with_dilation=[False, False, cfg.dilation], pretrained=cfg.pretrained_backbone, @@ -275,9 +278,7 @@ class ActionChunkingTransformerPolicy(nn.Module): return info def forward(self, batch: dict[str, Tensor], return_loss: bool = False): - # TODO(now): Maybe this shouldn't be here? - normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - images = normalize(batch["observation.images.top"]) + images = self.image_normalizer(batch["observation.images.top"]) if return_loss: # training time actions_hat, (mu_hat, log_sigma_x2_hat) = self._forward( diff --git a/lerobot/common/policies/diffusion/policy.py b/lerobot/common/policies/diffusion/policy.py index 93e5ba5d..9785358b 100644 --- a/lerobot/common/policies/diffusion/policy.py +++ b/lerobot/common/policies/diffusion/policy.py @@ -151,7 +151,6 @@ class DiffusionPolicy(nn.Module): self.diffusion.train() - data_s = time.time() - start_time loss = self.diffusion.compute_loss(batch) loss.backward() @@ -172,7 +171,6 @@ class DiffusionPolicy(nn.Module): "loss": loss.item(), "grad_norm": float(grad_norm), "lr": self.lr_scheduler.get_last_lr()[0], - "data_s": data_s, "update_s": time.time() - start_time, } diff --git a/lerobot/configs/policy/act.yaml b/lerobot/configs/policy/act.yaml index 80f50003..cd34d115 100644 --- a/lerobot/configs/policy/act.yaml +++ b/lerobot/configs/policy/act.yaml @@ -1,6 +1,6 @@ # @package _global_ -offline_steps: 2000 +offline_steps: 80000 online_steps: 0 eval_episodes: 1 @@ -54,8 +54,12 @@ policy: temporal_agg: false - state_dim: ??? - action_dim: ??? + state_dim: 14 + action_dim: 14 + + image_normalization: + mean: [0.485, 0.456, 0.406] + std: [0.229, 0.224, 0.225] delta_timestamps: observation.images.top: [0.0] diff --git a/lerobot/scripts/eval.py b/lerobot/scripts/eval.py index b43f4ed1..72966211 100644 --- a/lerobot/scripts/eval.py +++ b/lerobot/scripts/eval.py @@ -86,7 +86,9 @@ def eval_policy( def maybe_render_frame(env): if save_video: # noqa: B023 if return_first_video: - visu = env.envs[0].render(mode="visualization") + # TODO(now): Put mode back in. + visu = env.envs[0].render() + # visu = env.envs[0].render(mode="visualization") visu = visu[None, ...] # add batch dim else: # TODO(now): Put mode back in. 
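As a standalone illustration (not part of any patch in this series) of the queueing behaviour documented in `AbstractPolicy.forward` above: the sketch below mirrors that logic with made-up names, so `ToyChunkPolicy`, its dimensions, and the zero-valued actions are illustrative stand-ins rather than code from this repository.

from collections import deque

import torch
from torch import Tensor, nn


class ToyChunkPolicy(nn.Module):
    """Illustrative policy that plans a chunk of n_action_steps actions and serves them one call at a time."""

    def __init__(self, n_action_steps: int, action_dim: int = 2):
        super().__init__()
        self.n_action_steps = n_action_steps
        self.action_dim = action_dim
        self._action_queue = deque([], maxlen=n_action_steps)

    def select_actions(self, observation: Tensor) -> Tensor:
        # Return a (batch_size, n_action_steps, action_dim) chunk; a real policy would condition on `observation`.
        return torch.zeros(observation.shape[0], self.n_action_steps, self.action_dim)

    def forward(self, observation: Tensor) -> Tensor:
        # Repopulate the queue only when it is empty, then pop a single (batch_size, action_dim) action.
        if len(self._action_queue) == 0:
            # The queue stores per-step batches, hence the transpose to (n_action_steps, batch_size, action_dim).
            self._action_queue.extend(self.select_actions(observation).transpose(0, 1))
        return self._action_queue.popleft()


obs = torch.zeros(4, 8)  # (batch_size, obs_dim)
policy = ToyChunkPolicy(n_action_steps=3)
actions = [policy(obs) for _ in range(5)]  # select_actions only runs on calls 1 and 4
assert all(a.shape == (4, 2) for a in actions)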
diff --git a/scripts/convert_act_weights.py b/scripts/convert_act_weights.py deleted file mode 100644 index d5e38796..00000000 --- a/scripts/convert_act_weights.py +++ /dev/null @@ -1,71 +0,0 @@ -import torch - -from lerobot.common.policies.factory import make_policy -from lerobot.common.utils import init_hydra_config - -cfg = init_hydra_config( - "/home/alexander/Projects/lerobot/outputs/train/act_aloha_sim_transfer_cube_human/.hydra/config.yaml" -) - -policy = make_policy(cfg) - -state_dict = torch.load("/home/alexander/Projects/act/outputs/sim_transfer_cube_human_vae/policy_last.ckpt") - -# Remove keys based on what they start with. - -start_removals = [ - # There is a bug that means the pretrained model doesn't even use the final decoder layers. - *[f"model.transformer.decoder.layers.{i}" for i in range(1, 7)], - "model.is_pad_head.", -] - -for to_remove in start_removals: - for k in list(state_dict.keys()): - if k.startswith(to_remove): - del state_dict[k] - - -# Replace keys based on what they start with. - -start_replacements = [ - ("model.", ""), - ("query_embed.weight", "pos_embed.weight"), - ("pos_table", "vae_encoder_pos_enc"), - ("pos_embed.weight", "decoder_pos_embed.weight"), - ("encoder.", "vae_encoder."), - ("encoder_action_proj.", "vae_encoder_action_input_proj."), - ("encoder_joint_proj.", "vae_encoder_robot_state_input_proj."), - ("latent_proj.", "vae_encoder_latent_output_proj."), - ("latent_proj.", "vae_encoder_latent_output_proj."), - ("input_proj.", "encoder_img_feat_input_proj."), - ("input_proj_robot_state", "encoder_robot_state_input_proj"), - ("latent_out_proj.", "encoder_latent_input_proj."), - ("transformer.encoder.", "encoder."), - ("transformer.decoder.", "decoder."), - ("backbones.0.0.body.", "backbone."), - ("additional_pos_embed.weight", "encoder_robot_and_latent_pos_embed.weight"), - ("cls_embed.weight", "vae_encoder_cls_embed.weight"), -] - -for to_replace, replace_with in start_replacements: - for k in list(state_dict.keys()): - if k.startswith(to_replace): - k_ = replace_with + k.removeprefix(to_replace) - state_dict[k_] = state_dict[k] - del state_dict[k] - - -missing_keys, unexpected_keys = policy.load_state_dict(state_dict, strict=False) - -if len(missing_keys) != 0: - print("MISSING KEYS") - print(missing_keys) -if len(unexpected_keys) != 0: - print("UNEXPECTED KEYS") - print(unexpected_keys) - -# if len(missing_keys) != 0 or len(unexpected_keys) != 0: -# print("Failed due to mismatch in state dicts.") -# exit() - -policy.save("/tmp/weights.pth") From 86365adf9fd909c5037f0a3a00a0e1d706a44c61 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Mon, 8 Apr 2024 14:44:10 +0100 Subject: [PATCH 13/25] revision --- lerobot/common/envs/factory.py | 12 +++++++----- lerobot/common/policies/act/policy.py | 19 +++++++++++-------- lerobot/configs/policy/act.yaml | 2 +- lerobot/scripts/train.py | 3 +-- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/lerobot/common/envs/factory.py b/lerobot/common/envs/factory.py index 749bb533..bcbdb95d 100644 --- a/lerobot/common/envs/factory.py +++ b/lerobot/common/envs/factory.py @@ -32,12 +32,14 @@ def make_env(cfg, num_parallel_envs=0) -> gym.Env | gym.vector.SyncVectorEnv: elif cfg.env.name == "aloha": import gym_aloha # noqa: F401 - kwargs["task"] = cfg.env.task + if cfg.env.task == "sim_transfer_cube": + env_name = "gym_aloha/AlohaTransferCube-v0" + elif cfg.env.task == "sim_insertion": + env_name = "gym_aloha/AlohaInsertion-v0" + else: + raise ValueError(f"`{cfg.env.task}` has no environment 
implementation.") - env_fn = lambda: gym.make( # noqa: E731 - "gym_aloha/AlohaTransferCube-v0", - **kwargs, - ) + env_fn = lambda: gym.make(env_name, **kwargs) # noqa: E731 else: raise ValueError(cfg.env.name) diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index 834dd9b2..7fb03576 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -337,18 +337,21 @@ class ActionChunkingTransformerPolicy(nn.Module): robot_state_embed = self.vae_encoder_robot_state_input_proj(robot_state).unsqueeze(1) # (B, 1, D) action_embed = self.vae_encoder_action_input_proj(actions) # (B, S, D) vae_encoder_input = torch.cat([cls_embed, robot_state_embed, action_embed], axis=1) # (B, S+2, D) - # Note: detach() shouldn't be necessary but leaving it the same as the original code just in case. + # Prepare fixed positional embedding. + # Note: detach() shouldn't be necessary but leaving it the same as the original code just in case. pos_embed = self.vae_encoder_pos_enc.clone().detach() # (1, S+2, D) - # Forward pass through VAE encoder and sample the latent with the reparameterization trick. + + # Forward pass through VAE encoder. cls_token_out = self.vae_encoder( vae_encoder_input.permute(1, 0, 2), pos_embed=pos_embed.permute(1, 0, 2) - )[0] # (B, D) + )[0] # select the class token, with shape (B, D) latent_pdf_params = self.vae_encoder_latent_output_proj(cls_token_out) + + # Sample the latent with the reparameterization trick. mu = latent_pdf_params[:, : self.latent_dim] # This is 2log(sigma). Done this way to match the original implementation. log_sigma_x2 = latent_pdf_params[:, self.latent_dim :] - # Use reparameterization trick to sample from the latent's PDF. latent_sample = mu + log_sigma_x2.div(2).exp() * torch.randn_like(mu) else: # When not using the VAE encoder, we set the latent to be all zeros. @@ -469,7 +472,7 @@ class _TransformerEncoderLayer(nn.Module): if self.normalize_before: x = self.norm1(x) q = k = x if pos_embed is None else x + pos_embed - x = self.self_attn(q, k, value=x)[0] + x = self.self_attn(q, k, value=x)[0] # select just the output, not the attention weights x = skip + self.dropout1(x) if self.normalize_before: skip = x @@ -563,7 +566,7 @@ class _TransformerDecoderLayer(nn.Module): if self.normalize_before: x = self.norm1(x) q = k = self.maybe_add_pos_embed(x, decoder_pos_embed) - x = self.self_attn(q, k, value=x)[0] + x = self.self_attn(q, k, value=x)[0] # select just the output, not the attention weights x = skip + self.dropout1(x) if self.normalize_before: skip = x @@ -575,7 +578,7 @@ class _TransformerDecoderLayer(nn.Module): query=self.maybe_add_pos_embed(x, decoder_pos_embed), key=self.maybe_add_pos_embed(encoder_out, encoder_pos_embed), value=encoder_out, - )[0] + )[0] # select just the output, not the attention weights x = skip + self.dropout2(x) if self.normalize_before: skip = x @@ -634,7 +637,7 @@ class _SinusoidalPositionEmbedding2D(nn.Module): Returns: A (1, C, H, W) batch of corresponding sinusoidal positional embeddings. """ - not_mask = torch.ones_like(x[0, [0]]) # (1, H, W) + not_mask = torch.ones_like(x[0, :1]) # (1, H, W) # Note: These are like range(1, H+1) and range(1, W+1) respectively, but in most implementations # they would be range(0, H) and range(0, W). Keeping it at as to match the original code. 
y_range = not_mask.cumsum(1, dtype=torch.float32) diff --git a/lerobot/configs/policy/act.yaml b/lerobot/configs/policy/act.yaml index cd34d115..79729a02 100644 --- a/lerobot/configs/policy/act.yaml +++ b/lerobot/configs/policy/act.yaml @@ -64,4 +64,4 @@ policy: delta_timestamps: observation.images.top: [0.0] observation.state: [0.0] - action: [0.0, 0.02, 0.04, 0.06, 0.08, 0.1, 0.12, 0.14, 0.16, 0.18, 0.2, 0.22, 0.24, 0.26, 0.28, 0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.70, 0.72, 0.74, 0.76, 0.78, 0.8, 0.82, 0.84, 0.86, 0.88, 0.9, 0.92, 0.94, 0.96, 0.98, 1.0, 1.02, 1.04, 1.06, 1.08, 1.1, 1.12, 1.14, 1.16, 1.18, 1.2, 1.22, 1.24, 1.26, 1.28, 1.3, 1.32, 1.34, 1.36, 1.38, 1.40, 1.42, 1.44, 1.46, 1.48, 1.5, 1.52, 1.54, 1.56, 1.58, 1.6, 1.62, 1.64, 1.66, 1.68, 1.7, 1.72, 1.74, 1.76, 1.78, 1.8, 1.82, 1.84, 1.86, 1.88, 1.90, 1.92, 1.94, 1.96, 1.98] + action: "[i / ${fps} for i in range(${horizon})]" diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py index d49dfff8..caaf5182 100644 --- a/lerobot/scripts/train.py +++ b/lerobot/scripts/train.py @@ -152,7 +152,6 @@ def train(cfg: dict, out_dir=None, job_name=None): logging.info("make_policy") policy = make_policy(cfg) - policy.save("act.pt") num_learnable_params = sum(p.numel() for p in policy.parameters() if p.requires_grad) num_total_params = sum(p.numel() for p in policy.parameters()) @@ -198,7 +197,7 @@ def train(cfg: dict, out_dir=None, job_name=None): is_offline = True dataloader = torch.utils.data.DataLoader( dataset, - num_workers=0, + num_workers=4, batch_size=cfg.policy.batch_size, shuffle=True, pin_memory=cfg.device != "cpu", From 62b18a7607d955eed60ba7eff70b71162f5acaf2 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Mon, 8 Apr 2024 14:51:45 +0100 Subject: [PATCH 14/25] Add type hints --- lerobot/common/policies/act/policy.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index 7fb03576..e14a1e88 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -176,7 +176,7 @@ class ActionChunkingTransformerPolicy(nn.Module): if self.n_action_steps is not None: self._action_queue = deque([], maxlen=self.n_action_steps) - def select_action(self, batch: dict[str, Tensor], *_): + def select_action(self, batch: dict[str, Tensor], *_) -> Tensor: """ This method wraps `select_actions` in order to return one action at a time for execution in the environment. It works by managing the actions in a queue and only calling `select_actions` when the @@ -189,7 +189,7 @@ class ActionChunkingTransformerPolicy(nn.Module): return self._action_queue.popleft() @torch.no_grad() - def select_actions(self, batch: dict[str, Tensor]): + def select_actions(self, batch: dict[str, Tensor]) -> Tensor: """Use the action chunking transformer to generate a sequence of actions.""" self.eval() self._preprocess_batch(batch, add_obs_steps_dim=True) @@ -211,7 +211,7 @@ class ActionChunkingTransformerPolicy(nn.Module): return action[: self.n_action_steps] - def __call__(self, *args, **kwargs): + def __call__(self, *args, **kwargs) -> dict: # TODO(now): Temporary bridge until we know what to do about the `update` method. 
return self.update(*args, **kwargs) @@ -244,7 +244,7 @@ class ActionChunkingTransformerPolicy(nn.Module): # Note: no squeeze is required for "observation.images.top" because then we'd have to unsqueeze to get # the image index dimension. - def update(self, batch, *_): + def update(self, batch, *_) -> dict: start_time = time.time() self._preprocess_batch(batch) @@ -277,7 +277,7 @@ class ActionChunkingTransformerPolicy(nn.Module): return info - def forward(self, batch: dict[str, Tensor], return_loss: bool = False): + def forward(self, batch: dict[str, Tensor], return_loss: bool = False) -> dict | Tensor: images = self.image_normalizer(batch["observation.images.top"]) if return_loss: # training time @@ -309,7 +309,9 @@ class ActionChunkingTransformerPolicy(nn.Module): action, _ = self._forward(batch["observation.state"], images) return action - def _forward(self, robot_state: Tensor, image: Tensor, actions: Tensor | None = None): + def _forward( + self, robot_state: Tensor, image: Tensor, actions: Tensor | None = None + ) -> tuple[Tensor, tuple[Tensor, Tensor]]: """ Args: robot_state: (B, J) batch of robot joint configurations. @@ -410,7 +412,7 @@ class ActionChunkingTransformerPolicy(nn.Module): actions = self.action_head(decoder_out) - return actions, [mu, log_sigma_x2] + return actions, (mu, log_sigma_x2) def save(self, fp): torch.save(self.state_dict(), fp) From 0b4c42f4ffa6c0efcaf30f8b407789150bc001d2 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Mon, 8 Apr 2024 14:59:37 +0100 Subject: [PATCH 15/25] typos --- lerobot/common/policies/act/policy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lerobot/common/policies/act/policy.py b/lerobot/common/policies/act/policy.py index e14a1e88..b8276214 100644 --- a/lerobot/common/policies/act/policy.py +++ b/lerobot/common/policies/act/policy.py @@ -641,7 +641,7 @@ class _SinusoidalPositionEmbedding2D(nn.Module): """ not_mask = torch.ones_like(x[0, :1]) # (1, H, W) # Note: These are like range(1, H+1) and range(1, W+1) respectively, but in most implementations - # they would be range(0, H) and range(0, W). Keeping it at as to match the original code. + # they would be range(0, H) and range(0, W). Keeping it at as is to match the original code. y_range = not_mask.cumsum(1, dtype=torch.float32) x_range = not_mask.cumsum(2, dtype=torch.float32) @@ -659,7 +659,7 @@ class _SinusoidalPositionEmbedding2D(nn.Module): y_range = y_range.unsqueeze(-1) / inverse_frequency # (1, H, W, 1) # Note: this stack then flatten operation results in interleaved sine and cosine terms. - # pos_embed_x and pos_embed are (1, H, W, C // 2). + # pos_embed_x and pos_embed_y are (1, H, W, C // 2). 
pos_embed_x = torch.stack((x_range[..., 0::2].sin(), x_range[..., 1::2].cos()), dim=-1).flatten(3) pos_embed_y = torch.stack((y_range[..., 0::2].sin(), y_range[..., 1::2].cos()), dim=-1).flatten(3) pos_embed = torch.cat((pos_embed_y, pos_embed_x), dim=3).permute(0, 3, 1, 2) # (1, C, H, W) From 91e0e4e175236b859cdc463d8d3418b22d9c2ef8 Mon Sep 17 00:00:00 2001 From: Alexander Soare Date: Mon, 8 Apr 2024 15:05:40 +0100 Subject: [PATCH 16/25] rever change --- lerobot/scripts/eval.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lerobot/scripts/eval.py b/lerobot/scripts/eval.py index 72966211..802a2eb6 100644 --- a/lerobot/scripts/eval.py +++ b/lerobot/scripts/eval.py @@ -255,7 +255,7 @@ def eval(cfg: dict, out_dir=None, stats_path=None): transform = make_dataset(cfg, stats_path=stats_path).transform logging.info("Making environment.") - env = make_env(cfg, num_parallel_envs=cfg.rollout_batch_size) + env = make_env(cfg, num_parallel_envs=cfg.eval_episodes) # when policy is None, rollout a random policy policy = make_policy(cfg) if cfg.policy.pretrained_model_path else None From d9019d9e7eae22b3b250ae445f35cae458c82464 Mon Sep 17 00:00:00 2001 From: Simon Alibert Date: Tue, 9 Apr 2024 10:24:28 +0200 Subject: [PATCH 17/25] disable env_checker in factory --- lerobot/common/envs/factory.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/lerobot/common/envs/factory.py b/lerobot/common/envs/factory.py index c8d10851..d5571935 100644 --- a/lerobot/common/envs/factory.py +++ b/lerobot/common/envs/factory.py @@ -30,10 +30,13 @@ def make_env(cfg, num_parallel_envs=0) -> gym.Env | gym.vector.SyncVectorEnv: if num_parallel_envs == 0: # non-batched version of the env that returns an observation of shape (c) - env = gym.make(gym_handle, **kwargs) + env = gym.make(gym_handle, disable_env_checker=True, **kwargs) else: # batched version of the env that returns an observation of shape (b, c) env = gym.vector.SyncVectorEnv( - [lambda: gym.make(gym_handle, **kwargs) for _ in range(num_parallel_envs)] + [ + lambda: gym.make(gym_handle, disable_env_checker=True, **kwargs) + for _ in range(num_parallel_envs) + ] ) return env From 274f20b49d018251e1414f9dab98c59ce5a2d23b Mon Sep 17 00:00:00 2001 From: Simon Alibert Date: Tue, 9 Apr 2024 10:25:41 +0200 Subject: [PATCH 18/25] Update gym-pusht --- poetry.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/poetry.lock b/poetry.lock index 95c9f31e..f712289e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -920,7 +920,7 @@ shapely = "^2.0.3" type = "git" url = "git@github.com:huggingface/gym-pusht.git" reference = "HEAD" -resolved_reference = "6c9893504f670ff069d0f759a733e971ea1efdbf" +resolved_reference = "824b22832cc8d71a4b4e96a57563510cf47e30c1" [[package]] name = "gym-xarm" From 2573e89e1df6136142e883ec23cd066e3a75c657 Mon Sep 17 00:00:00 2001 From: Simon Alibert Date: Tue, 9 Apr 2024 10:38:08 +0200 Subject: [PATCH 19/25] Remove direct dependencies --- .github/poetry/cpu/poetry.lock | 51 ++++++++++++++----------------- .github/poetry/cpu/pyproject.toml | 14 ++++----- poetry.lock | 36 +++++++++++----------- pyproject.toml | 7 +---- 4 files changed, 49 insertions(+), 59 deletions(-) diff --git a/.github/poetry/cpu/poetry.lock b/.github/poetry/cpu/poetry.lock index 15b27c76..edc1d503 100644 --- a/.github/poetry/cpu/poetry.lock +++ b/.github/poetry/cpu/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.1 and should not be changed by hand. 
+# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "absl-py" @@ -517,21 +517,11 @@ files = [ {file = "distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64"}, ] -[[package]] -name = "dm" -version = "1.3" -description = "Dict to Data mapper" -optional = false -python-versions = "*" -files = [ - {file = "dm-1.3.tar.gz", hash = "sha256:ce77537bf346b5d8c0dc0b5d679cfc4a946faadcd5315e6c80ef6f3af824130d"}, -] - [[package]] name = "dm-control" version = "1.0.14" description = "Continuous control environments and MuJoCo Python bindings." -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "dm_control-1.0.14-py3-none-any.whl", hash = "sha256:883c63244a7ebf598700a97564ed19fffd3479ca79efd090aed881609cdb9fc6"}, @@ -562,7 +552,7 @@ hdf5 = ["h5py"] name = "dm-env" version = "1.6" description = "A Python interface for Reinforcement Learning environments." -optional = false +optional = true python-versions = ">=3.7" files = [ {file = "dm-env-1.6.tar.gz", hash = "sha256:a436eb1c654c39e0c986a516cee218bea7140b510fceff63f97eb4fcff3d93de"}, @@ -578,7 +568,7 @@ numpy = "*" name = "dm-tree" version = "0.1.8" description = "Tree is a library for working with nested data structures." -optional = false +optional = true python-versions = "*" files = [ {file = "dm-tree-0.1.8.tar.gz", hash = "sha256:0fcaabbb14e7980377439e7140bd05552739ca5e515ecb3119f234acee4b9430"}, @@ -806,7 +796,7 @@ test = ["black", "coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre name = "glfw" version = "2.7.0" description = "A ctypes-based wrapper for GLFW3." -optional = false +optional = true python-versions = "*" files = [ {file = "glfw-2.7.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-macosx_10_6_intel.whl", hash = "sha256:bd82849edcceda4e262bd1227afaa74b94f9f0731c1197863cd25c15bfc613fc"}, @@ -986,7 +976,7 @@ toy-text = ["pygame (>=2.1.3)", "pygame (>=2.1.3)"] name = "gymnasium-robotics" version = "1.2.4" description = "Robotics environments for the Gymnasium repo." -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "gymnasium-robotics-1.2.4.tar.gz", hash = "sha256:d304192b066f8b800599dfbe3d9d90bba9b761ee884472bdc4d05968a8bc61cb"}, @@ -1218,7 +1208,7 @@ i18n = ["Babel (>=2.7)"] name = "labmaze" version = "1.0.6" description = "LabMaze: DeepMind Lab's text maze generator." -optional = false +optional = true python-versions = "*" files = [ {file = "labmaze-1.0.6-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:b2ddef976dfd8d992b19cfa6c633f2eba7576d759c2082da534e3f727479a84a"}, @@ -1262,7 +1252,7 @@ setuptools = "!=50.0.0" name = "lazy-loader" version = "0.3" description = "lazy_loader" -optional = false +optional = true python-versions = ">=3.7" files = [ {file = "lazy_loader-0.3-py3-none-any.whl", hash = "sha256:1e9e76ee8631e264c62ce10006718e80b2cfc74340d17d1031e0f84af7478554"}, @@ -1307,7 +1297,7 @@ files = [ name = "lxml" version = "5.1.0" description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." 
-optional = false +optional = true python-versions = ">=3.6" files = [ {file = "lxml-5.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:704f5572ff473a5f897745abebc6df40f22d4133c1e0a1f124e4f2bd3330ff7e"}, @@ -1525,7 +1515,7 @@ tests = ["pytest (>=4.6)"] name = "mujoco" version = "2.3.7" description = "MuJoCo Physics Simulator" -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "mujoco-2.3.7-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:e8714a5ff6a1561b364b7b4648d4c0c8d13e751874cf7401c309b9d23fa9598b"}, @@ -1839,7 +1829,7 @@ xml = ["lxml (>=4.9.2)"] name = "pettingzoo" version = "1.24.3" description = "Gymnasium for multi-agent reinforcement learning." -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "pettingzoo-1.24.3-py3-none-any.whl", hash = "sha256:23ed90517d2e8a7098bdaf5e31234b3a7f7b73ca578d70d1ca7b9d0cb0e37982"}, @@ -2207,7 +2197,7 @@ dev = ["aafigure", "matplotlib", "pygame", "pyglet (<2.0.0)", "sphinx", "wheel"] name = "pyopengl" version = "3.1.7" description = "Standard OpenGL bindings for Python" -optional = false +optional = true python-versions = "*" files = [ {file = "PyOpenGL-3.1.7-py3-none-any.whl", hash = "sha256:a6ab19cf290df6101aaf7470843a9c46207789855746399d0af92521a0a92b7a"}, @@ -2218,7 +2208,7 @@ files = [ name = "pyparsing" version = "3.1.2" description = "pyparsing module - Classes and methods to define and execute parsing grammars" -optional = false +optional = true python-versions = ">=3.6.8" files = [ {file = "pyparsing-3.1.2-py3-none-any.whl", hash = "sha256:f9db75911801ed778fe61bb643079ff86601aca99fcae6345aa67292038fb742"}, @@ -2649,7 +2639,7 @@ torch = ["safetensors[numpy]", "torch (>=1.10)"] name = "scikit-image" version = "0.22.0" description = "Image processing in Python" -optional = false +optional = true python-versions = ">=3.9" files = [ {file = "scikit_image-0.22.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:74ec5c1d4693506842cc7c9487c89d8fc32aed064e9363def7af08b8f8cbb31d"}, @@ -2697,7 +2687,7 @@ test = ["asv", "matplotlib (>=3.5)", "numpydoc (>=1.5)", "pooch (>=1.6.0)", "pyt name = "scipy" version = "1.12.0" description = "Fundamental algorithms for scientific computing in Python" -optional = false +optional = true python-versions = ">=3.9" files = [ {file = "scipy-1.12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:78e4402e140879387187f7f25d91cc592b3501a2e51dfb320f48dfb73565f10b"}, @@ -2902,7 +2892,7 @@ testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jar name = "shapely" version = "2.0.3" description = "Manipulation and analysis of geometric objects" -optional = false +optional = true python-versions = ">=3.7" files = [ {file = "shapely-2.0.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:af7e9abe180b189431b0f490638281b43b84a33a960620e6b2e8d3e3458b61a1"}, @@ -3069,7 +3059,7 @@ tests = ["pytest", "pytest-cov"] name = "tifffile" version = "2024.2.12" description = "Read and write TIFF files" -optional = false +optional = true python-versions = ">=3.9" files = [ {file = "tifffile-2024.2.12-py3-none-any.whl", hash = "sha256:870998f82fbc94ff7c3528884c1b0ae54863504ff51dbebea431ac3fa8fb7c21"}, @@ -3331,7 +3321,12 @@ files = [ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", 
"pytest-mypy", "pytest-ruff (>=0.2.1)"] +[extras] +aloha = ["gym-aloha"] +pusht = ["gym-pusht"] +xarm = ["gym-xarm"] + [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "32cd6caa01276a90b37cb177204e5b1511e92838f3f0268391034042d56f3bd6" +content-hash = "ba2b64b1a683450b097a1ccbed3410cc3bee97ba8b41f409c5e379f95d8b1c6f" diff --git a/.github/poetry/cpu/pyproject.toml b/.github/poetry/cpu/pyproject.toml index d310da47..741e3b37 100644 --- a/.github/poetry/cpu/pyproject.toml +++ b/.github/poetry/cpu/pyproject.toml @@ -23,7 +23,6 @@ packages = [{include = "lerobot"}] python = "^3.10" termcolor = "^2.4.0" omegaconf = "^2.3.0" -dm-env = "^1.6" pandas = "^2.2.1" wandb = "^0.16.3" moviepy = "^1.0.3" @@ -34,21 +33,15 @@ einops = "^0.7.0" pygame = "^2.5.2" pymunk = "^6.6.0" zarr = "^2.17.0" -shapely = "^2.0.3" -scikit-image = "^0.22.0" numba = "^0.59.0" mpmath = "^1.3.0" torch = {version = "^2.2.1", source = "torch-cpu"} -mujoco = "^2.3.7" opencv-python = "^4.9.0.80" diffusers = "^0.26.3" torchvision = {version = "^0.17.1", source = "torch-cpu"} h5py = "^3.10.0" -dm = "^1.3" -dm-control = "1.0.14" robomimic = "0.2.0" huggingface-hub = "^0.21.4" -gymnasium-robotics = "^1.2.4" gymnasium = "^0.29.1" cmake = "^3.29.0.1" gym-pusht = { git = "git@github.com:huggingface/gym-pusht.git", optional = true} @@ -58,6 +51,13 @@ gym-aloha = { git = "git@github.com:huggingface/gym-aloha.git", optional = true} # gym-xarm = { path = "../gym-xarm", develop = true, optional = true} # gym-aloha = { path = "../gym-aloha", develop = true, optional = true} + +[tool.poetry.extras] +pusht = ["gym-pusht"] +xarm = ["gym-xarm"] +aloha = ["gym-aloha"] + + [tool.poetry.group.dev.dependencies] pre-commit = "^3.6.2" debugpy = "^1.8.1" diff --git a/poetry.lock b/poetry.lock index f712289e..b5e97cb7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "absl-py" @@ -521,7 +521,7 @@ files = [ name = "dm-control" version = "1.0.14" description = "Continuous control environments and MuJoCo Python bindings." -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "dm_control-1.0.14-py3-none-any.whl", hash = "sha256:883c63244a7ebf598700a97564ed19fffd3479ca79efd090aed881609cdb9fc6"}, @@ -552,7 +552,7 @@ hdf5 = ["h5py"] name = "dm-env" version = "1.6" description = "A Python interface for Reinforcement Learning environments." -optional = false +optional = true python-versions = ">=3.7" files = [ {file = "dm-env-1.6.tar.gz", hash = "sha256:a436eb1c654c39e0c986a516cee218bea7140b510fceff63f97eb4fcff3d93de"}, @@ -568,7 +568,7 @@ numpy = "*" name = "dm-tree" version = "0.1.8" description = "Tree is a library for working with nested data structures." -optional = false +optional = true python-versions = "*" files = [ {file = "dm-tree-0.1.8.tar.gz", hash = "sha256:0fcaabbb14e7980377439e7140bd05552739ca5e515ecb3119f234acee4b9430"}, @@ -796,7 +796,7 @@ test = ["black", "coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre name = "glfw" version = "2.7.0" description = "A ctypes-based wrapper for GLFW3." 
-optional = false +optional = true python-versions = "*" files = [ {file = "glfw-2.7.0-py2.py27.py3.py30.py31.py32.py33.py34.py35.py36.py37.py38-none-macosx_10_6_intel.whl", hash = "sha256:bd82849edcceda4e262bd1227afaa74b94f9f0731c1197863cd25c15bfc613fc"}, @@ -976,7 +976,7 @@ toy-text = ["pygame (>=2.1.3)", "pygame (>=2.1.3)"] name = "gymnasium-robotics" version = "1.2.4" description = "Robotics environments for the Gymnasium repo." -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "gymnasium-robotics-1.2.4.tar.gz", hash = "sha256:d304192b066f8b800599dfbe3d9d90bba9b761ee884472bdc4d05968a8bc61cb"}, @@ -1281,7 +1281,7 @@ i18n = ["Babel (>=2.7)"] name = "labmaze" version = "1.0.6" description = "LabMaze: DeepMind Lab's text maze generator." -optional = false +optional = true python-versions = "*" files = [ {file = "labmaze-1.0.6-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:b2ddef976dfd8d992b19cfa6c633f2eba7576d759c2082da534e3f727479a84a"}, @@ -1325,7 +1325,7 @@ setuptools = "!=50.0.0" name = "lazy-loader" version = "0.3" description = "lazy_loader" -optional = false +optional = true python-versions = ">=3.7" files = [ {file = "lazy_loader-0.3-py3-none-any.whl", hash = "sha256:1e9e76ee8631e264c62ce10006718e80b2cfc74340d17d1031e0f84af7478554"}, @@ -1370,7 +1370,7 @@ files = [ name = "lxml" version = "5.1.0" description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." -optional = false +optional = true python-versions = ">=3.6" files = [ {file = "lxml-5.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:704f5572ff473a5f897745abebc6df40f22d4133c1e0a1f124e4f2bd3330ff7e"}, @@ -1588,7 +1588,7 @@ tests = ["pytest (>=4.6)"] name = "mujoco" version = "2.3.7" description = "MuJoCo Physics Simulator" -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "mujoco-2.3.7-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:e8714a5ff6a1561b364b7b4648d4c0c8d13e751874cf7401c309b9d23fa9598b"}, @@ -2043,7 +2043,7 @@ xml = ["lxml (>=4.9.2)"] name = "pettingzoo" version = "1.24.3" description = "Gymnasium for multi-agent reinforcement learning." 
-optional = false +optional = true python-versions = ">=3.8" files = [ {file = "pettingzoo-1.24.3-py3-none-any.whl", hash = "sha256:23ed90517d2e8a7098bdaf5e31234b3a7f7b73ca578d70d1ca7b9d0cb0e37982"}, @@ -2411,7 +2411,7 @@ dev = ["aafigure", "matplotlib", "pygame", "pyglet (<2.0.0)", "sphinx", "wheel"] name = "pyopengl" version = "3.1.7" description = "Standard OpenGL bindings for Python" -optional = false +optional = true python-versions = "*" files = [ {file = "PyOpenGL-3.1.7-py3-none-any.whl", hash = "sha256:a6ab19cf290df6101aaf7470843a9c46207789855746399d0af92521a0a92b7a"}, @@ -2422,7 +2422,7 @@ files = [ name = "pyparsing" version = "3.1.2" description = "pyparsing module - Classes and methods to define and execute parsing grammars" -optional = false +optional = true python-versions = ">=3.6.8" files = [ {file = "pyparsing-3.1.2-py3-none-any.whl", hash = "sha256:f9db75911801ed778fe61bb643079ff86601aca99fcae6345aa67292038fb742"}, @@ -2853,7 +2853,7 @@ torch = ["safetensors[numpy]", "torch (>=1.10)"] name = "scikit-image" version = "0.22.0" description = "Image processing in Python" -optional = false +optional = true python-versions = ">=3.9" files = [ {file = "scikit_image-0.22.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:74ec5c1d4693506842cc7c9487c89d8fc32aed064e9363def7af08b8f8cbb31d"}, @@ -2901,7 +2901,7 @@ test = ["asv", "matplotlib (>=3.5)", "numpydoc (>=1.5)", "pooch (>=1.6.0)", "pyt name = "scipy" version = "1.12.0" description = "Fundamental algorithms for scientific computing in Python" -optional = false +optional = true python-versions = ">=3.9" files = [ {file = "scipy-1.12.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:78e4402e140879387187f7f25d91cc592b3501a2e51dfb320f48dfb73565f10b"}, @@ -3106,7 +3106,7 @@ testing-integration = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "jar name = "shapely" version = "2.0.3" description = "Manipulation and analysis of geometric objects" -optional = false +optional = true python-versions = ">=3.7" files = [ {file = "shapely-2.0.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:af7e9abe180b189431b0f490638281b43b84a33a960620e6b2e8d3e3458b61a1"}, @@ -3273,7 +3273,7 @@ tests = ["pytest", "pytest-cov"] name = "tifffile" version = "2024.2.12" description = "Read and write TIFF files" -optional = false +optional = true python-versions = ">=3.9" files = [ {file = "tifffile-2024.2.12-py3-none-any.whl", hash = "sha256:870998f82fbc94ff7c3528884c1b0ae54863504ff51dbebea431ac3fa8fb7c21"}, @@ -3598,4 +3598,4 @@ xarm = ["gym-xarm"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "bf4627c62a45764931729ce373f1038fe289b6caebb01e66d878f6f278c54518" +content-hash = "d444fab7fed5e3c5c9cde69c8f19a286126615ab4a9de11c23730b5286cac77b" diff --git a/pyproject.toml b/pyproject.toml index a549e66f..75342c80 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,6 @@ packages = [{include = "lerobot"}] python = "^3.10" termcolor = "^2.4.0" omegaconf = "^2.3.0" -dm-env = "^1.6" pandas = "^2.2.1" wandb = "^0.16.3" moviepy = "^1.0.3" @@ -34,20 +33,15 @@ einops = "^0.7.0" pygame = "^2.5.2" pymunk = "^6.6.0" zarr = "^2.17.0" -shapely = "^2.0.3" -scikit-image = "^0.22.0" numba = "^0.59.0" mpmath = "^1.3.0" torch = "^2.2.1" -mujoco = "^2.3.7" opencv-python = "^4.9.0.80" diffusers = "^0.26.3" torchvision = "^0.17.1" h5py = "^3.10.0" -dm-control = "1.0.14" huggingface-hub = {extras = ["hf-transfer"], version = "^0.21.4"} robomimic = "0.2.0" -gymnasium-robotics = "^1.2.4" gymnasium = "^0.29.1" cmake = "^3.29.0.1" 
gym-pusht = { git = "git@github.com:huggingface/gym-pusht.git", optional = true} @@ -62,6 +56,7 @@ pusht = ["gym-pusht"] xarm = ["gym-xarm"] aloha = ["gym-aloha"] + [tool.poetry.group.dev.dependencies] pre-commit = "^3.6.2" debugpy = "^1.8.1" From dfaacbcf5a7bf4d75a39d6ad8bac8a75291e8cb5 Mon Sep 17 00:00:00 2001 From: Simon Alibert Date: Tue, 9 Apr 2024 10:40:11 +0200 Subject: [PATCH 20/25] Split dev/test dependencies --- .github/poetry/cpu/poetry.lock | 2 +- .github/poetry/cpu/pyproject.toml | 7 +++++++ poetry.lock | 2 +- pyproject.toml | 7 +++++++ 4 files changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/poetry/cpu/poetry.lock b/.github/poetry/cpu/poetry.lock index edc1d503..fe4ed7a0 100644 --- a/.github/poetry/cpu/poetry.lock +++ b/.github/poetry/cpu/poetry.lock @@ -3329,4 +3329,4 @@ xarm = ["gym-xarm"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "ba2b64b1a683450b097a1ccbed3410cc3bee97ba8b41f409c5e379f95d8b1c6f" +content-hash = "8fa6dfc30e605741c24f5de58b89125d5b02153f550e5af7a44356956d6bb167" diff --git a/.github/poetry/cpu/pyproject.toml b/.github/poetry/cpu/pyproject.toml index 741e3b37..f5c439dc 100644 --- a/.github/poetry/cpu/pyproject.toml +++ b/.github/poetry/cpu/pyproject.toml @@ -58,9 +58,16 @@ xarm = ["gym-xarm"] aloha = ["gym-aloha"] +[tool.poetry.group.dev] +optional = true + + [tool.poetry.group.dev.dependencies] pre-commit = "^3.6.2" debugpy = "^1.8.1" + + +[tool.poetry.group.test.dependencies] pytest = "^8.1.0" pytest-cov = "^5.0.0" diff --git a/poetry.lock b/poetry.lock index b5e97cb7..387366b8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3598,4 +3598,4 @@ xarm = ["gym-xarm"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "d444fab7fed5e3c5c9cde69c8f19a286126615ab4a9de11c23730b5286cac77b" +content-hash = "7ec0310f8dd0ffa4d92fa78e06513bce98c3657692b3753ff34aadd297a3766c" diff --git a/pyproject.toml b/pyproject.toml index 75342c80..a0fc7d44 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,9 +57,16 @@ xarm = ["gym-xarm"] aloha = ["gym-aloha"] +[tool.poetry.group.dev] +optional = true + + [tool.poetry.group.dev.dependencies] pre-commit = "^3.6.2" debugpy = "^1.8.1" + + +[tool.poetry.group.test.dependencies] pytest = "^8.1.0" pytest-cov = "^5.0.0" From d21543eb4fcced5bcf717dca62155d53a4c2bc87 Mon Sep 17 00:00:00 2001 From: Simon Alibert Date: Tue, 9 Apr 2024 10:41:20 +0200 Subject: [PATCH 21/25] Add env.close() --- tests/test_envs.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_envs.py b/tests/test_envs.py index 72bc93c4..c49461a0 100644 --- a/tests/test_envs.py +++ b/tests/test_envs.py @@ -34,7 +34,7 @@ def test_env(env_name, task, obs_type): importlib.import_module(package_name) env = gym.make(f"{package_name}/{task}", obs_type=obs_type) check_env(env.unwrapped) - + env.close() @pytest.mark.parametrize( "env_name", @@ -61,3 +61,5 @@ def test_factory(env_name): # TODO(rcadene): we assume for now that image normalization takes place in the model assert img.max() <= 1.0 assert img.min() >= 0.0 + + env.close() From dba037508997dae6fe3ba16a7886714ca1759bef Mon Sep 17 00:00:00 2001 From: Simon Alibert Date: Tue, 9 Apr 2024 10:45:58 +0200 Subject: [PATCH 22/25] Fix CI --- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c1b14780..34d76827 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -87,7 +87,7 @@ jobs: TMP: ~/tmp run: | mkdir 
~/tmp - poetry install --no-interaction --no-root + poetry install --no-interaction --no-root --all-extras - name: Save cached venv if: | @@ -106,7 +106,7 @@ jobs: # install project #---------------------------------------------- - name: Install project - run: poetry install --no-interaction + run: poetry install --no-interaction --all-extras #---------------------------------------------- # run tests & coverage From d44950e020c7e658fa6da19063c40379801080e8 Mon Sep 17 00:00:00 2001 From: Simon Alibert Date: Tue, 9 Apr 2024 11:44:55 +0200 Subject: [PATCH 23/25] Add ssh key --- .github/workflows/test.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 34d76827..afdcc41f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -34,6 +34,11 @@ jobs: with: python-version: '3.10' + - name: Add SSH key for installing envs + uses: webfactory/ssh-agent@v0.9.0 + with: + ssh-private-key: ${{ secrets.SSH_PRIVATE_KEY }} + #---------------------------------------------- # install & configure poetry #---------------------------------------------- From 7f4ff0b170091288bae65281df093768de562f13 Mon Sep 17 00:00:00 2001 From: Simon Alibert Date: Tue, 9 Apr 2024 11:58:59 +0200 Subject: [PATCH 24/25] CI fix attempt --- tests/test_envs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_envs.py b/tests/test_envs.py index c49461a0..d25231b0 100644 --- a/tests/test_envs.py +++ b/tests/test_envs.py @@ -33,7 +33,7 @@ def test_env(env_name, task, obs_type): package_name = f"gym_{env_name}" importlib.import_module(package_name) env = gym.make(f"{package_name}/{task}", obs_type=obs_type) - check_env(env.unwrapped) + check_env(env.unwrapped, skip_render_check=True) env.close() @pytest.mark.parametrize( From 91ff69d64c9d91f072b2c5fd33999b9056e2e466 Mon Sep 17 00:00:00 2001 From: Simon Alibert Date: Tue, 9 Apr 2024 17:08:36 +0200 Subject: [PATCH 25/25] Update gym_xarm --- poetry.lock | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/poetry.lock b/poetry.lock index f0e77c33..faeb70f1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -941,7 +941,7 @@ mujoco = "^2.3.7" type = "git" url = "git@github.com:huggingface/gym-xarm.git" reference = "HEAD" -resolved_reference = "08ddd5a9400783a6898bbf3c3014fc5da3961b9d" +resolved_reference = "ce294c0d30def08414d9237e2bf9f373d448ca07" [[package]] name = "gymnasium" @@ -1329,16 +1329,12 @@ description = "lazy_loader" optional = true python-versions = ">=3.7" files = [ - {file = "lazy_loader-0.4-py3-none-any.whl", hash = "sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc"}, - {file = "lazy_loader-0.4.tar.gz", hash = "sha256:47c75182589b91a4e1a85a136c074285a5ad4d9f39c63e0d7fb76391c4574cd1"}, + {file = "lazy_loader-0.3-py3-none-any.whl", hash = "sha256:1e9e76ee8631e264c62ce10006718e80b2cfc74340d17d1031e0f84af7478554"}, + {file = "lazy_loader-0.3.tar.gz", hash = "sha256:3b68898e34f5b2a29daaaac172c6555512d0f32074f147e2254e4a6d9d838f37"}, ] -[package.dependencies] -packaging = "*" - [package.extras] -dev = ["changelist (==0.5)"] -lint = ["pre-commit (==3.7.0)"] +lint = ["pre-commit (>=3.3)"] test = ["pytest (>=7.4)", "pytest-cov (>=4.1)"] [[package]]
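The environment-related patches above make three coordinated changes that are easiest to see side by side: the factory passes `disable_env_checker=True` to `gym.make`, the tests invoke gymnasium's checker explicitly with `skip_render_check=True`, and each test now calls `env.close()`. A minimal usage sketch, assuming the `aloha` extra (gym_aloha) is installed, looks like this:

from importlib import import_module

import gymnasium as gym
from gymnasium.utils.env_checker import check_env

import_module("gym_aloha")  # importing the package registers the gym_aloha/* environments

# Skip gymnasium's automatic checker at construction time, as the factory now does...
env = gym.make("gym_aloha/AlohaInsertion-v0", disable_env_checker=True)
# ...and run the check explicitly, skipping the render check as in tests/test_envs.py.
check_env(env.unwrapped, skip_render_check=True)

obs, info = env.reset(seed=0)
obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
env.close()  # release simulator resources, as the updated tests do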