diff --git a/lerobot/common/policies/dexvla/policy_heads/modeling_scaledp.py b/lerobot/common/policies/dexvla/policy_heads/modeling_scaledp.py
index ed853106..ba5766fa 100644
--- a/lerobot/common/policies/dexvla/policy_heads/modeling_scaledp.py
+++ b/lerobot/common/policies/dexvla/policy_heads/modeling_scaledp.py
@@ -5,7 +5,6 @@ import math
 from typing import Tuple
 
 import numpy as np
-
 import torch
 import torch.nn as nn
 import torch.nn.functional as func
@@ -158,8 +157,10 @@ class ScaleDPBlock(nn.Module):
         self.attn = Attention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
         self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
         mlp_hidden_dim = int(hidden_size * mlp_ratio)
+
         def approx_gelu():
             return nn.GELU(approximate="tanh")
+
         self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)
         self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True))
 
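
Note on the second hunk: `approx_gelu` is kept as a zero-argument factory rather than a module instance because DiT-style `Mlp` layers instantiate their activation via `act_layer()`, so the `approximate="tanh"` argument has to be bound inside the factory. A minimal sketch of the pattern, assuming the timm `Mlp` (the `timm.layers.Mlp` import path and the toy sizes are assumptions, not taken from this file):

    import torch
    import torch.nn as nn
    from timm.layers import Mlp  # assumed import path for this sketch

    def approx_gelu():
        # Zero-arg factory: Mlp calls act_layer() with no arguments,
        # so the tanh approximation must be bound here.
        return nn.GELU(approximate="tanh")

    mlp = Mlp(in_features=64, hidden_features=256, act_layer=approx_gelu, drop=0)
    out = mlp(torch.randn(2, 64))  # -> shape (2, 64)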