2025-03-19 00:28:09 +08:00
|
|
|
#!/usr/bin/env python
|
|
|
|
|
|
|
|
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
|
|
|
#
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
import logging
|
|
|
|
import time
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
import numpy as np
|
|
|
|
import tensorflow_datasets as tfds
|
|
|
|
|
|
|
|
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
|
|
|
|
from lerobot.common.utils.utils import get_elapsed_time_in_days_hours_minutes_seconds
|
|
|
|
|
|
|
|
DROID_SHARDS = 2048
|
|
|
|
DROID_FPS = 15
|
|
|
|
DROID_ROBOT_TYPE = "Franka"
|
|
|
|
|
|
|
|
# Dataset schema slightly adapted from: https://droid-dataset.github.io/droid/the-droid-dataset.html#-dataset-schema
|
|
|
|
DROID_FEATURES = {
|
|
|
|
# true on first step of the episode
|
|
|
|
"is_first": {
|
|
|
|
"dtype": "bool",
|
|
|
|
"shape": (1,),
|
|
|
|
"names": None,
|
|
|
|
},
|
|
|
|
# true on last step of the episode
|
|
|
|
"is_last": {
|
|
|
|
"dtype": "bool",
|
|
|
|
"shape": (1,),
|
|
|
|
"names": None,
|
|
|
|
},
|
|
|
|
# true on last step of the episode if it is a terminal step, True for demos
|
|
|
|
"is_terminal": {
|
|
|
|
"dtype": "bool",
|
|
|
|
"shape": (1,),
|
|
|
|
"names": None,
|
|
|
|
},
|
|
|
|
# language_instruction is also stored as "task" to follow LeRobot standard
|
|
|
|
"language_instruction": {
|
|
|
|
"dtype": "string",
|
|
|
|
"shape": (1,),
|
|
|
|
"names": None,
|
|
|
|
},
|
|
|
|
"language_instruction_2": {
|
|
|
|
"dtype": "string",
|
|
|
|
"shape": (1,),
|
|
|
|
"names": None,
|
|
|
|
},
|
|
|
|
"language_instruction_3": {
|
|
|
|
"dtype": "string",
|
|
|
|
"shape": (1,),
|
|
|
|
"names": None,
|
|
|
|
},
|
|
|
|
"observation.state.gripper_position": {
|
|
|
|
"dtype": "float32",
|
|
|
|
"shape": (1,),
|
|
|
|
"names": {
|
|
|
|
"axes": ["gripper"],
|
|
|
|
},
|
|
|
|
},
|
|
|
|
"observation.state.cartesian_position": {
|
|
|
|
"dtype": "float32",
|
|
|
|
"shape": (6,),
|
|
|
|
"names": {
|
|
|
|
"axes": ["x", "y", "z", "roll", "pitch", "yaw"],
|
|
|
|
},
|
|
|
|
},
|
|
|
|
"observation.state.joint_position": {
|
|
|
|
"dtype": "float32",
|
|
|
|
"shape": (7,),
|
|
|
|
"names": {
|
|
|
|
"axes": ["x", "y", "z", "roll", "pitch", "yaw"],
|
|
|
|
},
|
|
|
|
},
|
|
|
|
# Add this new feature to follow LeRobot standard of using joint position + gripper
|
|
|
|
"observation.state": {
|
|
|
|
"dtype": "float32",
|
|
|
|
"shape": (8,),
|
|
|
|
"names": {
|
|
|
|
"axes": ["joint_0", "joint_1", "joint_2", "joint_3", "joint_4", "joint_5", "joint_6", "gripper"],
|
|
|
|
},
|
|
|
|
},
|
|
|
|
# Initially called wrist_image_left
|
|
|
|
"observation.images.wrist_left": {
|
|
|
|
"dtype": "video",
|
|
|
|
"shape": (180, 320, 3),
|
|
|
|
"names": [
|
|
|
|
"height",
|
|
|
|
"width",
|
|
|
|
"channels",
|
|
|
|
],
|
|
|
|
},
|
|
|
|
# Initially called exterior_image_1_left
|
|
|
|
"observation.images.exterior_1_left": {
|
|
|
|
"dtype": "video",
|
|
|
|
"shape": (180, 320, 3),
|
|
|
|
"names": [
|
|
|
|
"height",
|
|
|
|
"width",
|
|
|
|
"channels",
|
|
|
|
],
|
|
|
|
},
|
|
|
|
# Initially called exterior_image_2_left
|
|
|
|
"observation.images.exterior_2_left": {
|
|
|
|
"dtype": "video",
|
|
|
|
"shape": (180, 320, 3),
|
|
|
|
"names": [
|
|
|
|
"height",
|
|
|
|
"width",
|
|
|
|
"channels",
|
|
|
|
],
|
|
|
|
},
|
|
|
|
"action.gripper_position": {
|
|
|
|
"dtype": "float32",
|
|
|
|
"shape": (1,),
|
|
|
|
"names": {
|
|
|
|
"axes": ["gripper"],
|
|
|
|
},
|
|
|
|
},
|
|
|
|
"action.gripper_velocity": {
|
|
|
|
"dtype": "float32",
|
|
|
|
"shape": (1,),
|
|
|
|
"names": {
|
|
|
|
"axes": ["gripper"],
|
|
|
|
},
|
|
|
|
},
|
|
|
|
"action.cartesian_position": {
|
|
|
|
"dtype": "float32",
|
|
|
|
"shape": (6,),
|
|
|
|
"names": {
|
|
|
|
"axes": ["x", "y", "z", "roll", "pitch", "yaw"],
|
|
|
|
},
|
|
|
|
},
|
|
|
|
"action.cartesian_velocity": {
|
|
|
|
"dtype": "float32",
|
|
|
|
"shape": (6,),
|
|
|
|
"names": {
|
|
|
|
"axes": ["x", "y", "z", "roll", "pitch", "yaw"],
|
|
|
|
},
|
|
|
|
},
|
|
|
|
"action.joint_position": {
|
|
|
|
"dtype": "float32",
|
|
|
|
"shape": (7,),
|
|
|
|
"names": {
|
|
|
|
"axes": ["joint_0", "joint_1", "joint_2", "joint_3", "joint_4", "joint_5", "joint_6"],
|
|
|
|
},
|
|
|
|
},
|
|
|
|
"action.joint_velocity": {
|
|
|
|
"dtype": "float32",
|
|
|
|
"shape": (7,),
|
|
|
|
"names": {
|
|
|
|
"axes": ["joint_0", "joint_1", "joint_2", "joint_3", "joint_4", "joint_5", "joint_6"],
|
|
|
|
},
|
|
|
|
},
|
|
|
|
# This feature was called "action" in RLDS dataset and consists of [6x joint velocities, 1x gripper position]
|
|
|
|
"action.original": {
|
|
|
|
"dtype": "float32",
|
|
|
|
"shape": (7,),
|
|
|
|
"names": {
|
|
|
|
"axes": ["x", "y", "z", "roll", "pitch", "yaw", "gripper"],
|
|
|
|
},
|
|
|
|
},
|
|
|
|
# Add this new feature to follow LeRobot standard of using joint position + gripper
|
|
|
|
"action": {
|
|
|
|
"dtype": "float32",
|
|
|
|
"shape": (8,),
|
|
|
|
"names": {
|
|
|
|
"axes": ["joint_0", "joint_1", "joint_2", "joint_3", "joint_4", "joint_5", "joint_6", "gripper"],
|
|
|
|
},
|
|
|
|
},
|
|
|
|
"discount": {
|
|
|
|
"dtype": "float32",
|
|
|
|
"shape": (1,),
|
|
|
|
"names": None,
|
|
|
|
},
|
|
|
|
"reward": {
|
|
|
|
"dtype": "float32",
|
|
|
|
"shape": (1,),
|
|
|
|
"names": None,
|
|
|
|
},
|
|
|
|
# Meta data that are the same for all frames in the episode
|
|
|
|
"task_category": {
|
|
|
|
"dtype": "string",
|
|
|
|
"shape": (1,),
|
|
|
|
"names": None,
|
|
|
|
},
|
|
|
|
"building": {
|
|
|
|
"dtype": "string",
|
|
|
|
"shape": (1,),
|
|
|
|
"names": None,
|
|
|
|
},
|
|
|
|
"collector_id": {
|
|
|
|
"dtype": "string",
|
|
|
|
"shape": (1,),
|
|
|
|
"names": None,
|
|
|
|
},
|
|
|
|
"date": {
|
|
|
|
"dtype": "string",
|
|
|
|
"shape": (1,),
|
|
|
|
"names": None,
|
|
|
|
},
|
|
|
|
"camera_extrinsics.wrist_left": {
|
|
|
|
"dtype": "float32",
|
|
|
|
"shape": (6,),
|
|
|
|
"names": {
|
|
|
|
"axes": ["x", "y", "z", "roll", "pitch", "yaw"],
|
|
|
|
},
|
|
|
|
},
|
|
|
|
"camera_extrinsics.exterior_1_left": {
|
|
|
|
"dtype": "float32",
|
|
|
|
"shape": (6,),
|
|
|
|
"names": {
|
|
|
|
"axes": ["x", "y", "z", "roll", "pitch", "yaw"],
|
|
|
|
},
|
|
|
|
},
|
|
|
|
"camera_extrinsics.exterior_2_left": {
|
|
|
|
"dtype": "float32",
|
|
|
|
"shape": (6,),
|
|
|
|
"names": {
|
|
|
|
"axes": ["x", "y", "z", "roll", "pitch", "yaw"],
|
|
|
|
},
|
|
|
|
},
|
|
|
|
"is_episode_successful": {
|
|
|
|
"dtype": "bool",
|
|
|
|
"shape": (1,),
|
|
|
|
"names": None,
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
def is_episode_successful(tf_episode_metadata):
    """Tell whether an episode is a success, based on its recorded file path.

    Args:
        tf_episode_metadata: Mapping whose "file_path" entry exposes `.numpy()` returning
            the path as bytes.

    Returns:
        True when the decoded path contains the "/success/" directory component.
    """
    # Adapted from: https://github.com/droid-dataset/droid_policy_learning/blob/dd1020eb20d981f90b5ff07dc80d80d5c0cb108b/robomimic/utils/rlds_utils.py#L8
    file_path = tf_episode_metadata["file_path"].numpy().decode()
    return "/success/" in file_path
|
|
|
|
|
|
|
|
|
|
|
|
def generate_lerobot_frames(tf_episode):
    """Yield one LeRobot frame dict per step of a DROID RLDS episode.

    Args:
        tf_episode: Episode mapping with an "episode_metadata" entry and an iterable
            "steps" entry; leaf values expose `.numpy()`.

    Yields:
        A dict per step holding the per-step features, the derived "task",
        "observation.state" and "action" entries, and the episode-level metadata
        (identical for every frame of the episode). Any float64 numpy array is cast
        to float32 before the frame is yielded.
    """
    metadata = tf_episode["episode_metadata"]
    # Episode-level values, computed once and merged into every frame.
    frame_meta = {
        # Read the episode's own task category (previously this duplicated "building").
        "task_category": metadata["task_category"].numpy().decode(),
        "building": metadata["building"].numpy().decode(),
        "collector_id": metadata["collector_id"].numpy().decode(),
        "date": metadata["date"].numpy().decode(),
        "camera_extrinsics.wrist_left": metadata["extrinsics_wrist_cam"].numpy(),
        "camera_extrinsics.exterior_1_left": metadata["extrinsics_exterior_cam_1"].numpy(),
        "camera_extrinsics.exterior_2_left": metadata["extrinsics_exterior_cam_2"].numpy(),
        "is_episode_successful": np.array([is_episode_successful(metadata)]),
    }
    for step in tf_episode["steps"]:
        observation = step["observation"]
        action_dict = step["action_dict"]

        # Dataset schema slightly adapted from: https://droid-dataset.github.io/droid/the-droid-dataset.html#-dataset-schema
        frame = {
            # Scalars are wrapped in a 1-element array to match the (1,) feature shape.
            "is_first": np.array([step["is_first"].numpy()]),
            "is_last": np.array([step["is_last"].numpy()]),
            "is_terminal": np.array([step["is_terminal"].numpy()]),
            "language_instruction": step["language_instruction"].numpy().decode(),
            "language_instruction_2": step["language_instruction_2"].numpy().decode(),
            "language_instruction_3": step["language_instruction_3"].numpy().decode(),
            "observation.state.gripper_position": observation["gripper_position"].numpy(),
            "observation.state.cartesian_position": observation["cartesian_position"].numpy(),
            "observation.state.joint_position": observation["joint_position"].numpy(),
            "observation.images.wrist_left": observation["wrist_image_left"].numpy(),
            "observation.images.exterior_1_left": observation["exterior_image_1_left"].numpy(),
            "observation.images.exterior_2_left": observation["exterior_image_2_left"].numpy(),
            "action.gripper_position": action_dict["gripper_position"].numpy(),
            "action.gripper_velocity": action_dict["gripper_velocity"].numpy(),
            "action.cartesian_position": action_dict["cartesian_position"].numpy(),
            "action.cartesian_velocity": action_dict["cartesian_velocity"].numpy(),
            "action.joint_position": action_dict["joint_position"].numpy(),
            "action.joint_velocity": action_dict["joint_velocity"].numpy(),
            "discount": np.array([step["discount"].numpy()]),
            "reward": np.array([step["reward"].numpy()]),
            "action.original": step["action"].numpy(),
        }

        # language_instruction is also stored as "task" to follow LeRobot standard
        frame["task"] = frame["language_instruction"]

        # Add this new feature to follow LeRobot standard of using joint position + gripper
        frame["observation.state"] = np.concatenate(
            [frame["observation.state.joint_position"], frame["observation.state.gripper_position"]]
        )
        frame["action"] = np.concatenate([frame["action.joint_position"], frame["action.gripper_position"]])

        # Meta data that are the same for all frames in the episode
        frame.update(frame_meta)

        # Cast fp64 to fp32 (only existing keys are reassigned, so iterating items is safe)
        for key, value in frame.items():
            if isinstance(value, np.ndarray) and value.dtype == np.float64:
                frame[key] = value.astype(np.float32)

        yield frame
|
|
|
|
|
|
|
|
|
|
|
|
def port_droid(
|
|
|
|
raw_dir: Path,
|
2025-03-20 22:12:46 +08:00
|
|
|
repo_id: str,
|
2025-03-19 00:28:09 +08:00
|
|
|
push_to_hub: bool = False,
|
|
|
|
num_shards: int | None = None,
|
|
|
|
shard_index: int | None = None,
|
|
|
|
):
|
|
|
|
dataset_name = raw_dir.parent.name
|
|
|
|
version = raw_dir.name
|
|
|
|
data_dir = raw_dir.parent.parent
|
|
|
|
|
|
|
|
builder = tfds.builder(f"{dataset_name}/{version}", data_dir=data_dir, version="")
|
|
|
|
|
|
|
|
if num_shards is not None:
|
|
|
|
tfds_num_shards = builder.info.splits["train"].num_shards
|
|
|
|
if tfds_num_shards != DROID_SHARDS:
|
|
|
|
raise ValueError(
|
|
|
|
f"Number of shards of Droid dataset is expected to be {DROID_SHARDS} but is {tfds_num_shards}."
|
|
|
|
)
|
|
|
|
if num_shards != tfds_num_shards:
|
|
|
|
raise ValueError(
|
|
|
|
f"We only shard over the fixed number of shards provided by tensorflow dataset ({tfds_num_shards}), but {num_shards} shards provided instead."
|
|
|
|
)
|
|
|
|
if shard_index >= tfds_num_shards:
|
|
|
|
raise ValueError(
|
|
|
|
f"Shard index is greater than the num of shards ({shard_index} >= {num_shards})."
|
|
|
|
)
|
|
|
|
|
|
|
|
raw_dataset = builder.as_dataset(split=f"train[{shard_index}shard]")
|
|
|
|
else:
|
|
|
|
raw_dataset = builder.as_dataset(split="train")
|
|
|
|
|
|
|
|
lerobot_dataset = LeRobotDataset.create(
|
|
|
|
repo_id=repo_id,
|
|
|
|
robot_type=DROID_ROBOT_TYPE,
|
|
|
|
fps=DROID_FPS,
|
|
|
|
features=DROID_FEATURES,
|
|
|
|
)
|
|
|
|
|
|
|
|
start_time = time.time()
|
|
|
|
num_episodes = raw_dataset.cardinality().numpy().item()
|
|
|
|
logging.info(f"Number of episodes {num_episodes}")
|
|
|
|
|
|
|
|
for episode_index, episode in enumerate(raw_dataset):
|
|
|
|
elapsed_time = time.time() - start_time
|
|
|
|
d, h, m, s = get_elapsed_time_in_days_hours_minutes_seconds(elapsed_time)
|
2025-03-20 22:12:46 +08:00
|
|
|
|
|
|
|
logging.info(
|
|
|
|
f"{episode_index} / {num_episodes} episodes processed (after {d} days, {h} hours, {m} minutes, {s:.3f} seconds)"
|
|
|
|
)
|
2025-03-19 00:28:09 +08:00
|
|
|
|
|
|
|
for frame in generate_lerobot_frames(episode):
|
|
|
|
lerobot_dataset.add_frame(frame)
|
|
|
|
|
|
|
|
lerobot_dataset.save_episode()
|
|
|
|
logging.info("Save_episode")
|
|
|
|
|
|
|
|
if push_to_hub:
|
|
|
|
lerobot_dataset.push_to_hub(
|
|
|
|
# Add openx tag, since it belongs to the openx collection of datasets
|
|
|
|
tags=["openx"],
|
|
|
|
private=False,
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Parse the command-line arguments and run `port_droid` with them.

    Exits with an argparse usage error when `--push-to-hub` is given without `--repo-id`,
    as documented by the `--repo-id` help text.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--raw-dir",
        type=Path,
        required=True,
        help="Directory containing input raw datasets (e.g. `path/to/dataset` or `path/to/dataset/version).",
    )
    parser.add_argument(
        "--repo-id",
        type=str,
        help="Repositery identifier on Hugging Face: a community or a user name `/` the name of the dataset, required when push-to-hub is True",
    )
    parser.add_argument(
        "--push-to-hub",
        action="store_true",
        help="Upload to hub.",
    )
    parser.add_argument(
        "--num-shards",
        type=int,
        default=None,
        help="Number of shards. Can be either None to load the full dataset, or 2048 to load one of the 2048 tensorflow dataset files.",
    )
    parser.add_argument(
        "--shard-index",
        type=int,
        default=None,
        help="Index of the shard. Can be either None to load the full dataset, or in [0,2047] to load one of the 2048 tensorflow dataset files.",
    )

    args = parser.parse_args()

    # Reject the invalid combination up front instead of discovering it only at upload
    # time, after the whole conversion has run.
    if args.push_to_hub and not args.repo_id:
        parser.error("--repo-id is required when --push-to-hub is set")

    # Progress is reported through `logging.info`; request INFO level so it is emitted.
    # This call does nothing if the root logger already has a handler installed.
    logging.basicConfig(level=logging.INFO)

    # Argument names map one-to-one onto the parameters of `port_droid`.
    port_droid(**vars(args))


if __name__ == "__main__":
    main()
|