lerobot/examples/port_datasets/droid_rlds/port_droid.py

#!/usr/bin/env python

# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
import time
from pathlib import Path

import numpy as np
import tensorflow_datasets as tfds

from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
from lerobot.common.utils.utils import get_elapsed_time_in_days_hours_minutes_seconds

# Number of shards the TFDS "train" split must report; `port_droid` raises for any other count.
DROID_SHARDS: int = 2048
# Value handed to `LeRobotDataset.create` as `fps`.
DROID_FPS: int = 15
# Value handed to `LeRobotDataset.create` as `robot_type`.
DROID_ROBOT_TYPE: str = "Franka"

# Dataset schema slightly adapted from: https://droid-dataset.github.io/droid/the-droid-dataset.html#-dataset-schema
DROID_FEATURES = {
    # true on first step of the episode
    "is_first": {
        "dtype": "bool",
        "shape": (1,),
        "names": None,
    },
    # true on last step of the episode
    "is_last": {
        "dtype": "bool",
        "shape": (1,),
        "names": None,
    },
    # true on last step of the episode if it is a terminal step, True for demos
    "is_terminal": {
        "dtype": "bool",
        "shape": (1,),
        "names": None,
    },
    # language_instruction is also stored as "task" to follow LeRobot standard
    "language_instruction": {
        "dtype": "string",
        "shape": (1,),
        "names": None,
    },
    "language_instruction_2": {
        "dtype": "string",
        "shape": (1,),
        "names": None,
    },
    "language_instruction_3": {
        "dtype": "string",
        "shape": (1,),
        "names": None,
    },
    "observation.state.gripper_position": {
        "dtype": "float32",
        "shape": (1,),
        "names": {
            "axes": ["gripper"],
        },
    },
    "observation.state.cartesian_position": {
        "dtype": "float32",
        "shape": (6,),
        "names": {
            "axes": ["x", "y", "z", "roll", "pitch", "yaw"],
        },
    },
    "observation.state.joint_position": {
        "dtype": "float32",
        "shape": (7,),
        "names": {
            "axes": ["x", "y", "z", "roll", "pitch", "yaw"],
        },
    },
    # Add this new feature to follow LeRobot standard of using joint position + gripper
    "observation.state": {
        "dtype": "float32",
        "shape": (8,),
        "names": {
            "axes": ["joint_0", "joint_1", "joint_2", "joint_3", "joint_4", "joint_5", "joint_6", "gripper"],
        },
    },
    # Initially called wrist_image_left
    "observation.images.wrist_left": {
        "dtype": "video",
        "shape": (180, 320, 3),
        "names": [
            "height",
            "width",
            "channels",
        ],
    },
    # Initially called exterior_image_1_left
    "observation.images.exterior_1_left": {
        "dtype": "video",
        "shape": (180, 320, 3),
        "names": [
            "height",
            "width",
            "channels",
        ],
    },
    # Initially called exterior_image_2_left
    "observation.images.exterior_2_left": {
        "dtype": "video",
        "shape": (180, 320, 3),
        "names": [
            "height",
            "width",
            "channels",
        ],
    },
    "action.gripper_position": {
        "dtype": "float32",
        "shape": (1,),
        "names": {
            "axes": ["gripper"],
        },
    },
    "action.gripper_velocity": {
        "dtype": "float32",
        "shape": (1,),
        "names": {
            "axes": ["gripper"],
        },
    },
    "action.cartesian_position": {
        "dtype": "float32",
        "shape": (6,),
        "names": {
            "axes": ["x", "y", "z", "roll", "pitch", "yaw"],
        },
    },
    "action.cartesian_velocity": {
        "dtype": "float32",
        "shape": (6,),
        "names": {
            "axes": ["x", "y", "z", "roll", "pitch", "yaw"],
        },
    },
    "action.joint_position": {
        "dtype": "float32",
        "shape": (7,),
        "names": {
            "axes": ["joint_0", "joint_1", "joint_2", "joint_3", "joint_4", "joint_5", "joint_6"],
        },
    },
    "action.joint_velocity": {
        "dtype": "float32",
        "shape": (7,),
        "names": {
            "axes": ["joint_0", "joint_1", "joint_2", "joint_3", "joint_4", "joint_5", "joint_6"],
        },
    },
    # This feature was called "action" in RLDS dataset and consists of [6x joint velocities, 1x gripper position]
    "action.original": {
        "dtype": "float32",
        "shape": (7,),
        "names": {
            "axes": ["x", "y", "z", "roll", "pitch", "yaw", "gripper"],
        },
    },
    # Add this new feature to follow LeRobot standard of using joint position + gripper
    "action": {
        "dtype": "float32",
        "shape": (8,),
        "names": {
            "axes": ["joint_0", "joint_1", "joint_2", "joint_3", "joint_4", "joint_5", "joint_6", "gripper"],
        },
    },
    "discount": {
        "dtype": "float32",
        "shape": (1,),
        "names": None,
    },
    "reward": {
        "dtype": "float32",
        "shape": (1,),
        "names": None,
    },
    # Meta data that are the same for all frames in the episode
    "task_category": {
        "dtype": "string",
        "shape": (1,),
        "names": None,
    },
    "building": {
        "dtype": "string",
        "shape": (1,),
        "names": None,
    },
    "collector_id": {
        "dtype": "string",
        "shape": (1,),
        "names": None,
    },
    "date": {
        "dtype": "string",
        "shape": (1,),
        "names": None,
    },
    "camera_extrinsics.wrist_left": {
        "dtype": "float32",
        "shape": (6,),
        "names": {
            "axes": ["x", "y", "z", "roll", "pitch", "yaw"],
        },
    },
    "camera_extrinsics.exterior_1_left": {
        "dtype": "float32",
        "shape": (6,),
        "names": {
            "axes": ["x", "y", "z", "roll", "pitch", "yaw"],
        },
    },
    "camera_extrinsics.exterior_2_left": {
        "dtype": "float32",
        "shape": (6,),
        "names": {
            "axes": ["x", "y", "z", "roll", "pitch", "yaw"],
        },
    },
    "is_episode_successful": {
        "dtype": "bool",
        "shape": (1,),
        "names": None,
    },
}


def is_episode_successful(tf_episode_metadata):
    """Tell whether an episode is a success, judging by its recorded file path.

    The "file_path" entry of the metadata is decoded to text and the episode counts as
    successful when that path contains a "/success/" component.
    """
    # Adapted from: https://github.com/droid-dataset/droid_policy_learning/blob/dd1020eb20d981f90b5ff07dc80d80d5c0cb108b/robomimic/utils/rlds_utils.py#L8
    raw_path = tf_episode_metadata["file_path"].numpy()
    decoded_path = raw_path.decode()
    return "/success/" in decoded_path


def generate_lerobot_frames(tf_episode):
    """Yield one LeRobot frame dict for every step of a DROID RLDS episode.

    Args:
        tf_episode: One element of the TFDS dataset. It is indexed with "episode_metadata"
            and "steps", and every leaf value is read through `.numpy()`.

    Yields:
        dict: A frame holding the keys of `DROID_FEATURES` plus a "task" entry that repeats
        "language_instruction". Any float64 numpy array is converted to float32.
    """
    m = tf_episode["episode_metadata"]

    # The "task_category" feature used to be filled from the "building" field, duplicating the
    # "building" feature. Prefer a dedicated "task_category" metadata field when the episode
    # provides one, and fall back to the former value so builds without that field still port.
    # NOTE(review): confirm the metadata key name against the RLDS build being ported.
    task_category_key = "task_category" if "task_category" in m else "building"

    # Episode-level values, computed once and copied into every frame below
    frame_meta = {
        "task_category": m[task_category_key].numpy().decode(),
        "building": m["building"].numpy().decode(),
        "collector_id": m["collector_id"].numpy().decode(),
        "date": m["date"].numpy().decode(),
        "camera_extrinsics.wrist_left": m["extrinsics_wrist_cam"].numpy(),
        "camera_extrinsics.exterior_1_left": m["extrinsics_exterior_cam_1"].numpy(),
        "camera_extrinsics.exterior_2_left": m["extrinsics_exterior_cam_2"].numpy(),
        "is_episode_successful": np.array([is_episode_successful(m)]),
    }
    for f in tf_episode["steps"]:
        # Dataset schema slightly adapted from: https://droid-dataset.github.io/droid/the-droid-dataset.html#-dataset-schema
        # Scalar step values are wrapped in a one-element array to match the (1,) feature shapes.
        frame = {
            "is_first": np.array([f["is_first"].numpy()]),
            "is_last": np.array([f["is_last"].numpy()]),
            "is_terminal": np.array([f["is_terminal"].numpy()]),
            "language_instruction": f["language_instruction"].numpy().decode(),
            "language_instruction_2": f["language_instruction_2"].numpy().decode(),
            "language_instruction_3": f["language_instruction_3"].numpy().decode(),
            "observation.state.gripper_position": f["observation"]["gripper_position"].numpy(),
            "observation.state.cartesian_position": f["observation"]["cartesian_position"].numpy(),
            "observation.state.joint_position": f["observation"]["joint_position"].numpy(),
            "observation.images.wrist_left": f["observation"]["wrist_image_left"].numpy(),
            "observation.images.exterior_1_left": f["observation"]["exterior_image_1_left"].numpy(),
            "observation.images.exterior_2_left": f["observation"]["exterior_image_2_left"].numpy(),
            "action.gripper_position": f["action_dict"]["gripper_position"].numpy(),
            "action.gripper_velocity": f["action_dict"]["gripper_velocity"].numpy(),
            "action.cartesian_position": f["action_dict"]["cartesian_position"].numpy(),
            "action.cartesian_velocity": f["action_dict"]["cartesian_velocity"].numpy(),
            "action.joint_position": f["action_dict"]["joint_position"].numpy(),
            "action.joint_velocity": f["action_dict"]["joint_velocity"].numpy(),
            "discount": np.array([f["discount"].numpy()]),
            "reward": np.array([f["reward"].numpy()]),
            "action.original": f["action"].numpy(),
        }

        # language_instruction is also stored as "task" to follow LeRobot standard
        frame["task"] = frame["language_instruction"]

        # Add this new feature to follow LeRobot standard of using joint position + gripper
        frame["observation.state"] = np.concatenate(
            [frame["observation.state.joint_position"], frame["observation.state.gripper_position"]]
        )
        frame["action"] = np.concatenate([frame["action.joint_position"], frame["action.gripper_position"]])

        # Meta data that are the same for all frames in the episode
        frame.update(frame_meta)

        # Cast fp64 to fp32 (only existing keys are reassigned, so iterating the dict is safe)
        for key in frame:
            if isinstance(frame[key], np.ndarray) and frame[key].dtype == np.float64:
                frame[key] = frame[key].astype(np.float32)

        yield frame


def port_droid(
    raw_dir: Path,
    repo_id: str,
    push_to_hub: bool = False,
    num_shards: int | None = None,
    shard_index: int | None = None,
):
    dataset_name = raw_dir.parent.name
    version = raw_dir.name
    data_dir = raw_dir.parent.parent

    builder = tfds.builder(f"{dataset_name}/{version}", data_dir=data_dir, version="")

    if num_shards is not None:
        tfds_num_shards = builder.info.splits["train"].num_shards
        if tfds_num_shards != DROID_SHARDS:
            raise ValueError(
                f"Number of shards of Droid dataset is expected to be {DROID_SHARDS} but is {tfds_num_shards}."
            )
        if num_shards != tfds_num_shards:
            raise ValueError(
                f"We only shard over the fixed number of shards provided by tensorflow dataset ({tfds_num_shards}), but {num_shards} shards provided instead."
            )
        if shard_index >= tfds_num_shards:
            raise ValueError(
                f"Shard index is greater than the num of shards ({shard_index} >= {num_shards})."
            )

        raw_dataset = builder.as_dataset(split=f"train[{shard_index}shard]")
    else:
        raw_dataset = builder.as_dataset(split="train")

    lerobot_dataset = LeRobotDataset.create(
        repo_id=repo_id,
        robot_type=DROID_ROBOT_TYPE,
        fps=DROID_FPS,
        features=DROID_FEATURES,
    )

    start_time = time.time()
    num_episodes = raw_dataset.cardinality().numpy().item()
    logging.info(f"Number of episodes {num_episodes}")

    for episode_index, episode in enumerate(raw_dataset):
        elapsed_time = time.time() - start_time
        d, h, m, s = get_elapsed_time_in_days_hours_minutes_seconds(elapsed_time)

        logging.info(
            f"{episode_index} / {num_episodes} episodes processed (after {d} days, {h} hours, {m} minutes, {s:.3f} seconds)"
        )

        for frame in generate_lerobot_frames(episode):
            lerobot_dataset.add_frame(frame)

        lerobot_dataset.save_episode()
        logging.info("Save_episode")

    if push_to_hub:
        lerobot_dataset.push_to_hub(
            # Add openx tag, since it belongs to the openx collection of datasets
            tags=["openx"],
            private=False,
        )


def main():
    """Parse the command-line arguments and run `port_droid` with them.

    Exits with a usage error when `--push-to-hub` is given without `--repo-id`.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--raw-dir",
        type=Path,
        required=True,
        help="Directory containing input raw datasets (e.g. `path/to/dataset` or `path/to/dataset/version).",
    )
    parser.add_argument(
        "--repo-id",
        type=str,
        help="Repositery identifier on Hugging Face: a community or a user name `/` the name of the dataset, required when push-to-hub is True",
    )
    parser.add_argument(
        "--push-to-hub",
        action="store_true",
        help="Upload to hub.",
    )
    parser.add_argument(
        "--num-shards",
        type=int,
        default=None,
        help="Number of shards. Can be either None to load the full dataset, or 2048 to load one of the 2048 tensorflow dataset files.",
    )
    parser.add_argument(
        "--shard-index",
        type=int,
        default=None,
        help="Index of the shard. Can be either None to load the full dataset, or in [0,2047] to load one of the 2048 tensorflow dataset files.",
    )

    args = parser.parse_args()

    # The help text documents --repo-id as mandatory for uploads; enforce it up front with a
    # clear usage error rather than letting a missing value surface later.
    if args.push_to_hub and args.repo_id is None:
        parser.error("--repo-id is required when --push-to-hub is set.")

    # The argparse destinations (raw_dir, repo_id, push_to_hub, num_shards, shard_index)
    # match the parameter names of `port_droid`, so the namespace can be expanded directly.
    port_droid(**vars(args))


if __name__ == "__main__":
    main()