diff --git a/lerobot/common/datasets/lerobot_dataset.py b/lerobot/common/datasets/lerobot_dataset.py
index 4e100d1f..cda0412f 100644
--- a/lerobot/common/datasets/lerobot_dataset.py
+++ b/lerobot/common/datasets/lerobot_dataset.py
@@ -13,6 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import json
 import logging
 import os
 from pathlib import Path
@@ -27,6 +28,7 @@ from lerobot.common.datasets.compute_stats import aggregate_stats
 from lerobot.common.datasets.utils import (
     check_delta_timestamps,
     check_timestamps_sync,
+    create_dataset_info,
     get_delta_indices,
     get_episode_data_index,
     get_hub_safe_version,
@@ -34,17 +36,12 @@ from lerobot.common.datasets.utils import (
     load_metadata,
 )
 from lerobot.common.datasets.video_utils import VideoFrame, decode_video_frames_torchvision
+from lerobot.common.robot_devices.robots.utils import Robot
 
 # For maintainers, see lerobot/common/datasets/push_dataset_to_hub/CODEBASE_VERSION.md
 CODEBASE_VERSION = "v2.0"
 LEROBOT_HOME = Path(os.getenv("LEROBOT_HOME", "~/.cache/huggingface/lerobot")).expanduser()
 
-DEFAULT_CHUNK_SIZE = 1000
-DEFAULT_VIDEO_PATH = "videos/chunk-{episode_chunk:03d}/{video_key}/episode_{episode_index:06d}.mp4"
-DEFAULT_PARQUET_PATH = (
-    "data/chunk-{episode_chunk:03d}/train-{episode_index:05d}-of-{total_episodes:05d}.parquet"
-)
-
 
 class LeRobotDataset(torch.utils.data.Dataset):
     def __init__(
@@ -400,6 +397,10 @@ class LeRobotDataset(torch.utils.data.Dataset):
 
         return item
 
+    def write_info(self) -> None:
+        with open(self.root / "meta/info.json", "w") as f:
+            json.dump(self.info, f, indent=4, ensure_ascii=False)
+
     def __repr__(self):
         return (
             f"{self.__class__.__name__}(\n"
@@ -419,17 +420,23 @@ class LeRobotDataset(torch.utils.data.Dataset):
     def create(
         cls,
         repo_id: str,
+        fps: int,
+        robot: Robot,
         root: Path | None = None,
-        image_transforms: Callable | None = None,
-        delta_timestamps: dict[list[float]] | None = None,
         tolerance_s: float = 1e-4,
-        video_backend: str | None = None,
     ) -> "LeRobotDataset":
         """Create a LeRobot Dataset from scratch in order to record data."""
-        # create an empty object of type LeRobotDataset
         obj = cls.__new__(cls)
         obj.repo_id = repo_id
         obj.root = root if root is not None else LEROBOT_HOME / repo_id
+        obj._version = CODEBASE_VERSION
+
+        # Create meta/ (and any missing parents) so write_info() can write into it.
+        (obj.root / "meta").mkdir(exist_ok=True, parents=True)
+        obj.info = create_dataset_info(obj._version, fps, robot)
+        obj.write_info()
+        obj.fps = fps
+
         # obj.episodes = None
         # obj.image_transforms = None
         # obj.delta_timestamps = None
diff --git a/lerobot/common/datasets/utils.py b/lerobot/common/datasets/utils.py
index bebc3c6f..c80838e6 100644
--- a/lerobot/common/datasets/utils.py
+++ b/lerobot/common/datasets/utils.py
@@ -28,6 +28,13 @@ from huggingface_hub import DatasetCard, HfApi
 from PIL import Image as PILImage
 from torchvision import transforms
+from lerobot.common.robot_devices.robots.utils import Robot
+
+DEFAULT_CHUNK_SIZE = 1000  # Max number of episodes per chunk
+DEFAULT_VIDEO_PATH = "videos/chunk-{episode_chunk:03d}/{video_key}/episode_{episode_index:06d}.mp4"
+DEFAULT_PARQUET_PATH = (
+    "data/chunk-{episode_chunk:03d}/train-{episode_index:05d}-of-{total_episodes:05d}.parquet"
+)
 
 DATASET_CARD_TEMPLATE = """
 ---
 # Metadata will go there
@@ -145,7 +152,7 @@ def load_hf_dataset(
 
 def load_metadata(local_dir: Path) -> tuple[dict | list]:
     """Loads metadata files from a dataset."""
     info_path = local_dir / "meta/info.json"
     episodes_path = local_dir / "meta/episodes.jsonl"
     stats_path = local_dir / "meta/stats.json"
-    tasks_path = local_dir / "meta/tasks.json"
+    tasks_path = local_dir / "meta/tasks.jsonl"
@@ -159,8 +166,8 @@ def load_metadata(local_dir: Path) -> tuple[dict | list]:
     with open(stats_path) as f:
         stats = json.load(f)
 
-    with open(tasks_path) as f:
-        tasks = json.load(f)
+    with jsonlines.open(tasks_path, "r") as reader:
+        tasks = list(reader)
 
     stats = {key: torch.tensor(value) for key, value in flatten_dict(stats).items()}
     stats = unflatten_dict(stats)
@@ -169,6 +176,28 @@
     return info, episode_dicts, stats, tasks
 
 
+def create_dataset_info(codebase_version: str, fps: int, robot: Robot) -> dict:
+    return {
+        "codebase_version": codebase_version,
+        "data_path": DEFAULT_PARQUET_PATH,
+        "robot_type": robot.robot_type,
+        "total_episodes": 0,
+        "total_frames": 0,
+        "total_tasks": 0,
+        "total_videos": 0,
+        "total_chunks": 0,
+        "chunks_size": DEFAULT_CHUNK_SIZE,
+        "fps": fps,
+        "splits": {},
+        # "keys": keys,
+        # "video_keys": video_keys,
+        # "image_keys": image_keys,
+        # "shapes": {**sequence_shapes, **video_shapes, **image_shapes},
+        # "names": names,
+        # "videos": {"videos_path": DEFAULT_VIDEO_PATH} if video_keys else None,
+    }
+
+
 def get_episode_data_index(episodes: list, episode_dicts: list[dict]) -> dict[str, torch.Tensor]:
     episode_lengths = {ep_idx: ep_dict["length"] for ep_idx, ep_dict in enumerate(episode_dicts)}
     if episodes is not None:
diff --git a/lerobot/common/datasets/v2/convert_dataset_v1_to_v2.py b/lerobot/common/datasets/v2/convert_dataset_v1_to_v2.py
index a498f9c1..4342ad6c 100644
--- a/lerobot/common/datasets/v2/convert_dataset_v1_to_v2.py
+++ b/lerobot/common/datasets/v2/convert_dataset_v1_to_v2.py
@@ -120,12 +120,15 @@ from huggingface_hub.errors import EntryNotFoundError
 from PIL import Image
 from safetensors.torch import load_file
 
-from lerobot.common.datasets.lerobot_dataset import (
+from lerobot.common.datasets.utils import (
     DEFAULT_CHUNK_SIZE,
     DEFAULT_PARQUET_PATH,
     DEFAULT_VIDEO_PATH,
+    create_branch,
+    flatten_dict,
+    get_hub_safe_version,
+    unflatten_dict,
 )
-from lerobot.common.datasets.utils import create_branch, flatten_dict, get_hub_safe_version, unflatten_dict
 from lerobot.common.utils.utils import init_hydra_config
 from lerobot.scripts.push_dataset_to_hub import push_dataset_card_to_hub
 
@@ -607,8 +610,8 @@ def convert_dataset(
         raise ValueError
     assert set(tasks) == {task for ep_tasks in tasks_by_episodes.values() for task in ep_tasks}
 
-    task_json = [{"task_index": task_idx, "task": task} for task_idx, task in enumerate(tasks)]
-    write_json(task_json, v20_dir / "meta" / "tasks.json")
+    tasks = [{"task_index": task_idx, "task": task} for task_idx, task in enumerate(tasks)]
+    write_jsonlines(tasks, v20_dir / "meta" / "tasks.jsonl")
 
     # Shapes
     sequence_shapes = {key: dataset.features[key].length for key in keys["sequence"]}
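For reference, a minimal sketch (not part of the patch) of how the reworked LeRobotDataset.create entry point would be exercised. StubRobot is a hypothetical stand-in for a configured Robot: create_dataset_info only reads robot.robot_type, so any object exposing that attribute is enough for a local smoke test.

    from dataclasses import dataclass

    from lerobot.common.datasets.lerobot_dataset import LeRobotDataset


    @dataclass
    class StubRobot:
        # Hypothetical stand-in for lerobot.common.robot_devices.robots.utils.Robot;
        # create_dataset_info() only consumes the robot_type attribute.
        robot_type: str = "koch"


    # Scaffolds <LEROBOT_HOME>/my_user/my_dataset/meta/info.json via write_info(),
    # with zeroed totals, "fps": 30, the robot_type, and the DEFAULT_* path
    # templates now defined in lerobot/common/datasets/utils.py.
    dataset = LeRobotDataset.create("my_user/my_dataset", fps=30, robot=StubRobot())

With this layout, task metadata is line-delimited: meta/tasks.jsonl holds one {"task_index": ..., "task": ...} object per line and is read back with jsonlines, mirroring the existing meta/episodes.jsonl.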