most unit tests passing (TODO: convert datasets)

Remi Cadene 2025-04-16 21:30:58 +02:00
parent c2a05a1fde
commit 6b6a990f4c
22 changed files with 150 additions and 136 deletions


@@ -178,6 +178,7 @@ Under the hood, the `LeRobotDataset` format makes use of several ways to seriali
 Here are the important details and internal structure organization of a typical `LeRobotDataset` instantiated with `dataset = LeRobotDataset("lerobot/aloha_static_coffee")`. The exact features will change from dataset to dataset but not the main aspects:
 ```
+TODO: IMPROVE
 dataset attributes:
 ├ hf_dataset: a Hugging Face dataset (backed by Arrow/parquet). Typical features example:
 │ ├ observation.images.cam_high (VideoFrame):
@@ -190,7 +191,7 @@ dataset attributes:
 │ ├ timestamp (float32): timestamp in the episode
 │ ├ next.done (bool): indicates the end of en episode ; True for the last frame in each episode
 │ └ index (int64): general index in the whole dataset
-episode_data_index: contains 2 tensors with the start and end indices of each episode
+meta: contains 2 tensors with the start and end indices of each episode
 │ ├ from (1D int64 tensor): first frame index for each episode — shape (num episodes,) starts with 0
 │ └ to: (1D int64 tensor): last frame index for each episode — shape (num episodes,)
 ├ stats: a dictionary of statistics (max, mean, min, std) for each feature in the dataset, for instance
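For reference, a minimal sketch (not part of the diff) of how the episode boundaries described above are accessed once a dataset uses the new `meta.episodes` layout introduced in this commit; the repo id is the same example dataset used in the docs:

```python
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset

dataset = LeRobotDataset("lerobot/aloha_static_coffee")

episode_index = 0
# Episode boundaries now live in the episodes metadata rather than in episode_data_index.
from_idx = dataset.meta.episodes["dataset_from_index"][episode_index]
to_idx = dataset.meta.episodes["dataset_to_index"][episode_index]
num_frames = dataset.meta.episodes["length"][episode_index]
print(f"episode {episode_index}: frames [{from_idx}, {to_idx}), length {num_frames}")
```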


@@ -108,7 +108,8 @@ def save_decoded_frames(
 def save_first_episode(imgs_dir: Path, dataset: LeRobotDataset) -> None:
-    ep_num_images = dataset.episode_data_index["to"][0].item()
+    episode_index = 0
+    ep_num_images = dataset.meta.episodes["length"][episode_index]
     if imgs_dir.exists() and len(list(imgs_dir.glob("frame_*.png"))) == ep_num_images:
         return
@@ -265,7 +266,8 @@ def benchmark_encoding_decoding(
         overwrite=True,
     )
-    ep_num_images = dataset.episode_data_index["to"][0].item()
+    episode_index = 0
+    ep_num_images = dataset.meta.episodes["length"][episode_index]
     width, height = tuple(dataset[0][dataset.meta.camera_keys[0]].shape[-2:])
     num_pixels = width * height
     video_size_bytes = video_path.stat().st_size


@@ -78,11 +78,11 @@ print(dataset.hf_dataset)
 # LeRobot datasets also subclasses PyTorch datasets so you can do everything you know and love from working
 # with the latter, like iterating through the dataset.
 # The __getitem__ iterates over the frames of the dataset. Since our datasets are also structured by
-# episodes, you can access the frame indices of any episode using the episode_data_index. Here, we access
+# episodes, you can access the frame indices of any episode using dataset.meta.episodes. Here, we access
 # frame indices associated to the first episode:
 episode_index = 0
-from_idx = dataset.episode_data_index["from"][episode_index].item()
-to_idx = dataset.episode_data_index["to"][episode_index].item()
+from_idx = dataset.meta.episodes["dataset_from_index"][episode_index]
+to_idx = dataset.meta.episodes["dataset_to_index"][episode_index]
 # Then we grab all the image frames from the first camera:
 camera_key = dataset.meta.camera_keys[0]
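As a follow-up to the example above, a short sketch (assuming `dataset`, `from_idx`, `to_idx`, and `camera_key` are defined as in that snippet) that collects the first episode's frames from that camera:

```python
import torch

# __getitem__ returns channel-first image tensors, so stacking gives (num_frames, C, H, W).
frames = torch.stack([dataset[idx][camera_key] for idx in range(from_idx, to_idx)])
print(frames.shape)
```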


@@ -17,7 +17,7 @@ dataset = LeRobotDataset(dataset_repo_id, episodes=[0])
 # This is equivalent to `dataset = LeRobotDataset(dataset_repo_id, image_transforms=None)`
 # Get the index of the first observation in the first episode
-first_idx = dataset.episode_data_index["from"][0].item()
+first_idx = dataset.meta.episodes["dataset_from_index"][0]
 # Get the frame corresponding to the first camera
 frame = dataset[first_idx][dataset.meta.camera_keys[0]]


@@ -51,6 +51,7 @@ from lerobot.common.datasets.utils import (
     get_features_from_robot,
     get_hf_dataset_size_in_mb,
     get_hf_features_from_features,
+    get_parquet_file_size_in_mb,
     get_parquet_num_frames,
     get_safe_version,
     get_video_duration_in_s,
@@ -59,15 +60,16 @@ from lerobot.common.datasets.utils import (
     load_episodes,
     load_info,
     load_nested_dataset,
+    load_stats,
     load_tasks,
     update_chunk_file_indices,
     validate_episode_buffer,
     validate_frame,
     write_info,
     write_json,
+    write_stats,
     write_tasks,
 )
-from lerobot.common.datasets.v30.convert_dataset_v21_to_v30 import get_parquet_file_size_in_mb
 from lerobot.common.datasets.video_utils import (
     VideoFrame,
     decode_video_frames_torchvision,
@@ -111,8 +113,7 @@ class LeRobotDatasetMetadata:
         check_version_compatibility(self.repo_id, self._version, CODEBASE_VERSION)
         self.tasks = load_tasks(self.root)
         self.episodes = load_episodes(self.root)
-        # TODO(rcadene): https://huggingface.slack.com/archives/C02V51Q3800/p1743517952388249?thread_ts=1742896075.499119&cid=C02V51Q3800
-        # self.stats = aggregate_stats(list(self.episodes_stats.values()))
+        self.stats = load_stats(self.root)
     def pull_from_repo(
         self,
@@ -272,10 +273,17 @@ class LeRobotDatasetMetadata:
             chunk_idx, file_idx = 0, 0
             df["meta/episodes/chunk_index"] = [chunk_idx]
             df["meta/episodes/file_index"] = [file_idx]
+            df["dataset_from_index"] = [0]
+            df["dataset_to_index"] = [len(df)]
         else:
             # Retrieve information from the latest parquet file
             latest_ep = self.episodes.with_format(
-                columns=["meta/episodes/chunk_index", "meta/episodes/file_index"]
+                columns=[
+                    "meta/episodes/chunk_index",
+                    "meta/episodes/file_index",
+                    "dataset_from_index",
+                    "dataset_to_index",
+                ]
             )[-1]
             chunk_idx, file_idx = (
                 latest_ep["meta/episodes/chunk_index"],
@@ -285,16 +293,18 @@ class LeRobotDatasetMetadata:
             latest_path = self.root / DEFAULT_EPISODES_PATH.format(chunk_index=chunk_idx, file_index=file_idx)
             latest_size_in_mb = get_parquet_file_size_in_mb(latest_path)
+            # Determine if a new parquet file is needed
             if latest_size_in_mb + ep_size_in_mb >= self.files_size_in_mb:
                 # Size limit is reached, prepare new parquet file
                 chunk_idx, file_idx = update_chunk_file_indices(chunk_idx, file_idx, self.meta.chunks_size)
-                df["meta/episodes/chunk_index"] = [chunk_idx]
-                df["meta/episodes/file_index"] = [file_idx]
-            else:
-                # Update the existing parquet file with new row
-                df["meta/episodes/chunk_index"] = [chunk_idx]
-                df["meta/episodes/file_index"] = [file_idx]
+            # Update the existing pandas dataframe with new row
+            df["meta/episodes/chunk_index"] = [chunk_idx]
+            df["meta/episodes/file_index"] = [file_idx]
+            df["dataset_from_index"] = [latest_ep["dataset_to_index"]]
+            df["dataset_to_index"] = [latest_ep["dataset_to_index"] + len(df)]
+            if latest_size_in_mb + ep_size_in_mb < self.files_size_in_mb:
+                # Size limit wasnt reached, concatenate latest dataframe with new one
                 latest_df = pd.read_parquet(latest_path)
                 df = pd.concat([latest_df, df], ignore_index=True)
@@ -333,8 +343,8 @@ class LeRobotDatasetMetadata:
         self.update_video_info()
         write_info(self.info, self.root)
-        self.stats = aggregate_stats([self.stats, episode_stats]) if self.stats else episode_stats
-        # TODO: write stats
+        self.stats = aggregate_stats([self.stats, episode_stats]) if self.stats is not None else episode_stats
+        write_stats(self.stats, self.root)
     def update_video_info(self) -> None:
         """
@@ -401,8 +411,7 @@ class LeRobotDatasetMetadata:
         obj.tasks = None
         obj.episodes = None
-        # TODO(rcadene) stats
-        obj.stats = {}
+        obj.stats = None
         obj.info = create_empty_dataset_info(CODEBASE_VERSION, fps, robot_type, features, use_videos)
         if len(obj.video_keys) > 0 and not use_videos:
             raise ValueError()
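The `save_episode` changes above follow an append-or-rollover policy for the episodes metadata parquet files: append the new episode row to the latest file while it stays under the size limit, otherwise bump the chunk/file indices and start a new file. A standalone sketch of that policy under simplified assumptions (the helper below and its arguments are illustrative, not the actual `LeRobotDatasetMetadata` API):

```python
from pathlib import Path

import pandas as pd


def append_or_rollover(df: pd.DataFrame, latest_path: Path, new_path: Path, max_size_in_mb: float) -> Path:
    """Append `df` to the latest parquet file, or start a new one when the size limit is reached."""
    latest_size_in_mb = latest_path.stat().st_size / (1024**2)
    new_size_in_mb = df.memory_usage(deep=True).sum() / (1024**2)

    if latest_size_in_mb + new_size_in_mb < max_size_in_mb:
        # Below the limit: concatenate with the existing rows and rewrite the same file.
        df = pd.concat([pd.read_parquet(latest_path), df], ignore_index=True)
        df.to_parquet(latest_path)
        return latest_path

    # Limit reached: write a fresh file (the real code derives new_path from updated chunk/file indices).
    new_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(new_path)
    return new_path
```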


@@ -337,13 +337,11 @@ def compute_sampler_weights(
     if len(offline_dataset) > 0:
         offline_data_mask_indices = []
         for start_index, end_index in zip(
-            offline_dataset.episode_data_index["from"],
-            offline_dataset.episode_data_index["to"],
+            offline_dataset.meta.episodes["dataset_from_index"],
+            offline_dataset.meta.episodes["dataset_to_index"],
             strict=True,
         ):
-            offline_data_mask_indices.extend(
-                range(start_index.item(), end_index.item() - offline_drop_n_last_frames)
-            )
+            offline_data_mask_indices.extend(range(start_index, end_index - offline_drop_n_last_frames))
         offline_data_mask = torch.zeros(len(offline_dataset), dtype=torch.bool)
         offline_data_mask[torch.tensor(offline_data_mask_indices)] = True
         weights.append(


@@ -21,7 +21,8 @@ import torch
 class EpisodeAwareSampler:
     def __init__(
         self,
-        episode_data_index: dict,
+        dataset_from_indices: list[int],
+        dataset_to_indices: list[int],
         episode_indices_to_use: Union[list, None] = None,
         drop_n_first_frames: int = 0,
         drop_n_last_frames: int = 0,
@@ -30,7 +31,8 @@ class EpisodeAwareSampler:
         """Sampler that optionally incorporates episode boundary information.
         Args:
-            episode_data_index: Dictionary with keys 'from' and 'to' containing the start and end indices of each episode.
+            dataset_from_indices: List of indices containing the start of each episode in the dataset.
+            dataset_to_indices: List of indices containing the end of each episode in the dataset.
             episode_indices_to_use: List of episode indices to use. If None, all episodes are used.
                 Assumes that episodes are indexed from 0 to N-1.
             drop_n_first_frames: Number of frames to drop from the start of each episode.
@@ -39,12 +41,10 @@ class EpisodeAwareSampler:
         """
         indices = []
         for episode_idx, (start_index, end_index) in enumerate(
-            zip(episode_data_index["from"], episode_data_index["to"], strict=True)
+            zip(dataset_from_indices, dataset_to_indices, strict=True)
         ):
             if episode_indices_to_use is None or episode_idx in episode_indices_to_use:
-                indices.extend(
-                    range(start_index.item() + drop_n_first_frames, end_index.item() - drop_n_last_frames)
-                )
+                indices.extend(range(start_index + drop_n_first_frames, end_index - drop_n_last_frames))
         self.indices = indices
         self.shuffle = shuffle
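With the new signature, callers pass the per-episode start and end indices directly instead of an `episode_data_index` dict. A usage sketch (the import path and the `DataLoader` wiring are assumptions based on how `train.py` uses the sampler elsewhere in this diff; the repo id and batch size are illustrative):

```python
import torch
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
from lerobot.common.datasets.sampler import EpisodeAwareSampler

dataset = LeRobotDataset("lerobot/pusht")
sampler = EpisodeAwareSampler(
    dataset.meta.episodes["dataset_from_index"],
    dataset.meta.episodes["dataset_to_index"],
    drop_n_last_frames=7,  # illustrative value; train.py takes this from cfg.policy
    shuffle=True,
)
# The sampler handles shuffling, so the DataLoader's own shuffle stays off.
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, sampler=sampler, shuffle=False)
```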


@@ -21,7 +21,6 @@ import shutil
 import subprocess
 import tempfile
 from collections.abc import Iterator
-from itertools import accumulate
 from pathlib import Path
 from pprint import pformat
 from types import SimpleNamespace
@@ -56,23 +55,23 @@ DEFAULT_FILE_SIZE_IN_MB = 500.0 # Max size per file
 # Keep legacy for `convert_dataset_v21_to_v30.py`
 LEGACY_EPISODES_PATH = "meta/episodes.jsonl"
-LEGACY_STATS_PATH = "meta/stats.json"
 LEGACY_EPISODES_STATS_PATH = "meta/episodes_stats.jsonl"
 LEGACY_TASKS_PATH = "meta/tasks.jsonl"
 LEGACY_DEFAULT_VIDEO_PATH = "videos/chunk-{episode_chunk:03d}/{video_key}/episode_{episode_index:06d}.mp4"
 LEGACY_DEFAULT_PARQUET_PATH = "data/chunk-{episode_chunk:03d}/episode_{episode_index:06d}.parquet"
-# TODO
-DEFAULT_IMAGE_PATH = "images/{image_key}/episode_{episode_index:06d}/frame_{frame_index:06d}.png"
+DEFAULT_IMAGE_PATH = "images/{image_key}/episode-{episode_index:06d}/frame-{frame_index:06d}.png"
+INFO_PATH = "meta/info.json"
+STATS_PATH = "meta/stats.json"
 EPISODES_DIR = "meta/episodes"
 DATA_DIR = "data"
 VIDEO_DIR = "videos"
-INFO_PATH = "meta/info.json"
 CHUNK_FILE_PATTERN = "chunk-{chunk_index:03d}/file-{file_index:03d}"
-DEFAULT_EPISODES_PATH = EPISODES_DIR + "/" + CHUNK_FILE_PATTERN + ".parquet"
 DEFAULT_TASKS_PATH = "meta/tasks.parquet"
+DEFAULT_EPISODES_PATH = EPISODES_DIR + "/" + CHUNK_FILE_PATTERN + ".parquet"
 DEFAULT_DATA_PATH = DATA_DIR + "/" + CHUNK_FILE_PATTERN + ".parquet"
 DEFAULT_VIDEO_PATH = VIDEO_DIR + "/{video_key}/" + CHUNK_FILE_PATTERN + ".mp4"
@@ -95,6 +94,12 @@ DEFAULT_FEATURES = {
 }
+def get_parquet_file_size_in_mb(parquet_path):
+    metadata = pq.read_metadata(parquet_path)
+    uncompressed_size = metadata.num_rows * metadata.row_group(0).total_byte_size
+    return uncompressed_size / (1024**2)
 def get_hf_dataset_size_in_mb(hf_ds: Dataset) -> int:
     return hf_ds.data.nbytes / (1024**2)
@@ -317,7 +322,7 @@ def load_info(local_dir: Path) -> dict:
 def write_stats(stats: dict, local_dir: Path):
     serialized_stats = serialize_dict(stats)
-    write_json(serialized_stats, local_dir / LEGACY_STATS_PATH)
+    write_json(serialized_stats, local_dir / STATS_PATH)
 def cast_stats_to_numpy(stats) -> dict[str, dict[str, np.ndarray]]:
@@ -326,9 +331,9 @@ def cast_stats_to_numpy(stats) -> dict[str, dict[str, np.ndarray]]:
 def load_stats(local_dir: Path) -> dict[str, dict[str, np.ndarray]]:
-    if not (local_dir / LEGACY_STATS_PATH).exists():
+    if not (local_dir / STATS_PATH).exists():
         return None
-    stats = load_json(local_dir / LEGACY_STATS_PATH)
+    stats = load_json(local_dir / STATS_PATH)
     return cast_stats_to_numpy(stats)
@@ -375,13 +380,6 @@ def write_episodes(episodes: Dataset, local_dir: Path):
     if get_hf_dataset_size_in_mb(episodes) > DEFAULT_FILE_SIZE_IN_MB:
         raise NotImplementedError("Contact a maintainer.")
-    def add_chunk_file_indices(row):
-        row["chunk_index"] = 0
-        row["file_index"] = 0
-        return row
-    episodes = episodes.map(add_chunk_file_indices)
     fpath = local_dir / DEFAULT_EPISODES_PATH.format(chunk_index=0, file_index=0)
     fpath.parent.mkdir(parents=True, exist_ok=True)
     episodes.to_parquet(fpath)
@@ -642,20 +640,6 @@ def create_empty_dataset_info(
     }
-def get_episode_data_index(
-    episode_dicts: dict[dict], episodes: list[int] | None = None
-) -> dict[str, torch.Tensor]:
-    episode_lengths = {ep_idx: ep_dict["length"] for ep_idx, ep_dict in episode_dicts.items()}
-    if episodes is not None:
-        episode_lengths = {ep_idx: episode_lengths[ep_idx] for ep_idx in episodes}
-    cumulative_lengths = list(accumulate(episode_lengths.values()))
-    return {
-        "from": torch.LongTensor([0] + cumulative_lengths[:-1]),
-        "to": torch.LongTensor(cumulative_lengths),
-    }
 def check_timestamps_sync(
     timestamps: np.ndarray,
     episode_indices: np.ndarray,
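The removed `get_episode_data_index` helper is superseded by the `dataset_from_index` / `dataset_to_index` columns that are now stored per episode. For completeness, a minimal sketch of the same cumulative-length computation it used to perform (plain Python instead of torch tensors):

```python
from itertools import accumulate


def episode_ranges_from_lengths(lengths: list[int]) -> list[tuple[int, int]]:
    """Return one (from_index, to_index) pair per episode, given per-episode frame counts."""
    ends = list(accumulate(lengths))
    starts = [0] + ends[:-1]
    return list(zip(starts, ends, strict=True))


# Episodes of 2, 1 and 3 frames map to [(0, 2), (2, 3), (3, 6)],
# matching the expectations of the test_calculate_episode_data_index test deleted later in this diff.
assert episode_ranges_from_lengths([2, 1, 3]) == [(0, 2), (2, 3), (3, 6)]
```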


@@ -123,10 +123,10 @@ from lerobot.common.datasets.utils import (
     DEFAULT_CHUNK_SIZE,
     DEFAULT_DATA_PATH,
     DEFAULT_VIDEO_PATH,
-    LEGACY_EPISODES_PATH,
     INFO_PATH,
-    LEGACY_STATS_PATH,
+    LEGACY_EPISODES_PATH,
     LEGACY_TASKS_PATH,
+    STATS_PATH,
     create_branch,
     create_lerobot_dataset_card,
     flatten_dict,
@@ -188,7 +188,7 @@ def convert_stats_to_json(v1_dir: Path, v2_dir: Path) -> None:
     serialized_stats = {key: value.tolist() for key, value in stats.items()}
     serialized_stats = unflatten_dict(serialized_stats)
-    json_path = v2_dir / LEGACY_STATS_PATH
+    json_path = v2_dir / STATS_PATH
     json_path.parent.mkdir(exist_ok=True, parents=True)
     with open(json_path, "w") as f:
         json.dump(serialized_stats, f, indent=4)
@@ -296,9 +296,7 @@ def split_parquet_by_episodes(
     for ep_idx in range(ep_chunk_start, ep_chunk_end):
         ep_table = table.filter(pc.equal(table["episode_index"], ep_idx))
         episode_lengths.insert(ep_idx, len(ep_table))
-        output_file = output_dir / DEFAULT_DATA_PATH.format(
-            episode_chunk=ep_chunk, episode_index=ep_idx
-        )
+        output_file = output_dir / DEFAULT_DATA_PATH.format(episode_chunk=ep_chunk, episode_index=ep_idx)
         pq.write_table(ep_table, output_file)
     return episode_lengths


@@ -23,7 +23,7 @@ import logging
 from huggingface_hub import HfApi
 from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset
-from lerobot.common.datasets.utils import LEGACY_EPISODES_STATS_PATH, LEGACY_STATS_PATH, load_stats, write_info
+from lerobot.common.datasets.utils import LEGACY_EPISODES_STATS_PATH, STATS_PATH, load_stats, write_info
 from lerobot.common.datasets.v21.convert_stats import check_aggregate_stats, convert_stats
 V20 = "v2.0"
@@ -60,15 +60,15 @@ def convert_dataset(
     dataset.push_to_hub(branch=branch, tag_version=False, allow_patterns="meta/")
     # delete old stats.json file
-    if (dataset.root / LEGACY_STATS_PATH).is_file:
-        (dataset.root / LEGACY_STATS_PATH).unlink()
+    if (dataset.root / STATS_PATH).is_file:
+        (dataset.root / STATS_PATH).unlink()
     hub_api = HfApi()
     if hub_api.file_exists(
-        repo_id=dataset.repo_id, filename=LEGACY_STATS_PATH, revision=branch, repo_type="dataset"
+        repo_id=dataset.repo_id, filename=STATS_PATH, revision=branch, repo_type="dataset"
     ):
         hub_api.delete_file(
-            path_in_repo=LEGACY_STATS_PATH, repo_id=dataset.repo_id, revision=branch, repo_type="dataset"
+            path_in_repo=STATS_PATH, repo_id=dataset.repo_id, revision=branch, repo_type="dataset"
         )
     hub_api.create_tag(repo_id, tag=CODEBASE_VERSION, revision=branch, repo_type="dataset")


@@ -18,15 +18,16 @@ python lerobot/common/datasets/v30/convert_dataset_v21_to_v30.py \
 """
 import argparse
+import shutil
 from pathlib import Path
 import pandas as pd
-import pyarrow.parquet as pq
 import tqdm
 from datasets import Dataset
-from huggingface_hub import snapshot_download
+from huggingface_hub import HfApi, snapshot_download
 from lerobot.common.constants import HF_LEROBOT_HOME
+from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset
 from lerobot.common.datasets.utils import (
     DEFAULT_CHUNK_SIZE,
     DEFAULT_DATA_PATH,
@@ -34,6 +35,7 @@ from lerobot.common.datasets.utils import (
     DEFAULT_VIDEO_PATH,
     concat_video_files,
     flatten_dict,
+    get_parquet_file_size_in_mb,
     get_parquet_num_frames,
     get_video_duration_in_s,
     get_video_size_in_mb,
@@ -93,12 +95,6 @@ meta/info.json
 """
-def get_parquet_file_size_in_mb(parquet_path):
-    metadata = pq.read_metadata(parquet_path)
-    uncompressed_size = metadata.num_rows * metadata.row_group(0).total_byte_size
-    return uncompressed_size / (1024**2)
 # def generate_flat_ep_stats(episodes_stats):
 #     for ep_idx, ep_stats in episodes_stats.items():
 #         flat_ep_stats = flatten_dict(ep_stats)
@@ -148,8 +144,8 @@ def convert_data(root, new_root):
             "episode_index": ep_idx,
             "data/chunk_index": chunk_idx,
             "data/file_index": file_idx,
-            "data/from_index": num_frames,
-            "data/to_index": num_frames + ep_num_frames,
+            "dataset_from_index": num_frames,
+            "dataset_to_index": num_frames + ep_num_frames,
         }
         size_in_mb += ep_size_in_mb
         num_frames += ep_num_frames
@@ -337,6 +333,9 @@ def convert_dataset(
     root = HF_LEROBOT_HOME / repo_id
     new_root = HF_LEROBOT_HOME / f"{repo_id}_v30"
+    if new_root.is_dir():
+        shutil.rmtree(new_root)
     snapshot_download(
         repo_id,
         repo_type="dataset",
@@ -350,6 +349,24 @@ def convert_dataset(
     episodes_videos_metadata = convert_videos(root, new_root)
     convert_episodes_metadata(root, new_root, episodes_metadata, episodes_videos_metadata)
+    shutil.move(str(root), str(root) + "_old")
+    shutil.move(str(new_root), str(root))
+    # TODO(racdene)
+    if False:
+        hub_api = HfApi()
+        hub_api.delete_tag(repo_id, tag=CODEBASE_VERSION, revision=branch, repo_type="dataset")
+        hub_api.delete_files(
+            delete_patterns=["data/chunk*/episode_*", "meta/*.jsonl", "videos/chunk*"],
+            repo_id=repo_id,
+            revision=branch,
+            repo_type="dataset",
+        )
+        hub_api.create_tag(repo_id, tag=CODEBASE_VERSION, revision=branch, repo_type="dataset")
+    LeRobotDataset(repo_id).push_to_hub()
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()


@@ -167,7 +167,8 @@ def train(cfg: TrainPipelineConfig):
     if hasattr(cfg.policy, "drop_n_last_frames"):
         shuffle = False
         sampler = EpisodeAwareSampler(
-            dataset.episode_data_index,
+            dataset.meta.episodes["dataset_from_index"],
+            dataset.meta.episodes["dataset_to_index"],
             drop_n_last_frames=cfg.policy.drop_n_last_frames,
             shuffle=True,
         )


@@ -79,8 +79,8 @@ from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
 class EpisodeSampler(torch.utils.data.Sampler):
     def __init__(self, dataset: LeRobotDataset, episode_index: int):
-        from_idx = dataset.episode_data_index["from"][episode_index].item()
-        to_idx = dataset.episode_data_index["to"][episode_index].item()
+        from_idx = dataset.meta.episodes["dataset_from_index"][episode_index].item()
+        to_idx = dataset.meta.episodes["dataset_to_index"][episode_index].item()
         self.frame_ids = range(from_idx, to_idx)
     def __iter__(self) -> Iterator:


@@ -259,8 +259,8 @@ def get_episode_data(dataset: LeRobotDataset | IterableNamespace, episode_index)
     selected_columns.insert(0, "timestamp")
     if isinstance(dataset, LeRobotDataset):
-        from_idx = dataset.episode_data_index["from"][episode_index]
-        to_idx = dataset.episode_data_index["to"][episode_index]
+        from_idx = dataset.meta.episodes["dataset_from_index"][episode_index]
+        to_idx = dataset.meta.episodes["dataset_to_index"][episode_index]
         data = (
             dataset.hf_dataset.select(range(from_idx, to_idx))
             .select_columns(selected_columns)
@@ -296,7 +296,7 @@ def get_episode_data(dataset: LeRobotDataset | IterableNamespace, episode_index)
 def get_episode_video_paths(dataset: LeRobotDataset, ep_index: int) -> list[str]:
     # get first frame of episode (hack to get video_path of the episode)
-    first_frame_idx = dataset.episode_data_index["from"][ep_index].item()
+    first_frame_idx = dataset.meta.episodes["dataset_from_index"][ep_index]
     return [
         dataset.hf_dataset.select_columns(key)[first_frame_idx][key]["path"]
         for key in dataset.meta.video_keys
@@ -309,7 +309,7 @@ def get_episode_language_instruction(dataset: LeRobotDataset, ep_index: int) ->
         return None
     # get first frame index
-    first_frame_idx = dataset.episode_data_index["from"][ep_index].item()
+    first_frame_idx = dataset.meta.episodes["dataset_from_index"][ep_index]
     language_instruction = dataset.hf_dataset[first_frame_idx]["language_instruction"]
     # TODO (michel-aractingi) hack to get the sentence, some strings in openx are badly stored


@@ -230,6 +230,8 @@ def episodes_factory(tasks_factory, stats_factory):
             "meta/episodes/file_index": [],
             "data/chunk_index": [],
             "data/file_index": [],
+            "dataset_from_index": [],
+            "dataset_to_index": [],
             "tasks": [],
             "length": [],
         }
@@ -241,6 +243,7 @@ def episodes_factory(tasks_factory, stats_factory):
         for stats_key in flatten_dict({"stats": stats_factory(features)}):
            d[stats_key] = []
+        num_frames = 0
         remaining_tasks = list(tasks.index)
         for ep_idx in range(total_episodes):
             num_tasks_in_episode = random.randint(1, min(3, num_tasks_available)) if multi_task else 1
@@ -256,6 +259,8 @@ def episodes_factory(tasks_factory, stats_factory):
             d["meta/episodes/file_index"].append(0)
             d["data/chunk_index"].append(0)
             d["data/file_index"].append(0)
+            d["dataset_from_index"].append(num_frames)
+            d["dataset_to_index"].append(num_frames + lengths[ep_idx])
             d["tasks"].append(episode_tasks)
             d["length"].append(lengths[ep_idx])
@@ -268,6 +273,8 @@ def episodes_factory(tasks_factory, stats_factory):
             for stats_key, stats in flatten_dict({"stats": stats_factory(features)}).items():
                 d[stats_key].append(stats)
+            num_frames += lengths[ep_idx]
         return Dataset.from_dict(d)
     return _create_episodes
@@ -283,10 +290,10 @@ def hf_dataset_factory(features_factory, tasks_factory, episodes_factory, img_ar
     ) -> datasets.Dataset:
         if tasks is None:
             tasks = tasks_factory()
-        if episodes is None:
-            episodes = episodes_factory()
         if features is None:
             features = features_factory()
+        if episodes is None:
+            episodes = episodes_factory(features)
         timestamp_col = np.array([], dtype=np.float32)
         frame_index_col = np.array([], dtype=np.int64)
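The factory above threads a running `num_frames` counter through the episode loop so that the generated metadata stays self-consistent. A small sketch of the invariants it is meant to guarantee, assuming `episodes` is the `datasets.Dataset` returned by `episodes_factory`:

```python
def check_episode_index_invariants(episodes) -> None:
    froms = episodes["dataset_from_index"]
    tos = episodes["dataset_to_index"]
    lengths = episodes["length"]
    for i, (start, end, length) in enumerate(zip(froms, tos, lengths, strict=True)):
        # Each episode's span must match its length, and episodes must tile the dataset contiguously.
        assert end - start == length, f"episode {i}: span does not match length"
        if i > 0:
            assert start == tos[i - 1], f"episode {i}: does not start where the previous one ends"
```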


@@ -10,7 +10,7 @@ from lerobot.common.datasets.utils import (
     DEFAULT_EPISODES_PATH,
     DEFAULT_TASKS_PATH,
     INFO_PATH,
-    LEGACY_STATS_PATH,
+    STATS_PATH,
 )
 from tests.fixtures.constants import LEROBOT_TEST_DIR
@@ -70,7 +70,7 @@ def mock_snapshot_download_factory(
     # List all possible files
     all_files = [
         INFO_PATH,
-        LEGACY_STATS_PATH,
+        STATS_PATH,
         # TODO(rcadene): remove naive chunk 0 file 0 ?
         DEFAULT_TASKS_PATH.format(chunk_index=0, file_index=0),
         DEFAULT_EPISODES_PATH.format(chunk_index=0, file_index=0),


@@ -47,17 +47,23 @@ def save_dataset_to_safetensors(output_dir, repo_id="lerobot/pusht"):
     )
     # save 2 first frames of first episode
-    i = dataset.episode_data_index["from"][0].item()
+    i = dataset.meta.episodes["dataset_from_index"][0].item()
     save_file(dataset[i], repo_dir / f"frame_{i}.safetensors")
     save_file(dataset[i + 1], repo_dir / f"frame_{i + 1}.safetensors")
     # save 2 frames at the middle of first episode
-    i = int((dataset.episode_data_index["to"][0].item() - dataset.episode_data_index["from"][0].item()) / 2)
+    i = int(
+        (
+            dataset.meta.episodes["dataset_to_index"][0].item()
+            - dataset.meta.episodes["dataset_from_index"][0].item()
+        )
+        / 2
+    )
     save_file(dataset[i], repo_dir / f"frame_{i}.safetensors")
     save_file(dataset[i + 1], repo_dir / f"frame_{i + 1}.safetensors")
     # save 2 last frames of first episode
-    i = dataset.episode_data_index["to"][0].item()
+    i = dataset.meta.episodes["dataset_to_index"][0].item()
     save_file(dataset[i - 2], repo_dir / f"frame_{i - 2}.safetensors")
     save_file(dataset[i - 1], repo_dir / f"frame_{i - 1}.safetensors")
@@ -65,17 +71,17 @@ def save_dataset_to_safetensors(output_dir, repo_id="lerobot/pusht"):
     # We currently cant because our test dataset only contains the first episode
     # # save 2 first frames of second episode
-    # i = dataset.episode_data_index["from"][1].item()
+    # i = dataset.meta.episodes["dataset_from_index"][1].item()
     # save_file(dataset[i], repo_dir / f"frame_{i}.safetensors")
     # save_file(dataset[i + 1], repo_dir / f"frame_{i+1}.safetensors")
     # # save 2 last frames of second episode
-    # i = dataset.episode_data_index["to"][1].item()
+    # i = dataset.meta.episodes["dataset_to_index"][1].item()
     # save_file(dataset[i - 2], repo_dir / f"frame_{i-2}.safetensors")
     # save_file(dataset[i - 1], repo_dir / f"frame_{i-1}.safetensors")
     # # save 2 last frames of last episode
-    # i = dataset.episode_data_index["to"][-1].item()
+    # i = dataset.meta.episodes["dataset_to_index"][-1].item()
     # save_file(dataset[i - 2], repo_dir / f"frame_{i-2}.safetensors")
     # save_file(dataset[i - 1], repo_dir / f"frame_{i-1}.safetensors")


@@ -507,17 +507,23 @@ def test_backward_compatibility(repo_id):
     )
     # test2 first frames of first episode
-    i = dataset.episode_data_index["from"][0].item()
+    i = dataset.meta.episodes["dataset_from_index"][0].item()
     load_and_compare(i)
     load_and_compare(i + 1)
     # test 2 frames at the middle of first episode
-    i = int((dataset.episode_data_index["to"][0].item() - dataset.episode_data_index["from"][0].item()) / 2)
+    i = int(
+        (
+            dataset.meta.episodes["dataset_to_index"][0].item()
+            - dataset.meta.episodes["dataset_from_index"][0].item()
+        )
+        / 2
+    )
     load_and_compare(i)
     load_and_compare(i + 1)
     # test 2 last frames of first episode
-    i = dataset.episode_data_index["to"][0].item()
+    i = dataset.meta.episodes["dataset_to_index"][0].item()
     load_and_compare(i - 2)
     load_and_compare(i - 1)
@@ -525,17 +531,17 @@ def test_backward_compatibility(repo_id):
     # We currently cant because our test dataset only contains the first episode
     # # test 2 first frames of second episode
-    # i = dataset.episode_data_index["from"][1].item()
+    # i = dataset.meta.episodes["dataset_from_index"][1].item()
     # load_and_compare(i)
     # load_and_compare(i + 1)
     # # test 2 last frames of second episode
-    # i = dataset.episode_data_index["to"][1].item()
+    # i = dataset.meta.episodes["dataset_to_index"][1].item()
     # load_and_compare(i - 2)
     # load_and_compare(i - 1)
     # # test 2 last frames of last episode
-    # i = dataset.episode_data_index["to"][-1].item()
+    # i = dataset.meta.episodes["dataset_to_index"][-1].item()
     # load_and_compare(i - 2)
     # load_and_compare(i - 1)


@@ -43,8 +43,8 @@ def calculate_episode_data_index(hf_dataset: datasets.Dataset) -> dict[str, np.n
 def synced_timestamps_factory(hf_dataset_factory):
     def _create_synced_timestamps(fps: int = 30) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
         hf_dataset = hf_dataset_factory(fps=fps)
-        timestamps = torch.stack(hf_dataset["timestamp"]).numpy()
-        episode_indices = torch.stack(hf_dataset["episode_index"]).numpy()
+        timestamps = hf_dataset["timestamp"].numpy()
+        episode_indices = hf_dataset["episode_index"].numpy()
         episode_data_index = calculate_episode_data_index(hf_dataset)
         return timestamps, episode_indices, episode_data_index


@@ -68,7 +68,11 @@ def dummy_dataset_metadata(lerobot_dataset_metadata_factory, info_factory, tmp_p
         },
     }
     info = info_factory(
-        total_episodes=1, total_frames=1, camera_features=camera_features, motor_features=motor_features
+        total_episodes=1,
+        total_frames=1,
+        total_tasks=1,
+        camera_features=camera_features,
+        motor_features=motor_features,
     )
     ds_meta = lerobot_dataset_metadata_factory(root=tmp_path / "init", info=info)
     return ds_meta


@@ -32,7 +32,7 @@ def test_drop_n_first_frames():
     )
     dataset.set_transform(hf_transform_to_torch)
     episode_data_index = calculate_episode_data_index(dataset)
-    sampler = EpisodeAwareSampler(episode_data_index, drop_n_first_frames=1)
+    sampler = EpisodeAwareSampler(episode_data_index["from"], episode_data_index["to"], drop_n_first_frames=1)
     assert sampler.indices == [1, 4, 5]
     assert len(sampler) == 3
     assert list(sampler) == [1, 4, 5]
@@ -48,7 +48,7 @@ def test_drop_n_last_frames():
     )
     dataset.set_transform(hf_transform_to_torch)
     episode_data_index = calculate_episode_data_index(dataset)
-    sampler = EpisodeAwareSampler(episode_data_index, drop_n_last_frames=1)
+    sampler = EpisodeAwareSampler(episode_data_index["from"], episode_data_index["to"], drop_n_last_frames=1)
     assert sampler.indices == [0, 3, 4]
     assert len(sampler) == 3
     assert list(sampler) == [0, 3, 4]
@@ -64,7 +64,9 @@ def test_episode_indices_to_use():
     )
     dataset.set_transform(hf_transform_to_torch)
     episode_data_index = calculate_episode_data_index(dataset)
-    sampler = EpisodeAwareSampler(episode_data_index, episode_indices_to_use=[0, 2])
+    sampler = EpisodeAwareSampler(
+        episode_data_index["from"], episode_data_index["to"], episode_indices_to_use=[0, 2]
+    )
     assert sampler.indices == [0, 1, 3, 4, 5]
     assert len(sampler) == 5
     assert list(sampler) == [0, 1, 3, 4, 5]
@@ -80,11 +82,11 @@ def test_shuffle():
     )
     dataset.set_transform(hf_transform_to_torch)
     episode_data_index = calculate_episode_data_index(dataset)
-    sampler = EpisodeAwareSampler(episode_data_index, shuffle=False)
+    sampler = EpisodeAwareSampler(episode_data_index["from"], episode_data_index["to"], shuffle=False)
     assert sampler.indices == [0, 1, 2, 3, 4, 5]
     assert len(sampler) == 6
     assert list(sampler) == [0, 1, 2, 3, 4, 5]
-    sampler = EpisodeAwareSampler(episode_data_index, shuffle=True)
+    sampler = EpisodeAwareSampler(episode_data_index["from"], episode_data_index["to"], shuffle=True)
     assert sampler.indices == [0, 1, 2, 3, 4, 5]
     assert len(sampler) == 6
     assert set(sampler) == {0, 1, 2, 3, 4, 5}


@@ -1,21 +0,0 @@
-import torch
-from datasets import Dataset
-from lerobot.common.datasets.push_dataset_to_hub.utils import calculate_episode_data_index
-from lerobot.common.datasets.utils import (
-    hf_transform_to_torch,
-)
-def test_calculate_episode_data_index():
-    dataset = Dataset.from_dict(
-        {
-            "timestamp": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
-            "index": [0, 1, 2, 3, 4, 5],
-            "episode_index": [0, 0, 1, 2, 2, 2],
-        },
-    )
-    dataset.set_transform(hf_transform_to_torch)
-    episode_data_index = calculate_episode_data_index(dataset)
-    assert torch.equal(episode_data_index["from"], torch.tensor([0, 2, 3]))
-    assert torch.equal(episode_data_index["to"], torch.tensor([2, 3, 6]))