most unit tests passing (TODO: convert datasets)
commit 6b6a990f4c (parent c2a05a1fde)

@@ -178,6 +178,7 @@ Under the hood, the `LeRobotDataset` format makes use of several ways to seriali
 Here are the important details and internal structure organization of a typical `LeRobotDataset` instantiated with `dataset = LeRobotDataset("lerobot/aloha_static_coffee")`. The exact features will change from dataset to dataset but not the main aspects:

 ```
+TODO: IMPROVE
 dataset attributes:
 ├ hf_dataset: a Hugging Face dataset (backed by Arrow/parquet). Typical features example:
 │ ├ observation.images.cam_high (VideoFrame):
@@ -190,7 +191,7 @@ dataset attributes:
 │ ├ timestamp (float32): timestamp in the episode
 │ ├ next.done (bool): indicates the end of en episode ; True for the last frame in each episode
 │ └ index (int64): general index in the whole dataset
-├ episode_data_index: contains 2 tensors with the start and end indices of each episode
+├ meta: contains 2 tensors with the start and end indices of each episode
 │ ├ from (1D int64 tensor): first frame index for each episode — shape (num episodes,) starts with 0
 │ └ to: (1D int64 tensor): last frame index for each episode — shape (num episodes,)
 ├ stats: a dictionary of statistics (max, mean, min, std) for each feature in the dataset, for instance
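For illustration only (not part of this commit): a minimal sketch of how the per-feature statistics described above are typically consumed, assuming the dataset exposes an `observation.state` feature; the repo id is the same placeholder used in the docs.

```python
import torch

from lerobot.common.datasets.lerobot_dataset import LeRobotDataset

dataset = LeRobotDataset("lerobot/aloha_static_coffee")

# meta.stats maps each feature name to a dict with "max", "mean", "min", "std".
stats = dataset.meta.stats
state_mean = torch.as_tensor(stats["observation.state"]["mean"])
state_std = torch.as_tensor(stats["observation.state"]["std"])

# Standard-score normalization of a single frame's state vector (illustrative only).
frame = dataset[0]
normalized_state = (frame["observation.state"] - state_mean) / state_std
```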
@@ -108,7 +108,8 @@ def save_decoded_frames(


 def save_first_episode(imgs_dir: Path, dataset: LeRobotDataset) -> None:
-    ep_num_images = dataset.episode_data_index["to"][0].item()
+    episode_index = 0
+    ep_num_images = dataset.meta.episodes["length"][episode_index]
     if imgs_dir.exists() and len(list(imgs_dir.glob("frame_*.png"))) == ep_num_images:
         return

@@ -265,7 +266,8 @@ def benchmark_encoding_decoding(
         overwrite=True,
     )

-    ep_num_images = dataset.episode_data_index["to"][0].item()
+    episode_index = 0
+    ep_num_images = dataset.meta.episodes["length"][episode_index]
     width, height = tuple(dataset[0][dataset.meta.camera_keys[0]].shape[-2:])
     num_pixels = width * height
     video_size_bytes = video_path.stat().st_size
@@ -78,11 +78,11 @@ print(dataset.hf_dataset)
 # LeRobot datasets also subclasses PyTorch datasets so you can do everything you know and love from working
 # with the latter, like iterating through the dataset.
 # The __getitem__ iterates over the frames of the dataset. Since our datasets are also structured by
-# episodes, you can access the frame indices of any episode using the episode_data_index. Here, we access
+# episodes, you can access the frame indices of any episode using dataset.meta.episodes. Here, we access
 # frame indices associated to the first episode:
 episode_index = 0
-from_idx = dataset.episode_data_index["from"][episode_index].item()
-to_idx = dataset.episode_data_index["to"][episode_index].item()
+from_idx = dataset.meta.episodes["dataset_from_index"][episode_index]
+to_idx = dataset.meta.episodes["dataset_to_index"][episode_index]

 # Then we grab all the image frames from the first camera:
 camera_key = dataset.meta.camera_keys[0]
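As a follow-up to the example file diffed above, a hedged sketch (not part of the diff) that uses the new `dataset.meta.episodes` columns to gather every frame of the first camera for episode 0; the repo id is a placeholder.

```python
import torch

from lerobot.common.datasets.lerobot_dataset import LeRobotDataset

dataset = LeRobotDataset("lerobot/aloha_static_coffee")

episode_index = 0
from_idx = dataset.meta.episodes["dataset_from_index"][episode_index]
to_idx = dataset.meta.episodes["dataset_to_index"][episode_index]
camera_key = dataset.meta.camera_keys[0]

# Stack the episode's camera frames into a (num_frames, channels, height, width) tensor.
frames = [dataset[idx][camera_key] for idx in range(from_idx, to_idx)]
episode_video = torch.stack(frames)
print(episode_video.shape)
```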
@@ -17,7 +17,7 @@ dataset = LeRobotDataset(dataset_repo_id, episodes=[0])
 # This is equivalent to `dataset = LeRobotDataset(dataset_repo_id, image_transforms=None)`

 # Get the index of the first observation in the first episode
-first_idx = dataset.episode_data_index["from"][0].item()
+first_idx = dataset.meta.episodes["dataset_from_index"][0]

 # Get the frame corresponding to the first camera
 frame = dataset[first_idx][dataset.meta.camera_keys[0]]
@@ -51,6 +51,7 @@ from lerobot.common.datasets.utils import (
     get_features_from_robot,
     get_hf_dataset_size_in_mb,
     get_hf_features_from_features,
+    get_parquet_file_size_in_mb,
     get_parquet_num_frames,
     get_safe_version,
     get_video_duration_in_s,
@@ -59,15 +60,16 @@ from lerobot.common.datasets.utils import (
     load_episodes,
     load_info,
     load_nested_dataset,
+    load_stats,
     load_tasks,
     update_chunk_file_indices,
     validate_episode_buffer,
     validate_frame,
     write_info,
     write_json,
+    write_stats,
     write_tasks,
 )
-from lerobot.common.datasets.v30.convert_dataset_v21_to_v30 import get_parquet_file_size_in_mb
 from lerobot.common.datasets.video_utils import (
     VideoFrame,
     decode_video_frames_torchvision,
@@ -111,8 +113,7 @@ class LeRobotDatasetMetadata:
         check_version_compatibility(self.repo_id, self._version, CODEBASE_VERSION)
         self.tasks = load_tasks(self.root)
         self.episodes = load_episodes(self.root)
-        # TODO(rcadene): https://huggingface.slack.com/archives/C02V51Q3800/p1743517952388249?thread_ts=1742896075.499119&cid=C02V51Q3800
-        # self.stats = aggregate_stats(list(self.episodes_stats.values()))
+        self.stats = load_stats(self.root)

     def pull_from_repo(
         self,
@@ -272,10 +273,17 @@ class LeRobotDatasetMetadata:
             chunk_idx, file_idx = 0, 0
             df["meta/episodes/chunk_index"] = [chunk_idx]
             df["meta/episodes/file_index"] = [file_idx]
+            df["dataset_from_index"] = [0]
+            df["dataset_to_index"] = [len(df)]
         else:
             # Retrieve information from the latest parquet file
             latest_ep = self.episodes.with_format(
-                columns=["meta/episodes/chunk_index", "meta/episodes/file_index"]
+                columns=[
+                    "meta/episodes/chunk_index",
+                    "meta/episodes/file_index",
+                    "dataset_from_index",
+                    "dataset_to_index",
+                ]
             )[-1]
             chunk_idx, file_idx = (
                 latest_ep["meta/episodes/chunk_index"],
@@ -285,16 +293,18 @@ class LeRobotDatasetMetadata:
             latest_path = self.root / DEFAULT_EPISODES_PATH.format(chunk_index=chunk_idx, file_index=file_idx)
             latest_size_in_mb = get_parquet_file_size_in_mb(latest_path)

-            # Determine if a new parquet file is needed
             if latest_size_in_mb + ep_size_in_mb >= self.files_size_in_mb:
                 # Size limit is reached, prepare new parquet file
                 chunk_idx, file_idx = update_chunk_file_indices(chunk_idx, file_idx, self.meta.chunks_size)
-                df["meta/episodes/chunk_index"] = [chunk_idx]
-                df["meta/episodes/file_index"] = [file_idx]
-            else:
-                # Update the existing parquet file with new row
-                df["meta/episodes/chunk_index"] = [chunk_idx]
-                df["meta/episodes/file_index"] = [file_idx]
+
+            # Update the existing pandas dataframe with new row
+            df["meta/episodes/chunk_index"] = [chunk_idx]
+            df["meta/episodes/file_index"] = [file_idx]
+            df["dataset_from_index"] = [latest_ep["dataset_to_index"]]
+            df["dataset_to_index"] = [latest_ep["dataset_to_index"] + len(df)]

+            if latest_size_in_mb + ep_size_in_mb < self.files_size_in_mb:
+                # Size limit wasnt reached, concatenate latest dataframe with new one
                 latest_df = pd.read_parquet(latest_path)
                 df = pd.concat([latest_df, df], ignore_index=True)

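To make the size-based rotation above easier to follow, here is an illustrative, self-contained restatement of the decision rule. It is not the commit's code: `should_rotate` is a made-up name, the 500.0 MB budget mirrors `DEFAULT_FILE_SIZE_IN_MB`, and the roll-over behaviour and `chunks_size=1000` are assumptions about the real `update_chunk_file_indices` helper.

```python
def should_rotate(latest_size_in_mb: float, ep_size_in_mb: float, files_size_in_mb: float = 500.0) -> bool:
    # Same check as above: start a new parquet file once the current one would exceed the budget.
    return latest_size_in_mb + ep_size_in_mb >= files_size_in_mb


def update_chunk_file_indices(chunk_idx: int, file_idx: int, chunks_size: int) -> tuple[int, int]:
    # Assumed behaviour of the real helper: advance to the next file, rolling over
    # to a new chunk directory every `chunks_size` files.
    file_idx += 1
    if file_idx >= chunks_size:
        file_idx = 0
        chunk_idx += 1
    return chunk_idx, file_idx


# A 499.2 MB episodes file plus a 1.5 MB episode crosses the 500 MB budget,
# so the next episode's metadata goes to chunk-000/file-001.
chunk_idx, file_idx = 0, 0
if should_rotate(499.2, 1.5):
    chunk_idx, file_idx = update_chunk_file_indices(chunk_idx, file_idx, chunks_size=1000)
print(chunk_idx, file_idx)  # 0 1
```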
@@ -333,8 +343,8 @@ class LeRobotDatasetMetadata:
         self.update_video_info()
         write_info(self.info, self.root)

-        self.stats = aggregate_stats([self.stats, episode_stats]) if self.stats else episode_stats
-        # TODO: write stats
+        self.stats = aggregate_stats([self.stats, episode_stats]) if self.stats is not None else episode_stats
+        write_stats(self.stats, self.root)

     def update_video_info(self) -> None:
         """
@@ -401,8 +411,7 @@ class LeRobotDatasetMetadata:

         obj.tasks = None
         obj.episodes = None
-        # TODO(rcadene) stats
-        obj.stats = {}
+        obj.stats = None
         obj.info = create_empty_dataset_info(CODEBASE_VERSION, fps, robot_type, features, use_videos)
         if len(obj.video_keys) > 0 and not use_videos:
             raise ValueError()
@@ -337,13 +337,11 @@ def compute_sampler_weights(
     if len(offline_dataset) > 0:
         offline_data_mask_indices = []
         for start_index, end_index in zip(
-            offline_dataset.episode_data_index["from"],
-            offline_dataset.episode_data_index["to"],
+            offline_dataset.meta.episodes["dataset_from_index"],
+            offline_dataset.meta.episodes["dataset_to_index"],
             strict=True,
         ):
-            offline_data_mask_indices.extend(
-                range(start_index.item(), end_index.item() - offline_drop_n_last_frames)
-            )
+            offline_data_mask_indices.extend(range(start_index, end_index - offline_drop_n_last_frames))
         offline_data_mask = torch.zeros(len(offline_dataset), dtype=torch.bool)
         offline_data_mask[torch.tensor(offline_data_mask_indices)] = True
         weights.append(
@@ -21,7 +21,8 @@ import torch
 class EpisodeAwareSampler:
     def __init__(
         self,
-        episode_data_index: dict,
+        dataset_from_indices: list[int],
+        dataset_to_indices: list[int],
         episode_indices_to_use: Union[list, None] = None,
         drop_n_first_frames: int = 0,
         drop_n_last_frames: int = 0,
@@ -30,7 +31,8 @@ class EpisodeAwareSampler:
         """Sampler that optionally incorporates episode boundary information.

         Args:
-            episode_data_index: Dictionary with keys 'from' and 'to' containing the start and end indices of each episode.
+            dataset_from_indices: List of indices containing the start of each episode in the dataset.
+            dataset_to_indices: List of indices containing the end of each episode in the dataset.
             episode_indices_to_use: List of episode indices to use. If None, all episodes are used.
                 Assumes that episodes are indexed from 0 to N-1.
             drop_n_first_frames: Number of frames to drop from the start of each episode.
@@ -39,12 +41,10 @@ class EpisodeAwareSampler:
         """
         indices = []
         for episode_idx, (start_index, end_index) in enumerate(
-            zip(episode_data_index["from"], episode_data_index["to"], strict=True)
+            zip(dataset_from_indices, dataset_to_indices, strict=True)
         ):
             if episode_indices_to_use is None or episode_idx in episode_indices_to_use:
-                indices.extend(
-                    range(start_index.item() + drop_n_first_frames, end_index.item() - drop_n_last_frames)
-                )
+                indices.extend(range(start_index + drop_n_first_frames, end_index - drop_n_last_frames))

         self.indices = indices
         self.shuffle = shuffle
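A minimal usage sketch of the updated constructor, mirroring the call sites changed elsewhere in this commit; the repo id and the `drop_n_last_frames` value are placeholders, and the import path is assumed to match the current codebase layout.

```python
import torch

from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
from lerobot.common.datasets.sampler import EpisodeAwareSampler

dataset = LeRobotDataset("lerobot/pusht")

# The sampler now receives the per-episode start and end frame indices directly,
# instead of the old episode_data_index dict.
sampler = EpisodeAwareSampler(
    dataset.meta.episodes["dataset_from_index"],
    dataset.meta.episodes["dataset_to_index"],
    drop_n_last_frames=7,
    shuffle=True,
)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=32, sampler=sampler)
```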
@@ -21,7 +21,6 @@ import shutil
 import subprocess
 import tempfile
 from collections.abc import Iterator
-from itertools import accumulate
 from pathlib import Path
 from pprint import pformat
 from types import SimpleNamespace
@@ -56,23 +55,23 @@ DEFAULT_FILE_SIZE_IN_MB = 500.0 # Max size per file

 # Keep legacy for `convert_dataset_v21_to_v30.py`
 LEGACY_EPISODES_PATH = "meta/episodes.jsonl"
-LEGACY_STATS_PATH = "meta/stats.json"
 LEGACY_EPISODES_STATS_PATH = "meta/episodes_stats.jsonl"
 LEGACY_TASKS_PATH = "meta/tasks.jsonl"
 LEGACY_DEFAULT_VIDEO_PATH = "videos/chunk-{episode_chunk:03d}/{video_key}/episode_{episode_index:06d}.mp4"
 LEGACY_DEFAULT_PARQUET_PATH = "data/chunk-{episode_chunk:03d}/episode_{episode_index:06d}.parquet"

-# TODO
-DEFAULT_IMAGE_PATH = "images/{image_key}/episode_{episode_index:06d}/frame_{frame_index:06d}.png"
+DEFAULT_IMAGE_PATH = "images/{image_key}/episode-{episode_index:06d}/frame-{frame_index:06d}.png"
+INFO_PATH = "meta/info.json"
+STATS_PATH = "meta/stats.json"

 EPISODES_DIR = "meta/episodes"
 DATA_DIR = "data"
 VIDEO_DIR = "videos"

-INFO_PATH = "meta/info.json"
 CHUNK_FILE_PATTERN = "chunk-{chunk_index:03d}/file-{file_index:03d}"
-DEFAULT_EPISODES_PATH = EPISODES_DIR + "/" + CHUNK_FILE_PATTERN + ".parquet"
 DEFAULT_TASKS_PATH = "meta/tasks.parquet"
+DEFAULT_EPISODES_PATH = EPISODES_DIR + "/" + CHUNK_FILE_PATTERN + ".parquet"
 DEFAULT_DATA_PATH = DATA_DIR + "/" + CHUNK_FILE_PATTERN + ".parquet"
 DEFAULT_VIDEO_PATH = VIDEO_DIR + "/{video_key}/" + CHUNK_FILE_PATTERN + ".mp4"

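For reference, a small sketch (not part of the diff) of what the chunked path templates above expand to; the camera key is taken from the documentation example earlier in this commit.

```python
CHUNK_FILE_PATTERN = "chunk-{chunk_index:03d}/file-{file_index:03d}"
DEFAULT_EPISODES_PATH = "meta/episodes" + "/" + CHUNK_FILE_PATTERN + ".parquet"
DEFAULT_DATA_PATH = "data" + "/" + CHUNK_FILE_PATTERN + ".parquet"
DEFAULT_VIDEO_PATH = "videos" + "/{video_key}/" + CHUNK_FILE_PATTERN + ".mp4"

print(DEFAULT_DATA_PATH.format(chunk_index=0, file_index=0))
# data/chunk-000/file-000.parquet
print(DEFAULT_EPISODES_PATH.format(chunk_index=0, file_index=1))
# meta/episodes/chunk-000/file-001.parquet
print(DEFAULT_VIDEO_PATH.format(video_key="observation.images.cam_high", chunk_index=2, file_index=3))
# videos/observation.images.cam_high/chunk-002/file-003.mp4
```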
@@ -95,6 +94,12 @@ DEFAULT_FEATURES = {
 }


+def get_parquet_file_size_in_mb(parquet_path):
+    metadata = pq.read_metadata(parquet_path)
+    uncompressed_size = metadata.num_rows * metadata.row_group(0).total_byte_size
+    return uncompressed_size / (1024**2)
+
+
 def get_hf_dataset_size_in_mb(hf_ds: Dataset) -> int:
     return hf_ds.data.nbytes / (1024**2)

@@ -317,7 +322,7 @@ def load_info(local_dir: Path) -> dict:

 def write_stats(stats: dict, local_dir: Path):
     serialized_stats = serialize_dict(stats)
-    write_json(serialized_stats, local_dir / LEGACY_STATS_PATH)
+    write_json(serialized_stats, local_dir / STATS_PATH)


 def cast_stats_to_numpy(stats) -> dict[str, dict[str, np.ndarray]]:
@@ -326,9 +331,9 @@ def cast_stats_to_numpy(stats) -> dict[str, dict[str, np.ndarray]]:


 def load_stats(local_dir: Path) -> dict[str, dict[str, np.ndarray]]:
-    if not (local_dir / LEGACY_STATS_PATH).exists():
+    if not (local_dir / STATS_PATH).exists():
         return None
-    stats = load_json(local_dir / LEGACY_STATS_PATH)
+    stats = load_json(local_dir / STATS_PATH)
     return cast_stats_to_numpy(stats)


@@ -375,13 +380,6 @@ def write_episodes(episodes: Dataset, local_dir: Path):
     if get_hf_dataset_size_in_mb(episodes) > DEFAULT_FILE_SIZE_IN_MB:
         raise NotImplementedError("Contact a maintainer.")

-    def add_chunk_file_indices(row):
-        row["chunk_index"] = 0
-        row["file_index"] = 0
-        return row
-
-    episodes = episodes.map(add_chunk_file_indices)
-
     fpath = local_dir / DEFAULT_EPISODES_PATH.format(chunk_index=0, file_index=0)
     fpath.parent.mkdir(parents=True, exist_ok=True)
     episodes.to_parquet(fpath)
@@ -642,20 +640,6 @@ def create_empty_dataset_info(
 }


-def get_episode_data_index(
-    episode_dicts: dict[dict], episodes: list[int] | None = None
-) -> dict[str, torch.Tensor]:
-    episode_lengths = {ep_idx: ep_dict["length"] for ep_idx, ep_dict in episode_dicts.items()}
-    if episodes is not None:
-        episode_lengths = {ep_idx: episode_lengths[ep_idx] for ep_idx in episodes}
-
-    cumulative_lengths = list(accumulate(episode_lengths.values()))
-    return {
-        "from": torch.LongTensor([0] + cumulative_lengths[:-1]),
-        "to": torch.LongTensor(cumulative_lengths),
-    }
-
-
 def check_timestamps_sync(
     timestamps: np.ndarray,
     episode_indices: np.ndarray,
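The removed helper is redundant because the episode boundaries are now stored directly in `meta/episodes` as `dataset_from_index` and `dataset_to_index`. For completeness, a hedged sketch of how the same boundaries can still be derived from episode lengths (the toy values match the test file deleted at the end of this commit):

```python
from itertools import accumulate

import torch


def episode_boundaries_from_lengths(episode_lengths: list[int]) -> dict[str, torch.Tensor]:
    # Equivalent of the removed get_episode_data_index, for a plain list of lengths.
    cumulative_lengths = list(accumulate(episode_lengths))
    return {
        "from": torch.LongTensor([0] + cumulative_lengths[:-1]),
        "to": torch.LongTensor(cumulative_lengths),
    }


boundaries = episode_boundaries_from_lengths([2, 1, 3])
assert boundaries["from"].tolist() == [0, 2, 3]
assert boundaries["to"].tolist() == [2, 3, 6]
```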
@@ -123,10 +123,10 @@ from lerobot.common.datasets.utils import (
     DEFAULT_CHUNK_SIZE,
     DEFAULT_DATA_PATH,
     DEFAULT_VIDEO_PATH,
-    LEGACY_EPISODES_PATH,
     INFO_PATH,
-    LEGACY_STATS_PATH,
+    LEGACY_EPISODES_PATH,
     LEGACY_TASKS_PATH,
+    STATS_PATH,
     create_branch,
     create_lerobot_dataset_card,
     flatten_dict,
@@ -188,7 +188,7 @@ def convert_stats_to_json(v1_dir: Path, v2_dir: Path) -> None:
     serialized_stats = {key: value.tolist() for key, value in stats.items()}
     serialized_stats = unflatten_dict(serialized_stats)

-    json_path = v2_dir / LEGACY_STATS_PATH
+    json_path = v2_dir / STATS_PATH
     json_path.parent.mkdir(exist_ok=True, parents=True)
     with open(json_path, "w") as f:
         json.dump(serialized_stats, f, indent=4)
@@ -296,9 +296,7 @@ def split_parquet_by_episodes(
         for ep_idx in range(ep_chunk_start, ep_chunk_end):
             ep_table = table.filter(pc.equal(table["episode_index"], ep_idx))
             episode_lengths.insert(ep_idx, len(ep_table))
-            output_file = output_dir / DEFAULT_DATA_PATH.format(
-                episode_chunk=ep_chunk, episode_index=ep_idx
-            )
+            output_file = output_dir / DEFAULT_DATA_PATH.format(episode_chunk=ep_chunk, episode_index=ep_idx)
             pq.write_table(ep_table, output_file)

     return episode_lengths
@@ -23,7 +23,7 @@ import logging
 from huggingface_hub import HfApi

 from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset
-from lerobot.common.datasets.utils import LEGACY_EPISODES_STATS_PATH, LEGACY_STATS_PATH, load_stats, write_info
+from lerobot.common.datasets.utils import LEGACY_EPISODES_STATS_PATH, STATS_PATH, load_stats, write_info
 from lerobot.common.datasets.v21.convert_stats import check_aggregate_stats, convert_stats

 V20 = "v2.0"
@@ -60,15 +60,15 @@ def convert_dataset(
     dataset.push_to_hub(branch=branch, tag_version=False, allow_patterns="meta/")

     # delete old stats.json file
-    if (dataset.root / LEGACY_STATS_PATH).is_file:
-        (dataset.root / LEGACY_STATS_PATH).unlink()
+    if (dataset.root / STATS_PATH).is_file:
+        (dataset.root / STATS_PATH).unlink()

     hub_api = HfApi()
     if hub_api.file_exists(
-        repo_id=dataset.repo_id, filename=LEGACY_STATS_PATH, revision=branch, repo_type="dataset"
+        repo_id=dataset.repo_id, filename=STATS_PATH, revision=branch, repo_type="dataset"
     ):
         hub_api.delete_file(
-            path_in_repo=LEGACY_STATS_PATH, repo_id=dataset.repo_id, revision=branch, repo_type="dataset"
+            path_in_repo=STATS_PATH, repo_id=dataset.repo_id, revision=branch, repo_type="dataset"
         )

     hub_api.create_tag(repo_id, tag=CODEBASE_VERSION, revision=branch, repo_type="dataset")
@@ -18,15 +18,16 @@ python lerobot/common/datasets/v30/convert_dataset_v21_to_v30.py \
 """

 import argparse
+import shutil
 from pathlib import Path

 import pandas as pd
-import pyarrow.parquet as pq
 import tqdm
 from datasets import Dataset
-from huggingface_hub import snapshot_download
+from huggingface_hub import HfApi, snapshot_download

 from lerobot.common.constants import HF_LEROBOT_HOME
+from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION, LeRobotDataset
 from lerobot.common.datasets.utils import (
     DEFAULT_CHUNK_SIZE,
     DEFAULT_DATA_PATH,
@@ -34,6 +35,7 @@ from lerobot.common.datasets.utils import (
     DEFAULT_VIDEO_PATH,
     concat_video_files,
     flatten_dict,
+    get_parquet_file_size_in_mb,
     get_parquet_num_frames,
     get_video_duration_in_s,
     get_video_size_in_mb,
@@ -93,12 +95,6 @@ meta/info.json
 """


-def get_parquet_file_size_in_mb(parquet_path):
-    metadata = pq.read_metadata(parquet_path)
-    uncompressed_size = metadata.num_rows * metadata.row_group(0).total_byte_size
-    return uncompressed_size / (1024**2)
-
-
 # def generate_flat_ep_stats(episodes_stats):
 # for ep_idx, ep_stats in episodes_stats.items():
 # flat_ep_stats = flatten_dict(ep_stats)
@@ -148,8 +144,8 @@ def convert_data(root, new_root):
             "episode_index": ep_idx,
             "data/chunk_index": chunk_idx,
             "data/file_index": file_idx,
-            "data/from_index": num_frames,
-            "data/to_index": num_frames + ep_num_frames,
+            "dataset_from_index": num_frames,
+            "dataset_to_index": num_frames + ep_num_frames,
         }
         size_in_mb += ep_size_in_mb
         num_frames += ep_num_frames
@@ -337,6 +333,9 @@ def convert_dataset(
     root = HF_LEROBOT_HOME / repo_id
     new_root = HF_LEROBOT_HOME / f"{repo_id}_v30"

+    if new_root.is_dir():
+        shutil.rmtree(new_root)
+
     snapshot_download(
         repo_id,
         repo_type="dataset",
@@ -350,6 +349,24 @@ def convert_dataset(
     episodes_videos_metadata = convert_videos(root, new_root)
     convert_episodes_metadata(root, new_root, episodes_metadata, episodes_videos_metadata)

+    shutil.move(str(root), str(root) + "_old")
+    shutil.move(str(new_root), str(root))
+
+    # TODO(racdene)
+    if False:
+        hub_api = HfApi()
+        hub_api.delete_tag(repo_id, tag=CODEBASE_VERSION, revision=branch, repo_type="dataset")
+        hub_api.delete_files(
+            delete_patterns=["data/chunk*/episode_*", "meta/*.jsonl", "videos/chunk*"],
+            repo_id=repo_id,
+            revision=branch,
+            repo_type="dataset",
+        )
+
+        hub_api.create_tag(repo_id, tag=CODEBASE_VERSION, revision=branch, repo_type="dataset")
+
+    LeRobotDataset(repo_id).push_to_hub()
+

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -167,7 +167,8 @@ def train(cfg: TrainPipelineConfig):
     if hasattr(cfg.policy, "drop_n_last_frames"):
         shuffle = False
         sampler = EpisodeAwareSampler(
-            dataset.episode_data_index,
+            dataset.meta.episodes["dataset_from_index"],
+            dataset.meta.episodes["dataset_to_index"],
             drop_n_last_frames=cfg.policy.drop_n_last_frames,
             shuffle=True,
         )
@@ -79,8 +79,8 @@ from lerobot.common.datasets.lerobot_dataset import LeRobotDataset

 class EpisodeSampler(torch.utils.data.Sampler):
     def __init__(self, dataset: LeRobotDataset, episode_index: int):
-        from_idx = dataset.episode_data_index["from"][episode_index].item()
-        to_idx = dataset.episode_data_index["to"][episode_index].item()
+        from_idx = dataset.meta.episodes["dataset_from_index"][episode_index].item()
+        to_idx = dataset.meta.episodes["dataset_to_index"][episode_index].item()
         self.frame_ids = range(from_idx, to_idx)

     def __iter__(self) -> Iterator:
@@ -259,8 +259,8 @@ def get_episode_data(dataset: LeRobotDataset | IterableNamespace, episode_index)
     selected_columns.insert(0, "timestamp")

     if isinstance(dataset, LeRobotDataset):
-        from_idx = dataset.episode_data_index["from"][episode_index]
-        to_idx = dataset.episode_data_index["to"][episode_index]
+        from_idx = dataset.meta.episodes["dataset_from_index"][episode_index]
+        to_idx = dataset.meta.episodes["dataset_to_index"][episode_index]
         data = (
             dataset.hf_dataset.select(range(from_idx, to_idx))
             .select_columns(selected_columns)
@@ -296,7 +296,7 @@ def get_episode_data(dataset: LeRobotDataset | IterableNamespace, episode_index)

 def get_episode_video_paths(dataset: LeRobotDataset, ep_index: int) -> list[str]:
     # get first frame of episode (hack to get video_path of the episode)
-    first_frame_idx = dataset.episode_data_index["from"][ep_index].item()
+    first_frame_idx = dataset.meta.episodes["dataset_from_index"][ep_index]
     return [
         dataset.hf_dataset.select_columns(key)[first_frame_idx][key]["path"]
         for key in dataset.meta.video_keys
@@ -309,7 +309,7 @@ def get_episode_language_instruction(dataset: LeRobotDataset, ep_index: int) ->
         return None

     # get first frame index
-    first_frame_idx = dataset.episode_data_index["from"][ep_index].item()
+    first_frame_idx = dataset.meta.episodes["dataset_from_index"][ep_index]

     language_instruction = dataset.hf_dataset[first_frame_idx]["language_instruction"]
     # TODO (michel-aractingi) hack to get the sentence, some strings in openx are badly stored
@@ -230,6 +230,8 @@ def episodes_factory(tasks_factory, stats_factory):
             "meta/episodes/file_index": [],
             "data/chunk_index": [],
             "data/file_index": [],
+            "dataset_from_index": [],
+            "dataset_to_index": [],
             "tasks": [],
             "length": [],
         }
@@ -241,6 +243,7 @@ def episodes_factory(tasks_factory, stats_factory):
         for stats_key in flatten_dict({"stats": stats_factory(features)}):
             d[stats_key] = []

+        num_frames = 0
         remaining_tasks = list(tasks.index)
         for ep_idx in range(total_episodes):
             num_tasks_in_episode = random.randint(1, min(3, num_tasks_available)) if multi_task else 1
@@ -256,6 +259,8 @@ def episodes_factory(tasks_factory, stats_factory):
             d["meta/episodes/file_index"].append(0)
             d["data/chunk_index"].append(0)
             d["data/file_index"].append(0)
+            d["dataset_from_index"].append(num_frames)
+            d["dataset_to_index"].append(num_frames + lengths[ep_idx])
             d["tasks"].append(episode_tasks)
             d["length"].append(lengths[ep_idx])

@@ -268,6 +273,8 @@ def episodes_factory(tasks_factory, stats_factory):
             for stats_key, stats in flatten_dict({"stats": stats_factory(features)}).items():
                 d[stats_key].append(stats)

+            num_frames += lengths[ep_idx]
+
         return Dataset.from_dict(d)

     return _create_episodes
@@ -283,10 +290,10 @@ def hf_dataset_factory(features_factory, tasks_factory, episodes_factory, img_ar
     ) -> datasets.Dataset:
         if tasks is None:
             tasks = tasks_factory()
-        if episodes is None:
-            episodes = episodes_factory()
         if features is None:
             features = features_factory()
+        if episodes is None:
+            episodes = episodes_factory(features)

         timestamp_col = np.array([], dtype=np.float32)
         frame_index_col = np.array([], dtype=np.int64)
@@ -10,7 +10,7 @@ from lerobot.common.datasets.utils import (
     DEFAULT_EPISODES_PATH,
     DEFAULT_TASKS_PATH,
     INFO_PATH,
-    LEGACY_STATS_PATH,
+    STATS_PATH,
 )
 from tests.fixtures.constants import LEROBOT_TEST_DIR

@@ -70,7 +70,7 @@ def mock_snapshot_download_factory(
     # List all possible files
     all_files = [
         INFO_PATH,
-        LEGACY_STATS_PATH,
+        STATS_PATH,
         # TODO(rcadene): remove naive chunk 0 file 0 ?
         DEFAULT_TASKS_PATH.format(chunk_index=0, file_index=0),
         DEFAULT_EPISODES_PATH.format(chunk_index=0, file_index=0),
@@ -47,17 +47,23 @@ def save_dataset_to_safetensors(output_dir, repo_id="lerobot/pusht"):
     )

     # save 2 first frames of first episode
-    i = dataset.episode_data_index["from"][0].item()
+    i = dataset.meta.episodes["dataset_from_index"][0].item()
     save_file(dataset[i], repo_dir / f"frame_{i}.safetensors")
     save_file(dataset[i + 1], repo_dir / f"frame_{i + 1}.safetensors")

     # save 2 frames at the middle of first episode
-    i = int((dataset.episode_data_index["to"][0].item() - dataset.episode_data_index["from"][0].item()) / 2)
+    i = int(
+        (
+            dataset.meta.episodes["dataset_to_index"][0].item()
+            - dataset.meta.episodes["dataset_from_index"][0].item()
+        )
+        / 2
+    )
     save_file(dataset[i], repo_dir / f"frame_{i}.safetensors")
     save_file(dataset[i + 1], repo_dir / f"frame_{i + 1}.safetensors")

     # save 2 last frames of first episode
-    i = dataset.episode_data_index["to"][0].item()
+    i = dataset.meta.episodes["dataset_to_index"][0].item()
     save_file(dataset[i - 2], repo_dir / f"frame_{i - 2}.safetensors")
     save_file(dataset[i - 1], repo_dir / f"frame_{i - 1}.safetensors")

@@ -65,17 +71,17 @@ def save_dataset_to_safetensors(output_dir, repo_id="lerobot/pusht"):
     # We currently cant because our test dataset only contains the first episode

     # # save 2 first frames of second episode
-    # i = dataset.episode_data_index["from"][1].item()
+    # i = dataset.meta.episodes["dataset_from_index"][1].item()
     # save_file(dataset[i], repo_dir / f"frame_{i}.safetensors")
     # save_file(dataset[i + 1], repo_dir / f"frame_{i+1}.safetensors")

     # # save 2 last frames of second episode
-    # i = dataset.episode_data_index["to"][1].item()
+    # i = dataset.meta.episodes["dataset_to_index"][1].item()
     # save_file(dataset[i - 2], repo_dir / f"frame_{i-2}.safetensors")
     # save_file(dataset[i - 1], repo_dir / f"frame_{i-1}.safetensors")

     # # save 2 last frames of last episode
-    # i = dataset.episode_data_index["to"][-1].item()
+    # i = dataset.meta.episodes["dataset_to_index"][-1].item()
     # save_file(dataset[i - 2], repo_dir / f"frame_{i-2}.safetensors")
     # save_file(dataset[i - 1], repo_dir / f"frame_{i-1}.safetensors")

@@ -507,17 +507,23 @@ def test_backward_compatibility(repo_id):
     )

     # test2 first frames of first episode
-    i = dataset.episode_data_index["from"][0].item()
+    i = dataset.meta.episodes["dataset_from_index"][0].item()
     load_and_compare(i)
     load_and_compare(i + 1)

     # test 2 frames at the middle of first episode
-    i = int((dataset.episode_data_index["to"][0].item() - dataset.episode_data_index["from"][0].item()) / 2)
+    i = int(
+        (
+            dataset.meta.episodes["dataset_to_index"][0].item()
+            - dataset.meta.episodes["dataset_from_index"][0].item()
+        )
+        / 2
+    )
     load_and_compare(i)
     load_and_compare(i + 1)

     # test 2 last frames of first episode
-    i = dataset.episode_data_index["to"][0].item()
+    i = dataset.meta.episodes["dataset_to_index"][0].item()
     load_and_compare(i - 2)
     load_and_compare(i - 1)

@@ -525,17 +531,17 @@
     # We currently cant because our test dataset only contains the first episode

     # # test 2 first frames of second episode
-    # i = dataset.episode_data_index["from"][1].item()
+    # i = dataset.meta.episodes["dataset_from_index"][1].item()
     # load_and_compare(i)
     # load_and_compare(i + 1)

     # # test 2 last frames of second episode
-    # i = dataset.episode_data_index["to"][1].item()
+    # i = dataset.meta.episodes["dataset_to_index"][1].item()
     # load_and_compare(i - 2)
     # load_and_compare(i - 1)

     # # test 2 last frames of last episode
-    # i = dataset.episode_data_index["to"][-1].item()
+    # i = dataset.meta.episodes["dataset_to_index"][-1].item()
     # load_and_compare(i - 2)
     # load_and_compare(i - 1)

@@ -43,8 +43,8 @@ def calculate_episode_data_index(hf_dataset: datasets.Dataset) -> dict[str, np.n
 def synced_timestamps_factory(hf_dataset_factory):
     def _create_synced_timestamps(fps: int = 30) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
         hf_dataset = hf_dataset_factory(fps=fps)
-        timestamps = torch.stack(hf_dataset["timestamp"]).numpy()
-        episode_indices = torch.stack(hf_dataset["episode_index"]).numpy()
+        timestamps = hf_dataset["timestamp"].numpy()
+        episode_indices = hf_dataset["episode_index"].numpy()
         episode_data_index = calculate_episode_data_index(hf_dataset)
         return timestamps, episode_indices, episode_data_index

@@ -68,7 +68,11 @@ def dummy_dataset_metadata(lerobot_dataset_metadata_factory, info_factory, tmp_p
         },
     }
     info = info_factory(
-        total_episodes=1, total_frames=1, camera_features=camera_features, motor_features=motor_features
+        total_episodes=1,
+        total_frames=1,
+        total_tasks=1,
+        camera_features=camera_features,
+        motor_features=motor_features,
     )
     ds_meta = lerobot_dataset_metadata_factory(root=tmp_path / "init", info=info)
     return ds_meta
@@ -32,7 +32,7 @@ def test_drop_n_first_frames():
     )
     dataset.set_transform(hf_transform_to_torch)
    episode_data_index = calculate_episode_data_index(dataset)
-    sampler = EpisodeAwareSampler(episode_data_index, drop_n_first_frames=1)
+    sampler = EpisodeAwareSampler(episode_data_index["from"], episode_data_index["to"], drop_n_first_frames=1)
     assert sampler.indices == [1, 4, 5]
     assert len(sampler) == 3
     assert list(sampler) == [1, 4, 5]
@@ -48,7 +48,7 @@ def test_drop_n_last_frames():
     )
     dataset.set_transform(hf_transform_to_torch)
     episode_data_index = calculate_episode_data_index(dataset)
-    sampler = EpisodeAwareSampler(episode_data_index, drop_n_last_frames=1)
+    sampler = EpisodeAwareSampler(episode_data_index["from"], episode_data_index["to"], drop_n_last_frames=1)
     assert sampler.indices == [0, 3, 4]
     assert len(sampler) == 3
     assert list(sampler) == [0, 3, 4]
@@ -64,7 +64,9 @@ def test_episode_indices_to_use():
     )
     dataset.set_transform(hf_transform_to_torch)
     episode_data_index = calculate_episode_data_index(dataset)
-    sampler = EpisodeAwareSampler(episode_data_index, episode_indices_to_use=[0, 2])
+    sampler = EpisodeAwareSampler(
+        episode_data_index["from"], episode_data_index["to"], episode_indices_to_use=[0, 2]
+    )
     assert sampler.indices == [0, 1, 3, 4, 5]
     assert len(sampler) == 5
     assert list(sampler) == [0, 1, 3, 4, 5]
@@ -80,11 +82,11 @@ def test_shuffle():
     )
     dataset.set_transform(hf_transform_to_torch)
     episode_data_index = calculate_episode_data_index(dataset)
-    sampler = EpisodeAwareSampler(episode_data_index, shuffle=False)
+    sampler = EpisodeAwareSampler(episode_data_index["from"], episode_data_index["to"], shuffle=False)
     assert sampler.indices == [0, 1, 2, 3, 4, 5]
     assert len(sampler) == 6
     assert list(sampler) == [0, 1, 2, 3, 4, 5]
-    sampler = EpisodeAwareSampler(episode_data_index, shuffle=True)
+    sampler = EpisodeAwareSampler(episode_data_index["from"], episode_data_index["to"], shuffle=True)
     assert sampler.indices == [0, 1, 2, 3, 4, 5]
     assert len(sampler) == 6
     assert set(sampler) == {0, 1, 2, 3, 4, 5}
@@ -1,21 +0,0 @@
-import torch
-from datasets import Dataset
-
-from lerobot.common.datasets.push_dataset_to_hub.utils import calculate_episode_data_index
-from lerobot.common.datasets.utils import (
-    hf_transform_to_torch,
-)
-
-
-def test_calculate_episode_data_index():
-    dataset = Dataset.from_dict(
-        {
-            "timestamp": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
-            "index": [0, 1, 2, 3, 4, 5],
-            "episode_index": [0, 0, 1, 2, 2, 2],
-        },
-    )
-    dataset.set_transform(hf_transform_to_torch)
-    episode_data_index = calculate_episode_data_index(dataset)
-    assert torch.equal(episode_data_index["from"], torch.tensor([0, 2, 3]))
-    assert torch.equal(episode_data_index["to"], torch.tensor([2, 3, 6]))