diff --git a/download_and_upload_dataset.py b/download_and_upload_dataset.py
index d0d35771..6061a450 100644
--- a/download_and_upload_dataset.py
+++ b/download_and_upload_dataset.py
@@ -4,6 +4,7 @@ useless dependencies when using datasets.
 """

 import io
+import json
 import pickle
 import shutil
 from pathlib import Path
@@ -14,16 +15,20 @@ import numpy as np
 import torch
 import tqdm
 from datasets import Dataset, Features, Image, Sequence, Value
+from huggingface_hub import HfApi
 from PIL import Image as PILImage
+from safetensors.numpy import save_file
+
+from lerobot.common.datasets.utils import compute_stats


-def download_and_upload(root, root_tests, dataset_id):
+def download_and_upload(root, revision, dataset_id):
     if "pusht" in dataset_id:
-        download_and_upload_pusht(root, root_tests, dataset_id)
+        download_and_upload_pusht(root, revision, dataset_id)
     elif "xarm" in dataset_id:
-        download_and_upload_xarm(root, root_tests, dataset_id)
+        download_and_upload_xarm(root, revision, dataset_id)
     elif "aloha" in dataset_id:
-        download_and_upload_aloha(root, root_tests, dataset_id)
+        download_and_upload_aloha(root, revision, dataset_id)
     else:
         raise ValueError(dataset_id)

@@ -56,7 +61,88 @@ def download_and_extract_zip(url: str, destination_folder: Path) -> bool:
         return False


-def download_and_upload_pusht(root, root_tests, dataset_id="pusht", fps=10):
+def concatenate_episodes(ep_dicts):
+    data_dict = {}
+
+    keys = ep_dicts[0].keys()
+    for key in keys:
+        if torch.is_tensor(ep_dicts[0][key][0]):
+            data_dict[key] = torch.cat([ep_dict[key] for ep_dict in ep_dicts])
+        else:
+            if key not in data_dict:
+                data_dict[key] = []
+            for ep_dict in ep_dicts:
+                for x in ep_dict[key]:
+                    data_dict[key].append(x)
+
+    total_frames = data_dict["frame_id"].shape[0]
+    data_dict["index"] = torch.arange(0, total_frames, 1)
+    return data_dict
+
+
+def push_to_hub(hf_dataset, episode_data_index, info, root, revision, dataset_id):
+    hf_dataset = hf_dataset.with_format("torch")
+
+    # push to main to indicate latest version
+    hf_dataset.push_to_hub(f"lerobot/{dataset_id}", token=True)
+
+    # push to version branch
+    hf_dataset.push_to_hub(f"lerobot/{dataset_id}", token=True, revision=revision)
+
+    # load stats if they have been precomputed, otherwise compute and cache them
+    stats_pth_path = root / dataset_id / "stats.pth"
+    if stats_pth_path.exists():
+        stats = torch.load(stats_pth_path)
+    else:
+        stats = compute_stats(hf_dataset)
+        torch.save(stats, stats_pth_path)
+
+    # create and store meta_data
+    meta_data_dir = root / dataset_id / "train" / "meta_data"
+    meta_data_dir.mkdir(parents=True, exist_ok=True)
+
+    api = HfApi()
+
+    # info
+    info_path = meta_data_dir / "info.json"
+    with open(str(info_path), "w") as f:
+        json.dump(info, f, indent=4)
+    api.upload_file(
+        path_or_fileobj=info_path,
+        path_in_repo=str(info_path).replace(f"{root}/{dataset_id}", ""),
+        repo_id=f"lerobot/{dataset_id}",
+        repo_type="dataset",
+    )
+
+    # stats: one safetensors file per key (torch tensors converted to numpy)
+    for key in stats:
+        stats_path = meta_data_dir / f"stats_{key}.safetensors"
+        save_file({name: value.numpy() for name, value in stats[key].items()}, stats_path)
+        api.upload_file(
+            path_or_fileobj=stats_path,
+            path_in_repo=str(stats_path).replace(f"{root}/{dataset_id}", ""),
+            repo_id=f"lerobot/{dataset_id}",
+            repo_type="dataset",
+        )
+
+    # episode_data_index
+    episode_data_index = {key: np.array(episode_data_index[key]) for key in episode_data_index}
+    ep_data_idx_path = meta_data_dir / "episode_data_index.safetensors"
+    save_file(episode_data_index, ep_data_idx_path)
+    api.upload_file(
+        path_or_fileobj=ep_data_idx_path,
+        path_in_repo=str(ep_data_idx_path).replace(f"{root}/{dataset_id}", ""),
repo_id=f"lerobot/{dataset_id}", + repo_type="dataset", + ) + + # copy in tests folder, the first episode and the meta_data directory + num_items_first_ep = episode_data_index["to"][0] - episode_data_index["from"][0] + hf_dataset.select(range(num_items_first_ep)).save_to_disk(f"tests/data/{dataset_id}/train") + shutil.copytree(meta_data_dir, f"tests/{meta_data_dir}") + + +def download_and_upload_pusht(root, revision, dataset_id="pusht", fps=10): try: import pymunk from gym_pusht.envs.pusht import PushTEnv, pymunk_to_shapely @@ -99,6 +185,7 @@ def download_and_upload_pusht(root, root_tests, dataset_id="pusht", fps=10): actions = torch.from_numpy(dataset_dict["action"]) ep_dicts = [] + episode_data_index = {"from": [], "to": []} id_from = 0 for episode_id in tqdm.tqdm(range(num_episodes)): @@ -160,28 +247,15 @@ def download_and_upload_pusht(root, root_tests, dataset_id="pusht", fps=10): "next.reward": torch.cat([reward[1:], reward[[-1]]]), "next.done": torch.cat([done[1:], done[[-1]]]), "next.success": torch.cat([success[1:], success[[-1]]]), - "episode_data_index_from": torch.tensor([id_from] * num_frames), - "episode_data_index_to": torch.tensor([id_from + num_frames] * num_frames), } ep_dicts.append(ep_dict) + episode_data_index["from"].append(id_from) + episode_data_index["to"].append(id_from + num_frames) + id_from += num_frames - data_dict = {} - - keys = ep_dicts[0].keys() - for key in keys: - if torch.is_tensor(ep_dicts[0][key][0]): - data_dict[key] = torch.cat([ep_dict[key] for ep_dict in ep_dicts]) - else: - if key not in data_dict: - data_dict[key] = [] - for ep_dict in ep_dicts: - for x in ep_dict[key]: - data_dict[key].append(x) - - total_frames = id_from - data_dict["index"] = torch.arange(0, total_frames, 1) + data_dict = concatenate_episodes(ep_dicts) features = { "observation.image": Image(), @@ -196,20 +270,17 @@ def download_and_upload_pusht(root, root_tests, dataset_id="pusht", fps=10): "next.done": Value(dtype="bool", id=None), "next.success": Value(dtype="bool", id=None), "index": Value(dtype="int64", id=None), - "episode_data_index_from": Value(dtype="int64", id=None), - "episode_data_index_to": Value(dtype="int64", id=None), } features = Features(features) hf_dataset = Dataset.from_dict(data_dict, features=features) - hf_dataset = hf_dataset.with_format("torch") - num_items_first_ep = ep_dicts[0]["frame_id"].shape[0] - hf_dataset.select(range(num_items_first_ep)).save_to_disk(f"{root_tests}/{dataset_id}/train") - hf_dataset.push_to_hub(f"lerobot/{dataset_id}", token=True) - hf_dataset.push_to_hub(f"lerobot/{dataset_id}", token=True, revision="v1.0") + info = { + "fps": fps, + } + push_to_hub(hf_dataset, episode_data_index, info, root, revision, dataset_id) -def download_and_upload_xarm(root, root_tests, dataset_id, fps=15): +def download_and_upload_xarm(root, revision, dataset_id, fps=15): root = Path(root) raw_dir = root / f"{dataset_id}_raw" if not raw_dir.exists(): @@ -234,13 +305,13 @@ def download_and_upload_xarm(root, root_tests, dataset_id, fps=15): with open(dataset_path, "rb") as f: dataset_dict = pickle.load(f) - total_frames = dataset_dict["actions"].shape[0] - ep_dicts = [] + episode_data_index = {"from": [], "to": []} id_from = 0 id_to = 0 episode_id = 0 + total_frames = dataset_dict["actions"].shape[0] for i in tqdm.tqdm(range(total_frames)): id_to += 1 @@ -271,28 +342,16 @@ def download_and_upload_xarm(root, root_tests, dataset_id, fps=15): # "next.observation.state": next_state, "next.reward": next_reward, "next.done": next_done, - 
"episode_data_index_from": torch.tensor([id_from] * num_frames), - "episode_data_index_to": torch.tensor([id_from + num_frames] * num_frames), } ep_dicts.append(ep_dict) + episode_data_index["from"].append(id_from) + episode_data_index["to"].append(id_from + num_frames) + id_from = id_to episode_id += 1 - data_dict = {} - keys = ep_dicts[0].keys() - for key in keys: - if torch.is_tensor(ep_dicts[0][key][0]): - data_dict[key] = torch.cat([ep_dict[key] for ep_dict in ep_dicts]) - else: - if key not in data_dict: - data_dict[key] = [] - for ep_dict in ep_dicts: - for x in ep_dict[key]: - data_dict[key].append(x) - - total_frames = id_from - data_dict["index"] = torch.arange(0, total_frames, 1) + data_dict = concatenate_episodes(ep_dicts) features = { "observation.image": Image(), @@ -307,20 +366,17 @@ def download_and_upload_xarm(root, root_tests, dataset_id, fps=15): "next.done": Value(dtype="bool", id=None), #'next.success': Value(dtype='bool', id=None), "index": Value(dtype="int64", id=None), - "episode_data_index_from": Value(dtype="int64", id=None), - "episode_data_index_to": Value(dtype="int64", id=None), } features = Features(features) hf_dataset = Dataset.from_dict(data_dict, features=features) - hf_dataset = hf_dataset.with_format("torch") - num_items_first_ep = ep_dicts[0]["frame_id"].shape[0] - hf_dataset.select(range(num_items_first_ep)).save_to_disk(f"{root_tests}/{dataset_id}/train") - hf_dataset.push_to_hub(f"lerobot/{dataset_id}", token=True) - hf_dataset.push_to_hub(f"lerobot/{dataset_id}", token=True, revision="v1.0") + info = { + "fps": fps, + } + push_to_hub(hf_dataset, episode_data_index, info, root, revision, dataset_id) -def download_and_upload_aloha(root, root_tests, dataset_id, fps=50): +def download_and_upload_aloha(root, revision, dataset_id, fps=50): folder_urls = { "aloha_sim_insertion_human": "https://drive.google.com/drive/folders/1RgyD0JgTX30H4IM5XZn8I3zSV_mr8pyF", "aloha_sim_insertion_scripted": "https://drive.google.com/drive/folders/1TsojQQSXtHEoGnqgJ3gmpPQR2DPLtS2N", @@ -381,6 +437,7 @@ def download_and_upload_aloha(root, root_tests, dataset_id, fps=50): gdown.download(ep49_urls[dataset_id], output=str(raw_dir / "episode_49.hdf5"), fuzzy=True) ep_dicts = [] + episode_data_index = {"from": [], "to": []} id_from = 0 for ep_id in tqdm.tqdm(range(num_episodes[dataset_id])): @@ -424,24 +481,12 @@ def download_and_upload_aloha(root, root_tests, dataset_id, fps=50): assert isinstance(ep_id, int) ep_dicts.append(ep_dict) + episode_data_index["from"].append(id_from) + episode_data_index["to"].append(id_from + num_frames) + id_from += num_frames - data_dict = {} - - data_dict = {} - keys = ep_dicts[0].keys() - for key in keys: - if torch.is_tensor(ep_dicts[0][key][0]): - data_dict[key] = torch.cat([ep_dict[key] for ep_dict in ep_dicts]) - else: - if key not in data_dict: - data_dict[key] = [] - for ep_dict in ep_dicts: - for x in ep_dict[key]: - data_dict[key].append(x) - - total_frames = id_from - data_dict["index"] = torch.arange(0, total_frames, 1) + data_dict = concatenate_episodes(ep_dicts) features = { "observation.images.top": Image(), @@ -456,22 +501,19 @@ def download_and_upload_aloha(root, root_tests, dataset_id, fps=50): "next.done": Value(dtype="bool", id=None), #'next.success': Value(dtype='bool', id=None), "index": Value(dtype="int64", id=None), - "episode_data_index_from": Value(dtype="int64", id=None), - "episode_data_index_to": Value(dtype="int64", id=None), } features = Features(features) hf_dataset = Dataset.from_dict(data_dict, features=features) 
-    hf_dataset = hf_dataset.with_format("torch")

-    num_items_first_ep = ep_dicts[0]["frame_id"].shape[0]
-    hf_dataset.select(range(num_items_first_ep)).save_to_disk(f"{root_tests}/{dataset_id}/train")
-    hf_dataset.push_to_hub(f"lerobot/{dataset_id}", token=True)
-    hf_dataset.push_to_hub(f"lerobot/{dataset_id}", token=True, revision="v1.0")
+    info = {
+        "fps": fps,
+    }
+    push_to_hub(hf_dataset, episode_data_index, info, root, revision, dataset_id)


 if __name__ == "__main__":
     root = "data"
-    root_tests = "tests/data"
+    revision = "v1.1"

     dataset_ids = [
         # "pusht",
@@ -482,6 +524,4 @@ if __name__ == "__main__":
         "aloha_sim_transfer_cube_scripted",
     ]
     for dataset_id in dataset_ids:
-        download_and_upload(root, root_tests, dataset_id)
-        # assume stats have been precomputed
-        shutil.copy(f"{root}/{dataset_id}/stats.pth", f"{root_tests}/{dataset_id}/stats.pth")
+        download_and_upload(root, revision, dataset_id)
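
For reference, below is a minimal sketch (not part of the diff) of how the artifacts written by `push_to_hub` could be read back for a spot check. It assumes the `lerobot/pusht` repo produced by this script, that the metadata files land under `train/meta_data/` in the repo (the exact prefix depends on how the `replace()` call above resolves the local path), and that a Hugging Face token is configured if the repo is private.

```python
# Illustrative sketch only: load the pushed artifacts back and inspect them.
import json

from datasets import load_dataset
from huggingface_hub import hf_hub_download
from safetensors.numpy import load_file

repo_id = "lerobot/pusht"  # any of the dataset_ids pushed above

# frames table pushed with hf_dataset.push_to_hub(...)
hf_dataset = load_dataset(repo_id, split="train")

# per-episode boundaries written to meta_data/episode_data_index.safetensors
ep_idx_path = hf_hub_download(
    repo_id, "train/meta_data/episode_data_index.safetensors", repo_type="dataset"
)
episode_data_index = load_file(ep_idx_path)  # {"from": ndarray, "to": ndarray}

# recording info written to meta_data/info.json, e.g. {"fps": 10}
info_path = hf_hub_download(repo_id, "train/meta_data/info.json", repo_type="dataset")
with open(info_path) as f:
    info = json.load(f)

# slice out the first episode from the flat frames table
ep0 = hf_dataset.select(
    range(int(episode_data_index["from"][0]), int(episode_data_index["to"][0]))
)
print(info["fps"], len(ep0))
```

Storing the boundaries once per episode in `meta_data`, rather than repeating `episode_data_index_from`/`episode_data_index_to` on every frame, is what the `episode_data_index` change in this diff accomplishes.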