Add meta_data, revision v1.1
commit 0bd2ca8d82 (parent e2168163cd)
@@ -4,6 +4,7 @@ useless dependencies when using datasets.
 """

 import io
+import json
 import pickle
 import shutil
 from pathlib import Path
@@ -14,16 +15,20 @@ import numpy as np
 import torch
 import tqdm
 from datasets import Dataset, Features, Image, Sequence, Value
+from huggingface_hub import HfApi
 from PIL import Image as PILImage
+from safetensors.numpy import save_file
+
+from lerobot.common.datasets.utils import compute_stats


-def download_and_upload(root, root_tests, dataset_id):
+def download_and_upload(root, revision, dataset_id):
     if "pusht" in dataset_id:
-        download_and_upload_pusht(root, root_tests, dataset_id)
+        download_and_upload_pusht(root, revision, dataset_id)
     elif "xarm" in dataset_id:
-        download_and_upload_xarm(root, root_tests, dataset_id)
+        download_and_upload_xarm(root, revision, dataset_id)
     elif "aloha" in dataset_id:
-        download_and_upload_aloha(root, root_tests, dataset_id)
+        download_and_upload_aloha(root, revision, dataset_id)
     else:
         raise ValueError(dataset_id)

@@ -56,7 +61,88 @@ def download_and_extract_zip(url: str, destination_folder: Path) -> bool:
         return False


-def download_and_upload_pusht(root, root_tests, dataset_id="pusht", fps=10):
+def concatenate_episodes(ep_dicts):
+    data_dict = {}
+
+    keys = ep_dicts[0].keys()
+    for key in keys:
+        if torch.is_tensor(ep_dicts[0][key][0]):
+            data_dict[key] = torch.cat([ep_dict[key] for ep_dict in ep_dicts])
+        else:
+            if key not in data_dict:
+                data_dict[key] = []
+            for ep_dict in ep_dicts:
+                for x in ep_dict[key]:
+                    data_dict[key].append(x)
+
+    total_frames = data_dict["frame_id"].shape[0]
+    data_dict["index"] = torch.arange(0, total_frames, 1)
+    return data_dict
+
+
+def push_to_hub(hf_dataset, episode_data_index, info, root, revision, dataset_id):
+    hf_dataset = hf_dataset.with_format("torch")
+
+    # push to main to indicate latest version
+    hf_dataset.push_to_hub(f"lerobot/{dataset_id}", token=True)
+
+    # push to version branch
+    hf_dataset.push_to_hub(f"lerobot/{dataset_id}", token=True, revision=revision)
+
+    # get stats
+    stats_pth_path = root / dataset_id / "stats.pth"
+    if stats_pth_path.exists():
+        stats = torch.load(stats_pth_path)
+    else:
+        stats = compute_stats(hf_dataset)
+        torch.save(stats, stats_pth_path)
+
+    # create and store meta_data
+    meta_data_dir = root / dataset_id / "train" / "meta_data"
+    meta_data_dir.mkdir(parents=True, exist_ok=True)
+
+    api = HfApi()
+
+    # info
+    info_path = meta_data_dir / "info.json"
+    with open(str(info_path), "w") as f:
+        json.dump(info, f, indent=4)
+    api.upload_file(
+        path_or_fileobj=info_path,
+        path_in_repo=str(info_path).replace(f"{root}/{dataset_id}", ""),
+        repo_id=f"lerobot/{dataset_id}",
+        repo_type="dataset",
+    )
+
+    # stats
+    for key in stats:
+        stats_path = meta_data_dir / f"stats_{key}.safetensors"
+        save_file(episode_data_index, stats_path)
+        api.upload_file(
+            path_or_fileobj=stats_path,
+            path_in_repo=str(stats_path).replace(f"{root}/{dataset_id}", ""),
+            repo_id=f"lerobot/{dataset_id}",
+            repo_type="dataset",
+        )
+
+    # episode_data_index
+    episode_data_index = {key: np.array(episode_data_index[key]) for key in episode_data_index}
+    ep_data_idx_path = meta_data_dir / "episode_data_index.safetensors"
+    save_file(episode_data_index, ep_data_idx_path)
+    api.upload_file(
+        path_or_fileobj=ep_data_idx_path,
+        path_in_repo=str(ep_data_idx_path).replace(f"{root}/{dataset_id}", ""),
+        repo_id=f"lerobot/{dataset_id}",
+        repo_type="dataset",
+    )
+
+    # copy in tests folder, the first episode and the meta_data directory
+    num_items_first_ep = episode_data_index["to"][0] - episode_data_index["from"][0]
+    hf_dataset.select(range(num_items_first_ep)).save_to_disk(f"tests/data/{dataset_id}/train")
+    shutil.copytree(meta_data_dir, f"tests/{meta_data_dir}")
+
+
+def download_and_upload_pusht(root, revision, dataset_id="pusht", fps=10):
     try:
         import pymunk
         from gym_pusht.envs.pusht import PushTEnv, pymunk_to_shapely
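The new concatenate_episodes helper replaces the merging loop that was previously duplicated in each per-dataset function. A minimal sketch of its behaviour on toy inputs (illustrative only; it assumes concatenate_episodes from this script is in scope, and the keys and values below are made up):

```python
import torch

# Two fake episodes with 3 and 2 frames; "frame_id" is the tensor key the
# helper uses to infer the total number of frames.
ep_dicts = [
    {"frame_id": torch.arange(3), "task": ["demo"] * 3},
    {"frame_id": torch.arange(2), "task": ["demo"] * 2},
]

data_dict = concatenate_episodes(ep_dicts)
# Tensor values are concatenated along dim 0, list values are flattened,
# and a global "index" column 0..N-1 is added.
assert data_dict["frame_id"].shape[0] == 5
assert data_dict["index"].tolist() == [0, 1, 2, 3, 4]
assert data_dict["task"] == ["demo"] * 5
```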
@@ -99,6 +185,7 @@ def download_and_upload_pusht(root, root_tests, dataset_id="pusht", fps=10):
     actions = torch.from_numpy(dataset_dict["action"])

     ep_dicts = []
+    episode_data_index = {"from": [], "to": []}

     id_from = 0
     for episode_id in tqdm.tqdm(range(num_episodes)):
@@ -160,28 +247,15 @@ def download_and_upload_pusht(root, root_tests, dataset_id="pusht", fps=10):
             "next.reward": torch.cat([reward[1:], reward[[-1]]]),
             "next.done": torch.cat([done[1:], done[[-1]]]),
             "next.success": torch.cat([success[1:], success[[-1]]]),
-            "episode_data_index_from": torch.tensor([id_from] * num_frames),
-            "episode_data_index_to": torch.tensor([id_from + num_frames] * num_frames),
         }
         ep_dicts.append(ep_dict)

+        episode_data_index["from"].append(id_from)
+        episode_data_index["to"].append(id_from + num_frames)
+
         id_from += num_frames

-    data_dict = {}
-
-    keys = ep_dicts[0].keys()
-    for key in keys:
-        if torch.is_tensor(ep_dicts[0][key][0]):
-            data_dict[key] = torch.cat([ep_dict[key] for ep_dict in ep_dicts])
-        else:
-            if key not in data_dict:
-                data_dict[key] = []
-            for ep_dict in ep_dicts:
-                for x in ep_dict[key]:
-                    data_dict[key].append(x)
-
-    total_frames = id_from
-    data_dict["index"] = torch.arange(0, total_frames, 1)
+    data_dict = concatenate_episodes(ep_dicts)

     features = {
         "observation.image": Image(),
@@ -196,20 +270,17 @@ def download_and_upload_pusht(root, root_tests, dataset_id="pusht", fps=10):
         "next.done": Value(dtype="bool", id=None),
         "next.success": Value(dtype="bool", id=None),
         "index": Value(dtype="int64", id=None),
-        "episode_data_index_from": Value(dtype="int64", id=None),
-        "episode_data_index_to": Value(dtype="int64", id=None),
     }
     features = Features(features)
     hf_dataset = Dataset.from_dict(data_dict, features=features)
-    hf_dataset = hf_dataset.with_format("torch")

-    num_items_first_ep = ep_dicts[0]["frame_id"].shape[0]
-    hf_dataset.select(range(num_items_first_ep)).save_to_disk(f"{root_tests}/{dataset_id}/train")
-    hf_dataset.push_to_hub(f"lerobot/{dataset_id}", token=True)
-    hf_dataset.push_to_hub(f"lerobot/{dataset_id}", token=True, revision="v1.0")
+    info = {
+        "fps": fps,
+    }
+    push_to_hub(hf_dataset, episode_data_index, info, root, revision, dataset_id)


-def download_and_upload_xarm(root, root_tests, dataset_id, fps=15):
+def download_and_upload_xarm(root, revision, dataset_id, fps=15):
     root = Path(root)
     raw_dir = root / f"{dataset_id}_raw"
     if not raw_dir.exists():
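push_to_hub now writes the dataset statistics and the new episode boundaries under a local meta_data/ directory before uploading them. A rough consumer-side sketch of reading those files back from the local copy (paths mirror the ones built in push_to_hub above; not part of the commit):

```python
import json
from pathlib import Path

from safetensors.numpy import load_file

# Mirrors root / dataset_id / "train" / "meta_data" for root="data", dataset_id="pusht"
meta_data_dir = Path("data/pusht/train/meta_data")

# fps (and any other metadata placed in the info dict)
with open(meta_data_dir / "info.json") as f:
    info = json.load(f)

# Per-episode frame boundaries stored as numpy arrays under keys "from" and "to"
episode_data_index = load_file(str(meta_data_dir / "episode_data_index.safetensors"))

print(info["fps"], episode_data_index["from"][:3], episode_data_index["to"][:3])
```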
@@ -234,13 +305,13 @@ def download_and_upload_xarm(root, root_tests, dataset_id, fps=15):
     with open(dataset_path, "rb") as f:
         dataset_dict = pickle.load(f)

-    total_frames = dataset_dict["actions"].shape[0]
-
     ep_dicts = []
+    episode_data_index = {"from": [], "to": []}

     id_from = 0
     id_to = 0
     episode_id = 0
+    total_frames = dataset_dict["actions"].shape[0]
     for i in tqdm.tqdm(range(total_frames)):
         id_to += 1

@@ -271,28 +342,16 @@ def download_and_upload_xarm(root, root_tests, dataset_id, fps=15):
             # "next.observation.state": next_state,
             "next.reward": next_reward,
             "next.done": next_done,
-            "episode_data_index_from": torch.tensor([id_from] * num_frames),
-            "episode_data_index_to": torch.tensor([id_from + num_frames] * num_frames),
         }
         ep_dicts.append(ep_dict)

+        episode_data_index["from"].append(id_from)
+        episode_data_index["to"].append(id_from + num_frames)
+
         id_from = id_to
         episode_id += 1

-    data_dict = {}
-    keys = ep_dicts[0].keys()
-    for key in keys:
-        if torch.is_tensor(ep_dicts[0][key][0]):
-            data_dict[key] = torch.cat([ep_dict[key] for ep_dict in ep_dicts])
-        else:
-            if key not in data_dict:
-                data_dict[key] = []
-            for ep_dict in ep_dicts:
-                for x in ep_dict[key]:
-                    data_dict[key].append(x)
-
-    total_frames = id_from
-    data_dict["index"] = torch.arange(0, total_frames, 1)
+    data_dict = concatenate_episodes(ep_dicts)

     features = {
         "observation.image": Image(),
@@ -307,20 +366,17 @@ def download_and_upload_xarm(root, root_tests, dataset_id, fps=15):
         "next.done": Value(dtype="bool", id=None),
         #'next.success': Value(dtype='bool', id=None),
         "index": Value(dtype="int64", id=None),
-        "episode_data_index_from": Value(dtype="int64", id=None),
-        "episode_data_index_to": Value(dtype="int64", id=None),
     }
     features = Features(features)
     hf_dataset = Dataset.from_dict(data_dict, features=features)
-    hf_dataset = hf_dataset.with_format("torch")

-    num_items_first_ep = ep_dicts[0]["frame_id"].shape[0]
-    hf_dataset.select(range(num_items_first_ep)).save_to_disk(f"{root_tests}/{dataset_id}/train")
-    hf_dataset.push_to_hub(f"lerobot/{dataset_id}", token=True)
-    hf_dataset.push_to_hub(f"lerobot/{dataset_id}", token=True, revision="v1.0")
+    info = {
+        "fps": fps,
+    }
+    push_to_hub(hf_dataset, episode_data_index, info, root, revision, dataset_id)


-def download_and_upload_aloha(root, root_tests, dataset_id, fps=50):
+def download_and_upload_aloha(root, revision, dataset_id, fps=50):
     folder_urls = {
         "aloha_sim_insertion_human": "https://drive.google.com/drive/folders/1RgyD0JgTX30H4IM5XZn8I3zSV_mr8pyF",
         "aloha_sim_insertion_scripted": "https://drive.google.com/drive/folders/1TsojQQSXtHEoGnqgJ3gmpPQR2DPLtS2N",
@@ -381,6 +437,7 @@ def download_and_upload_aloha(root, root_tests, dataset_id, fps=50):
         gdown.download(ep49_urls[dataset_id], output=str(raw_dir / "episode_49.hdf5"), fuzzy=True)

     ep_dicts = []
+    episode_data_index = {"from": [], "to": []}

     id_from = 0
     for ep_id in tqdm.tqdm(range(num_episodes[dataset_id])):
@@ -424,24 +481,12 @@ def download_and_upload_aloha(root, root_tests, dataset_id, fps=50):
         assert isinstance(ep_id, int)
         ep_dicts.append(ep_dict)

+        episode_data_index["from"].append(id_from)
+        episode_data_index["to"].append(id_from + num_frames)
+
         id_from += num_frames

-    data_dict = {}
-
-    data_dict = {}
-    keys = ep_dicts[0].keys()
-    for key in keys:
-        if torch.is_tensor(ep_dicts[0][key][0]):
-            data_dict[key] = torch.cat([ep_dict[key] for ep_dict in ep_dicts])
-        else:
-            if key not in data_dict:
-                data_dict[key] = []
-            for ep_dict in ep_dicts:
-                for x in ep_dict[key]:
-                    data_dict[key].append(x)
-
-    total_frames = id_from
-    data_dict["index"] = torch.arange(0, total_frames, 1)
+    data_dict = concatenate_episodes(ep_dicts)

     features = {
         "observation.images.top": Image(),
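With the per-frame episode_data_index_from/episode_data_index_to columns removed, episode boundaries live only in episode_data_index. A hypothetical helper (not in the commit) showing how those boundaries can slice one episode out of the Hugging Face dataset, mirroring the first-episode copy done in push_to_hub:

```python
def select_episode(hf_dataset, episode_data_index, episode_id):
    # "from" is the first frame index of the episode, "to" is one past the last.
    id_from = int(episode_data_index["from"][episode_id])
    id_to = int(episode_data_index["to"][episode_id])
    return hf_dataset.select(range(id_from, id_to))

# e.g. first_episode = select_episode(hf_dataset, episode_data_index, 0)
```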
@@ -456,22 +501,19 @@ def download_and_upload_aloha(root, root_tests, dataset_id, fps=50):
         "next.done": Value(dtype="bool", id=None),
         #'next.success': Value(dtype='bool', id=None),
         "index": Value(dtype="int64", id=None),
-        "episode_data_index_from": Value(dtype="int64", id=None),
-        "episode_data_index_to": Value(dtype="int64", id=None),
     }
     features = Features(features)
     hf_dataset = Dataset.from_dict(data_dict, features=features)
-    hf_dataset = hf_dataset.with_format("torch")

-    num_items_first_ep = ep_dicts[0]["frame_id"].shape[0]
-    hf_dataset.select(range(num_items_first_ep)).save_to_disk(f"{root_tests}/{dataset_id}/train")
-    hf_dataset.push_to_hub(f"lerobot/{dataset_id}", token=True)
-    hf_dataset.push_to_hub(f"lerobot/{dataset_id}", token=True, revision="v1.0")
+    info = {
+        "fps": fps,
+    }
+    push_to_hub(hf_dataset, episode_data_index, info, root, revision, dataset_id)


 if __name__ == "__main__":
     root = "data"
-    root_tests = "tests/data"
+    revision = "v1.1"

     dataset_ids = [
         # "pusht",
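Each dataset is now pushed twice: to the default branch and to the revision branch ("v1.1" here), so consumers can pin a version. A minimal consumer-side sketch of loading either one with the datasets library (lerobot/pusht is one of the repos this script targets):

```python
from datasets import load_dataset

# Latest version from the default branch
dataset = load_dataset("lerobot/pusht", split="train")

# Pin the version branch pushed by this script
dataset_v1_1 = load_dataset("lerobot/pusht", split="train", revision="v1.1")
```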
@@ -482,6 +524,4 @@ if __name__ == "__main__":
         "aloha_sim_transfer_cube_scripted",
     ]
     for dataset_id in dataset_ids:
-        download_and_upload(root, root_tests, dataset_id)
-        # assume stats have been precomputed
-        shutil.copy(f"{root}/{dataset_id}/stats.pth", f"{root_tests}/{dataset_id}/stats.pth")
+        download_and_upload(root, revision, dataset_id)
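To convert and upload a single dataset rather than the whole dataset_ids list, the entry point can also be called directly. A sketch (it assumes Hugging Face credentials are configured, since push_to_hub is called with token=True, and that download_and_upload from this script is in scope):

```python
# Convert and upload only the ALOHA insertion (human) dataset.
download_and_upload(root="data", revision="v1.1", dataset_id="aloha_sim_insertion_human")
```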