Merge pull request #36 from Cadene/user/rcadene/2024_03_19_replay_buffer_folder

Add replay_buffer directory and dataset versioning

commit 2d5abbbd6f

README.md

@@ -138,7 +138,7 @@ git lfs pull

 When adding a new dataset, mock it with
 ```
-python tests/scripts/mock_dataset.py --in-data-dir data/<dataset_id> --out-data-dir tests/data/<dataset_id>
+python tests/scripts/mock_dataset.py --in-data-dir data/$DATASET --out-data-dir tests/data/$DATASET
 ```

 Run tests

@@ -148,22 +148,65 @@ DATA_DIR="tests/data" pytest -sx tests

 **Datasets**

-To add a pytorch rl dataset to the hub, first login and use a token generated from [huggingface settings](https://huggingface.co/settings/tokens) with write access:
+To add a dataset to the hub, first login and use a token generated from [huggingface settings](https://huggingface.co/settings/tokens) with write access:
 ```
-huggingface-cli login --token $HUGGINGFACE_TOKEN --add-to-git-credential
+huggingface-cli login --token ${HUGGINGFACE_TOKEN} --add-to-git-credential
 ```
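
For reference, the same login step can be done from Python with the huggingface_hub client (a minimal sketch, not part of this diff; the token value is a placeholder):

```python
# Minimal sketch: programmatic login, equivalent to the CLI command above.
from huggingface_hub import login

# Token generated at https://huggingface.co/settings/tokens with write access.
login(token="hf_xxx", add_to_git_credential=True)
```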

 Then you can upload it to the hub with:
 ```
-HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli upload --repo-type dataset $HF_USER/$DATASET data/$DATASET
+HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli upload $HF_USER/$DATASET data/$DATASET \
+  --repo-type dataset \
+  --revision v1.0
 ```
+
+You will need to set the corresponding version as a default argument in your dataset class:
+```python
+  version: str | None = "v1.0",
+```
+See: [`lerobot/common/datasets/pusht.py`](https://github.com/Cadene/lerobot/blob/main/lerobot/common/datasets/pusht.py)
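
To illustrate how that default version is wired through, here is a hedged sketch based on the class changes further down in this diff (the real constructors carry additional arguments, and the import path of the abstract class is an assumption):

```python
# Sketch of a dataset class pinning the default hub revision; abbreviated signature.
from pathlib import Path

from lerobot.common.datasets.abstract import AbstractExperienceReplay  # module path assumed


class PushtExperienceReplay(AbstractExperienceReplay):
    def __init__(
        self,
        dataset_id: str,
        version: str | None = "v1.0",  # default to the revision uploaded above
        batch_size: int = None,
        *,
        shuffle: bool = True,
        root: Path | None = None,
    ):
        # Forward the version so the parent class can fetch that revision from the hub.
        super().__init__(
            dataset_id,
            version,
            batch_size,
            shuffle=shuffle,
            root=root,
        )
```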

 For instance, for [cadene/pusht](https://huggingface.co/datasets/cadene/pusht), we used:
 ```
 HF_USER=cadene
 DATASET=pusht
 ```

+If you want to improve an existing dataset, you can download it locally with:
+```
+mkdir -p data/$DATASET
+HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download ${HF_USER}/$DATASET \
+  --repo-type dataset \
+  --local-dir data/$DATASET \
+  --local-dir-use-symlinks=False \
+  --revision v1.0
+```
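
The same download can be scripted with huggingface_hub (a rough sketch, not part of this diff; repo and paths reuse the cadene/pusht example above):

```python
# Sketch: download a pinned dataset revision into data/<dataset_id>.
from pathlib import Path

from huggingface_hub import snapshot_download

data_dir = Path(
    snapshot_download(
        repo_id="cadene/pusht",   # ${HF_USER}/$DATASET
        repo_type="dataset",
        revision="v1.0",
        local_dir="data/pusht",   # data/$DATASET
    )
)
print(data_dir)
```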

+Iterate on your code and dataset with:
+```
+DATA_DIR=data python train.py
+```

+Upload a new version (v2.0 or v1.1 if the changes are respectively more or less significant):
+```
+HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli upload $HF_USER/$DATASET data/$DATASET \
+  --repo-type dataset \
+  --revision v1.1 \
+  --delete "*"
+```
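
A hedged Python counterpart to this upload step using huggingface_hub (illustrative only; it assumes the target revision already exists as a branch on the hub):

```python
# Sketch: push the updated folder to an existing dataset revision and prune removed files.
from huggingface_hub import HfApi

api = HfApi()
# api.create_branch("cadene/pusht", branch="v1.1", repo_type="dataset")  # if the revision is missing
api.upload_folder(
    repo_id="cadene/pusht",    # $HF_USER/$DATASET
    folder_path="data/pusht",  # data/$DATASET
    repo_type="dataset",
    revision="v1.1",
    delete_patterns="*",       # mirrors --delete "*"
)
```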

+Then you will need to set the corresponding version as a default argument in your dataset class:
+```python
+  version: str | None = "v1.1",
+```
+See: [`lerobot/common/datasets/pusht.py`](https://github.com/Cadene/lerobot/blob/main/lerobot/common/datasets/pusht.py)
+
+Finally, you might want to mock the dataset if you need to update the unit tests as well:
+```
+python tests/scripts/mock_dataset.py --in-data-dir data/$DATASET --out-data-dir tests/data/$DATASET
+```

 ## Acknowledgment
 - Our Diffusion policy and Pusht environment are adapted from [Diffusion Policy](https://diffusion-policy.cs.columbia.edu/)

@@ -19,6 +19,7 @@ class AbstractExperienceReplay(TensorDictReplayBuffer):
     def __init__(
         self,
         dataset_id: str,
+        version: str | None = None,
         batch_size: int = None,
         *,
         shuffle: bool = True,
@@ -31,8 +32,15 @@ class AbstractExperienceReplay(TensorDictReplayBuffer):
         transform: "torchrl.envs.Transform" = None,
     ):
         self.dataset_id = dataset_id
+        self.version = version
         self.shuffle = shuffle
         self.root = root
+
+        if self.root is not None and self.version is not None:
+            logging.warning(
+                f"The version of the dataset ({self.version}) is not enforced when root is provided ({self.root})."
+            )

         storage = self._download_or_load_dataset()

         super().__init__(
@@ -96,10 +104,14 @@ class AbstractExperienceReplay(TensorDictReplayBuffer):

     def _download_or_load_dataset(self) -> torch.StorageBase:
         if self.root is None:
-            self.data_dir = Path(snapshot_download(repo_id=f"cadene/{self.dataset_id}", repo_type="dataset"))
+            self.data_dir = Path(
+                snapshot_download(
+                    repo_id=f"cadene/{self.dataset_id}", repo_type="dataset", revision=self.version
+                )
+            )
         else:
             self.data_dir = self.root / self.dataset_id
-        return TensorStorage(TensorDict.load_memmap(self.data_dir))
+        return TensorStorage(TensorDict.load_memmap(self.data_dir / "replay_buffer"))

     def _compute_stats(self, num_batch=100, batch_size=32):
         rb = TensorDictReplayBuffer(
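
As a usage illustration of the new version/root behaviour introduced above (a hedged sketch; the full constructor signatures are abbreviated, and the import path is taken from the README link):

```python
# Sketch: version pins the hub revision fetched via snapshot_download,
# and is ignored (with a warning) when a local root is given.
from pathlib import Path

from lerobot.common.datasets.pusht import PushtExperienceReplay

# Downloads revision v1.0 of cadene/pusht from the hub.
dataset = PushtExperienceReplay("pusht", version="v1.0", batch_size=32)

# Loads data/pusht/replay_buffer from disk; the version is not enforced and a warning is logged.
dataset = PushtExperienceReplay("pusht", version="v1.0", batch_size=32, root=Path("data"))
```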
@@ -84,6 +84,7 @@ class AlohaExperienceReplay(AbstractExperienceReplay):
     def __init__(
         self,
         dataset_id: str,
+        version: str | None = "v1.0",
         batch_size: int = None,
         *,
         shuffle: bool = True,
@@ -99,6 +100,7 @@ class AlohaExperienceReplay(AbstractExperienceReplay):

         super().__init__(
             dataset_id,
+            version,
             batch_size,
             shuffle=shuffle,
             root=root,
@@ -87,6 +87,7 @@ class PushtExperienceReplay(AbstractExperienceReplay):
     def __init__(
         self,
         dataset_id: str,
+        version: str | None = "v1.0",
         batch_size: int = None,
         *,
         shuffle: bool = True,
@@ -100,6 +101,7 @@ class PushtExperienceReplay(AbstractExperienceReplay):
     ):
         super().__init__(
             dataset_id,
+            version,
             batch_size,
             shuffle=shuffle,
             root=root,
@@ -40,6 +40,7 @@ class SimxarmExperienceReplay(AbstractExperienceReplay):
     def __init__(
         self,
         dataset_id: str,
+        version: str | None = None,
         batch_size: int = None,
         *,
         shuffle: bool = True,
@@ -53,6 +54,7 @@ class SimxarmExperienceReplay(AbstractExperienceReplay):
     ):
         super().__init__(
             dataset_id,
+            version,
             batch_size,
             shuffle=shuffle,
             root=root,
@@ -1,5 +1,18 @@
 """
-usage: `python tests/scripts/mock_dataset.py --in-data-dir data/pusht --out-data-dir tests/data/pusht`
+This script is designed to facilitate the creation of a subset of an existing dataset by selecting a specific number of frames from the original dataset.
+This subset can then be used for running quick unit tests.
+The script takes an input directory containing the original dataset and an output directory where the subset of the dataset will be saved.
+Additionally, the number of frames to include in the subset can be specified.
+The script ensures that the subset is a representative sample of the original dataset by copying the specified number of frames and retaining the structure and format of the data.
+
+Usage:
+    Run the script with the following command, specifying the path to the input data directory,
+    the path to the output data directory, and optionally the number of frames to include in the subset dataset:
+
+    `python tests/scripts/mock_dataset.py --in-data-dir path/to/input_data --out-data-dir path/to/output_data`
+
+Example:
+    `python tests/scripts/mock_dataset.py --in-data-dir data/pusht --out-data-dir tests/data/pusht`
 """

 import argparse
@@ -9,13 +22,16 @@ from tensordict import TensorDict
 from pathlib import Path


-def mock_dataset(in_data_dir, out_data_dir, num_frames=50):
+def mock_dataset(in_data_dir, out_data_dir, num_frames):
+    in_data_dir = Path(in_data_dir)
+    out_data_dir = Path(out_data_dir)
+
     # load full dataset as a tensor dict
-    in_td_data = TensorDict.load_memmap(in_data_dir)
+    in_td_data = TensorDict.load_memmap(in_data_dir / "replay_buffer")

     # use 1 frame to know the specification of the dataset
     # and copy it over `n` frames in the test artifact directory
-    out_td_data = in_td_data[0].expand(num_frames).memmap_like(out_data_dir)
+    out_td_data = in_td_data[0].expand(num_frames).memmap_like(out_data_dir / "replay_buffer")

     # copy the first `n` frames so that we have real data
     out_td_data[:num_frames] = in_td_data[:num_frames].clone()
@@ -24,18 +40,19 @@ def mock_dataset(in_data_dir, out_data_dir, num_frames=50):
     out_td_data.lock_()

     # copy the full statistics of dataset since it's pretty small
-    in_stats_path = Path(in_data_dir) / "stats.pth"
-    out_stats_path = Path(out_data_dir) / "stats.pth"
+    in_stats_path = in_data_dir / "stats.pth"
+    out_stats_path = out_data_dir / "stats.pth"
     shutil.copy(in_stats_path, out_stats_path)


 if __name__ == "__main__":

-    parser = argparse.ArgumentParser(description="Create dataset")
+    parser = argparse.ArgumentParser(description="Create a dataset with a subset of frames for quick testing.")

     parser.add_argument("--in-data-dir", type=str, help="Path to input data")
     parser.add_argument("--out-data-dir", type=str, help="Path to save the output data")
+    parser.add_argument("--num-frames", type=int, default=50, help="Number of frames to copy over")

     args = parser.parse_args()

-    mock_dataset(args.in_data_dir, args.out_data_dir)
+    mock_dataset(args.in_data_dir, args.out_data_dir, args.num_frames)
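
For example, regenerating the pusht test fixture with the new flag could look like this (a sketch that simply drives the CLI shown in the docstring; the frame count uses the new optional argument):

```python
# Sketch: rebuild tests/data/pusht from data/pusht with 50 frames via the script's CLI.
import subprocess

subprocess.run(
    [
        "python", "tests/scripts/mock_dataset.py",
        "--in-data-dir", "data/pusht",
        "--out-data-dir", "tests/data/pusht",
        "--num-frames", "50",
    ],
    check=True,
)
```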