Merge pull request #36 from Cadene/user/rcadene/2024_03_19_replay_buffer_folder

Add replay_buffer directory and dataset versioning
Remi 2024-03-19 18:04:29 +01:00 committed by GitHub
commit 2d5abbbd6f
6 changed files with 92 additions and 14 deletions

View File

@@ -138,7 +138,7 @@ git lfs pull
When adding a new dataset, mock it with
```
-python tests/scripts/mock_dataset.py --in-data-dir data/<dataset_id> --out-data-dir tests/data/<dataset_id>
+python tests/scripts/mock_dataset.py --in-data-dir data/$DATASET --out-data-dir tests/data/$DATASET
```
Run tests
@@ -148,22 +148,65 @@ DATA_DIR="tests/data" pytest -sx tests
**Datasets**
-To add a pytorch rl dataset to the hub, first login and use a token generated from [huggingface settings](https://huggingface.co/settings/tokens) with write access:
+To add a dataset to the hub, first login and use a token generated from [huggingface settings](https://huggingface.co/settings/tokens) with write access:
```
-huggingface-cli login --token $HUGGINGFACE_TOKEN --add-to-git-credential
+huggingface-cli login --token ${HUGGINGFACE_TOKEN} --add-to-git-credential
```
Then you can upload it to the hub with:
```
-HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli upload --repo-type dataset $HF_USER/$DATASET data/$DATASET
+HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli upload $HF_USER/$DATASET data/$DATASET \
+--repo-type dataset \
+--revision v1.0
```
+You will need to set the corresponding version as a default argument in your dataset class:
+```python
+  version: str | None = "v1.0",
+```
+See: [`lerobot/common/datasets/pusht.py`](https://github.com/Cadene/lerobot/blob/main/lerobot/common/datasets/pusht.py)
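Editor's note: to make the pinned default concrete, here is a minimal usage sketch, not part of this commit; the class and argument names come from the `pusht.py` diff further down, and it assumes `lerobot` is importable:

```python
# A sketch, not part of this commit: exercising the new `version` argument
# of PushtExperienceReplay (see the pusht.py diff below).
from lerobot.common.datasets.pusht import PushtExperienceReplay

# Uses the class default, i.e. hub revision "v1.0".
dataset = PushtExperienceReplay("pusht")

# Override the default to test another tagged revision.
dataset_v11 = PushtExperienceReplay("pusht", version="v1.1")
```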
+For instance, for [cadene/pusht](https://huggingface.co/datasets/cadene/pusht), we used:
+```
+HF_USER=cadene
+DATASET=pusht
+```
+If you want to improve an existing dataset, you can download it locally with:
+```
+mkdir -p data/$DATASET
+HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download ${HF_USER}/$DATASET \
+--repo-type dataset \
+--local-dir data/$DATASET \
+--local-dir-use-symlinks=False \
+--revision v1.0
+```
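Editor's note: this pinned download mirrors the `snapshot_download` call added in `abstract.py` below; doing the same from Python would look roughly like this (a sketch, not part of this commit, assuming `huggingface_hub` is installed and the `cadene/pusht` example above):

```python
# A sketch, not part of this commit: pinned dataset download via the
# huggingface_hub Python API instead of the CLI.
from huggingface_hub import snapshot_download

data_dir = snapshot_download(
    repo_id="cadene/pusht",  # i.e. f"{HF_USER}/{DATASET}"
    repo_type="dataset",
    revision="v1.0",         # pin the dataset version, as in abstract.py
    local_dir="data/pusht",  # mirrors --local-dir
)
```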
+Iterate on your code and dataset with:
+```
+DATA_DIR=data python train.py
+```
+Upload a new version (v2.0 or v1.1 if the changes are respectively more or less significant):
+```
+HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli upload $HF_USER/$DATASET data/$DATASET \
+--repo-type dataset \
+--revision v1.1 \
+--delete "*"
+```
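Editor's note: `--delete "*"` removes remote files that no longer exist locally, so the tagged revision exactly mirrors `data/$DATASET`. A rough Python-API equivalent (a sketch, not part of this commit; assumes `huggingface_hub` is installed and you are logged in):

```python
# A sketch, not part of this commit: versioned upload via the
# huggingface_hub Python API, mirroring the CLI flags above.
from huggingface_hub import HfApi

HfApi().upload_folder(
    folder_path="data/pusht",  # local dataset directory
    repo_id="cadene/pusht",    # i.e. $HF_USER/$DATASET
    repo_type="dataset",
    revision="v1.1",           # the new version tag
    delete_patterns="*",       # mirrors --delete "*"
)
```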
+Then you will need to set the corresponding version as a default argument in your dataset class:
+```python
+  version: str | None = "v1.1",
+```
+See: [`lerobot/common/datasets/pusht.py`](https://github.com/Cadene/lerobot/blob/main/lerobot/common/datasets/pusht.py)
+Finally, you might want to mock the dataset if you need to update the unit tests as well:
+```
+python tests/scripts/mock_dataset.py --in-data-dir data/$DATASET --out-data-dir tests/data/$DATASET
+```
## Acknowledgment
- Our Diffusion policy and Pusht environment are adapted from [Diffusion Policy](https://diffusion-policy.cs.columbia.edu/)

View File

@@ -19,6 +19,7 @@ class AbstractExperienceReplay(TensorDictReplayBuffer):
    def __init__(
        self,
        dataset_id: str,
+        version: str | None = None,
        batch_size: int = None,
        *,
        shuffle: bool = True,
@@ -31,8 +32,15 @@ class AbstractExperienceReplay(TensorDictReplayBuffer):
        transform: "torchrl.envs.Transform" = None,
    ):
        self.dataset_id = dataset_id
+        self.version = version
        self.shuffle = shuffle
        self.root = root
+        if self.root is not None and self.version is not None:
+            logging.warning(
+                f"The version of the dataset ({self.version}) is not enforced when root is provided ({self.root})."
+            )
        storage = self._download_or_load_dataset()
        super().__init__(
@@ -96,10 +104,14 @@ class AbstractExperienceReplay(TensorDictReplayBuffer):
    def _download_or_load_dataset(self) -> torch.StorageBase:
        if self.root is None:
-            self.data_dir = Path(snapshot_download(repo_id=f"cadene/{self.dataset_id}", repo_type="dataset"))
+            self.data_dir = Path(
+                snapshot_download(
+                    repo_id=f"cadene/{self.dataset_id}", repo_type="dataset", revision=self.version
+                )
+            )
        else:
            self.data_dir = self.root / self.dataset_id
-        return TensorStorage(TensorDict.load_memmap(self.data_dir))
+        return TensorStorage(TensorDict.load_memmap(self.data_dir / "replay_buffer"))

    def _compute_stats(self, num_batch=100, batch_size=32):
        rb = TensorDictReplayBuffer(
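Editor's note: read together, the two changes in this file imply an on-disk dataset layout along these lines (inferred from this diff and from the `stats.pth` handling in `mock_dataset.py` below):

```
data/<dataset_id>/
├── replay_buffer/   # TensorDict memmap files, read via TensorDict.load_memmap
└── stats.pth        # dataset statistics, copied as-is by mock_dataset.py
```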

View File

@@ -84,6 +84,7 @@ class AlohaExperienceReplay(AbstractExperienceReplay):
    def __init__(
        self,
        dataset_id: str,
+        version: str | None = "v1.0",
        batch_size: int = None,
        *,
        shuffle: bool = True,
@@ -99,6 +100,7 @@ class AlohaExperienceReplay(AbstractExperienceReplay):
        super().__init__(
            dataset_id,
+            version,
            batch_size,
            shuffle=shuffle,
            root=root,

View File

@@ -87,6 +87,7 @@ class PushtExperienceReplay(AbstractExperienceReplay):
    def __init__(
        self,
        dataset_id: str,
+        version: str | None = "v1.0",
        batch_size: int = None,
        *,
        shuffle: bool = True,
@@ -100,6 +101,7 @@ class PushtExperienceReplay(AbstractExperienceReplay):
    ):
        super().__init__(
            dataset_id,
+            version,
            batch_size,
            shuffle=shuffle,
            root=root,
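Editor's note: with the warning added in `abstract.py`, the interaction between `root` and `version` looks like this in use (a sketch, not part of this commit; assumes `lerobot` is importable and a local copy under `data/pusht`):

```python
# A sketch, not part of this commit: root bypasses version pinning.
from pathlib import Path

from lerobot.common.datasets.pusht import PushtExperienceReplay

# No root: data comes from the hub at the default revision "v1.0".
hub_dataset = PushtExperienceReplay("pusht")

# With root, data/pusht is loaded directly; since version defaults to
# "v1.0" (not None), the "version ... is not enforced" warning is logged.
local_dataset = PushtExperienceReplay("pusht", root=Path("data"))
```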

View File

@@ -40,6 +40,7 @@ class SimxarmExperienceReplay(AbstractExperienceReplay):
    def __init__(
        self,
        dataset_id: str,
+        version: str | None = None,
        batch_size: int = None,
        *,
        shuffle: bool = True,
@@ -53,6 +54,7 @@ class SimxarmExperienceReplay(AbstractExperienceReplay):
    ):
        super().__init__(
            dataset_id,
+            version,
            batch_size,
            shuffle=shuffle,
            root=root,

View File

@@ -1,5 +1,18 @@
"""
-usage: `python tests/scripts/mock_dataset.py --in-data-dir data/pusht --out-data-dir tests/data/pusht`
+This script is designed to facilitate the creation of a subset of an existing dataset by selecting a specific number of frames from the original dataset.
+This subset can then be used for running quick unit tests.
+
+The script takes an input directory containing the original dataset and an output directory where the subset of the dataset will be saved.
+Additionally, the number of frames to include in the subset can be specified.
+The script ensures that the subset is a representative sample of the original dataset by copying the specified number of frames and retaining the structure and format of the data.
+
+Usage:
+    Run the script with the following command, specifying the path to the input data directory,
+    the path to the output data directory, and optionally the number of frames to include in the subset dataset:
+
+    `python tests/scripts/mock_dataset.py --in-data-dir path/to/input_data --out-data-dir path/to/output_data`
+
+Example:
+    `python tests/scripts/mock_dataset.py --in-data-dir data/pusht --out-data-dir tests/data/pusht`
"""

import argparse
@@ -9,13 +22,16 @@ from tensordict import TensorDict
from pathlib import Path


-def mock_dataset(in_data_dir, out_data_dir, num_frames=50):
+def mock_dataset(in_data_dir, out_data_dir, num_frames):
+    in_data_dir = Path(in_data_dir)
+    out_data_dir = Path(out_data_dir)
+
    # load full dataset as a tensor dict
-    in_td_data = TensorDict.load_memmap(in_data_dir)
+    in_td_data = TensorDict.load_memmap(in_data_dir / "replay_buffer")

    # use 1 frame to know the specification of the dataset
    # and copy it over `n` frames in the test artifact directory
-    out_td_data = in_td_data[0].expand(num_frames).memmap_like(out_data_dir)
+    out_td_data = in_td_data[0].expand(num_frames).memmap_like(out_data_dir / "replay_buffer")

    # copy the first `n` frames so that we have real data
    out_td_data[:num_frames] = in_td_data[:num_frames].clone()
@@ -24,18 +40,19 @@ def mock_dataset(in_data_dir, out_data_dir, num_frames=50):
    out_td_data.lock_()

    # copy the full statistics of dataset since it's pretty small
-    in_stats_path = Path(in_data_dir) / "stats.pth"
-    out_stats_path = Path(out_data_dir) / "stats.pth"
+    in_stats_path = in_data_dir / "stats.pth"
+    out_stats_path = out_data_dir / "stats.pth"
    shutil.copy(in_stats_path, out_stats_path)


if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Create dataset")
+    parser = argparse.ArgumentParser(description="Create a dataset with a subset of frames for quick testing.")
    parser.add_argument("--in-data-dir", type=str, help="Path to input data")
    parser.add_argument("--out-data-dir", type=str, help="Path to save the output data")
+    parser.add_argument("--num-frames", type=int, default=50, help="Number of frames to copy over")

    args = parser.parse_args()

-    mock_dataset(args.in_data_dir, args.out_data_dir)
+    mock_dataset(args.in_data_dir, args.out_data_dir, args.num_frames)
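Editor's note: since `num_frames` is now threaded through to the function, it can also be driven directly from Python (a sketch, not part of this commit, matching the CLI defaults):

```python
# A sketch, not part of this commit: equivalent to
#   python tests/scripts/mock_dataset.py --in-data-dir data/pusht \
#       --out-data-dir tests/data/pusht --num-frames 50
mock_dataset("data/pusht", "tests/data/pusht", num_frames=50)
```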