Move default paths, use jsonlines for tasks

Simon Alibert 2024-10-18 17:53:25 +02:00
parent bce3dc3bfa
commit ac3798bd62
3 changed files with 55 additions and 17 deletions

View File: lerobot/common/datasets/lerobot_dataset.py

@@ -13,6 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import json
 import logging
 import os
 from pathlib import Path
@@ -27,6 +28,7 @@ from lerobot.common.datasets.compute_stats import aggregate_stats
 from lerobot.common.datasets.utils import (
     check_delta_timestamps,
     check_timestamps_sync,
+    create_dataset_info,
     get_delta_indices,
     get_episode_data_index,
     get_hub_safe_version,
@@ -34,17 +36,12 @@ from lerobot.common.datasets.utils import (
     load_metadata,
 )
 from lerobot.common.datasets.video_utils import VideoFrame, decode_video_frames_torchvision
+from lerobot.common.robot_devices.robots.utils import Robot

 # For maintainers, see lerobot/common/datasets/push_dataset_to_hub/CODEBASE_VERSION.md
 CODEBASE_VERSION = "v2.0"
 LEROBOT_HOME = Path(os.getenv("LEROBOT_HOME", "~/.cache/huggingface/lerobot")).expanduser()
-DEFAULT_CHUNK_SIZE = 1000
-DEFAULT_VIDEO_PATH = "videos/chunk-{episode_chunk:03d}/{video_key}/episode_{episode_index:06d}.mp4"
-DEFAULT_PARQUET_PATH = (
-    "data/chunk-{episode_chunk:03d}/train-{episode_index:05d}-of-{total_episodes:05d}.parquet"
-)

 class LeRobotDataset(torch.utils.data.Dataset):
     def __init__(
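
LEROBOT_HOME stays behind here and keeps its environment-variable override, while the DEFAULT_* path templates move to utils.py (second file below). A minimal sketch of how a dataset root resolves under this constant (the repo_id is hypothetical):

    import os
    from pathlib import Path

    # Same logic as the constant above: $LEROBOT_HOME wins, else the HF cache default.
    LEROBOT_HOME = Path(os.getenv("LEROBOT_HOME", "~/.cache/huggingface/lerobot")).expanduser()
    print(LEROBOT_HOME / "my_user/my_dataset")  # e.g. /home/user/.cache/huggingface/lerobot/my_user/my_dataset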
@@ -400,6 +397,10 @@ class LeRobotDataset(torch.utils.data.Dataset):
         return item

+    def write_info(self) -> None:
+        with open(self.root / "meta/info.json", "w") as f:
+            json.dump(self.info, f, indent=4, ensure_ascii=False)
+
     def __repr__(self):
         return (
             f"{self.__class__.__name__}(\n"
@@ -419,17 +420,22 @@
     def create(
         cls,
         repo_id: str,
         fps: int,
+        robot: Robot,
         root: Path | None = None,
         image_transforms: Callable | None = None,
         delta_timestamps: dict[list[float]] | None = None,
         tolerance_s: float = 1e-4,
         video_backend: str | None = None,
     ) -> "LeRobotDataset":
         """Create a LeRobot Dataset from scratch in order to record data."""
         # create an empty object of type LeRobotDataset
         obj = cls.__new__(cls)
         obj.repo_id = repo_id
         obj.root = root if root is not None else LEROBOT_HOME / repo_id
+        obj._version = CODEBASE_VERSION
+        obj.root.mkdir(exist_ok=True, parents=True)
+        obj.info = create_dataset_info(obj._version, fps, robot)
+        obj.write_info()
         obj.fps = fps
         # obj.episodes = None
         # obj.image_transforms = None
         # obj.delta_timestamps = None
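
create() now takes a Robot so the dataset records robot_type into its metadata up front. A hedged usage sketch; at this commit only robot.robot_type is read by create_dataset_info, so a stand-in object is enough to show the call shape:

    from types import SimpleNamespace

    from lerobot.common.datasets.lerobot_dataset import LeRobotDataset

    # Stand-in for a configured lerobot Robot; "koch" is an illustrative robot_type.
    robot = SimpleNamespace(robot_type="koch")
    dataset = LeRobotDataset.create("my_user/my_dataset", fps=30, robot=robot)
    # Side effect per the code above: creates <root>/meta/info.json seeded by create_dataset_info().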

View File: lerobot/common/datasets/utils.py

@@ -28,6 +28,13 @@ from huggingface_hub import DatasetCard, HfApi
 from PIL import Image as PILImage
 from torchvision import transforms

+from lerobot.common.robot_devices.robots.utils import Robot
+
+DEFAULT_CHUNK_SIZE = 1000  # Max number of episodes per chunk
+DEFAULT_VIDEO_PATH = "videos/chunk-{episode_chunk:03d}/{video_key}/episode_{episode_index:06d}.mp4"
+DEFAULT_PARQUET_PATH = (
+    "data/chunk-{episode_chunk:03d}/train-{episode_index:05d}-of-{total_episodes:05d}.parquet"
+)
+
 DATASET_CARD_TEMPLATE = """
 ---
 # Metadata will go there
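
Both templates are plain str.format patterns, and DEFAULT_CHUNK_SIZE caps how many episodes share a chunk directory. A sketch of how a parquet path expands (the episode_chunk arithmetic is an assumption inferred from the names, not code in this diff):

    DEFAULT_CHUNK_SIZE = 1000
    DEFAULT_PARQUET_PATH = (
        "data/chunk-{episode_chunk:03d}/train-{episode_index:05d}-of-{total_episodes:05d}.parquet"
    )

    episode_index, total_episodes = 1042, 2000
    episode_chunk = episode_index // DEFAULT_CHUNK_SIZE  # assumed mapping: episode 1042 -> chunk 1
    print(DEFAULT_PARQUET_PATH.format(
        episode_chunk=episode_chunk, episode_index=episode_index, total_episodes=total_episodes
    ))  # data/chunk-001/train-01042-of-02000.parquet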
@@ -145,7 +152,7 @@ def load_hf_dataset(
 def load_metadata(local_dir: Path) -> tuple[dict | list]:
     """Loads metadata files from a dataset."""
-    info_path = local_dir / "meta/info.json"
+    info_path = local_dir / "meta/info.jsonl"
     episodes_path = local_dir / "meta/episodes.jsonl"
     stats_path = local_dir / "meta/stats.json"
     tasks_path = local_dir / "meta/tasks.json"
@@ -159,8 +166,8 @@ def load_metadata(local_dir: Path) -> tuple[dict | list]:
     with open(stats_path) as f:
         stats = json.load(f)
-    with open(tasks_path) as f:
-        tasks = json.load(f)
+    with jsonlines.open(tasks_path, "r") as reader:
+        tasks = list(reader)

     stats = {key: torch.tensor(value) for key, value in flatten_dict(stats).items()}
     stats = unflatten_dict(stats)
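
The jsonlines package stores one JSON object per line, so new tasks can be appended without rewriting the whole file. A self-contained round trip matching the read pattern above (the path and task strings are illustrative):

    import jsonlines

    tasks = [
        {"task_index": 0, "task": "Pick up the cube and put it in the bin."},
        {"task_index": 1, "task": "Open the drawer."},
    ]
    with jsonlines.open("tasks.jsonl", "w") as writer:
        writer.write_all(tasks)  # one JSON object per line
    with jsonlines.open("tasks.jsonl", "r") as reader:
        assert list(reader) == tasks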
@@ -169,6 +176,28 @@ def load_metadata(local_dir: Path) -> tuple[dict | list]:
     return info, episode_dicts, stats, tasks

+def create_dataset_info(codebase_version: str, fps: int, robot: Robot) -> dict:
+    return {
+        "codebase_version": codebase_version,
+        "data_path": DEFAULT_PARQUET_PATH,
+        "robot_type": robot.robot_type,
+        "total_episodes": 0,
+        "total_frames": 0,
+        "total_tasks": 0,
+        "total_videos": 0,
+        "total_chunks": 0,
+        "chunks_size": DEFAULT_CHUNK_SIZE,
+        "fps": fps,
+        "splits": {},
+        # "keys": keys,
+        # "video_keys": video_keys,
+        # "image_keys": image_keys,
+        # "shapes": {**sequence_shapes, **video_shapes, **image_shapes},
+        # "names": names,
+        # "videos": {"videos_path": DEFAULT_VIDEO_PATH} if video_keys else None,
+    }
+
 def get_episode_data_index(episodes: list, episode_dicts: list[dict]) -> dict[str, torch.Tensor]:
     episode_lengths = {ep_idx: ep_dict["length"] for ep_idx, ep_dict in enumerate(episode_dicts)}
     if episodes is not None:
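
Concretely, for a hypothetical 30 fps dataset recorded on a robot whose robot_type is "koch", create_dataset_info() seeds meta/info.json with all counters at zero:

    {
        "codebase_version": "v2.0",
        "data_path": "data/chunk-{episode_chunk:03d}/train-{episode_index:05d}-of-{total_episodes:05d}.parquet",
        "robot_type": "koch",
        "total_episodes": 0,
        "total_frames": 0,
        "total_tasks": 0,
        "total_videos": 0,
        "total_chunks": 0,
        "chunks_size": 1000,
        "fps": 30,
        "splits": {}
    }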

View File (v1-to-v2 dataset conversion script; path not shown in this capture)

@@ -120,12 +120,15 @@ from huggingface_hub.errors import EntryNotFoundError
 from PIL import Image
 from safetensors.torch import load_file

-from lerobot.common.datasets.lerobot_dataset import (
+from lerobot.common.datasets.utils import (
     DEFAULT_CHUNK_SIZE,
     DEFAULT_PARQUET_PATH,
     DEFAULT_VIDEO_PATH,
+    create_branch,
+    flatten_dict,
+    get_hub_safe_version,
+    unflatten_dict,
 )
-from lerobot.common.datasets.utils import create_branch, flatten_dict, get_hub_safe_version, unflatten_dict
 from lerobot.common.utils.utils import init_hydra_config
 from lerobot.scripts.push_dataset_to_hub import push_dataset_card_to_hub
@@ -607,8 +610,8 @@ def convert_dataset(
         raise ValueError
     assert set(tasks) == {task for ep_tasks in tasks_by_episodes.values() for task in ep_tasks}
-    task_json = [{"task_index": task_idx, "task": task} for task_idx, task in enumerate(tasks)]
-    write_json(task_json, v20_dir / "meta" / "tasks.json")
+    tasks = [{"task_index": task_idx, "task": task} for task_idx, task in enumerate(tasks)]
+    write_jsonlines(tasks, v20_dir / "meta" / "tasks.json")

     # Shapes
     sequence_shapes = {key: dataset.features[key].length for key in keys["sequence"]}
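
With write_jsonlines (a repo helper whose implementation is not shown in this diff), the conversion now emits one task record per line instead of a single JSON array, e.g. for two illustrative tasks:

    {"task_index": 0, "task": "Pick up the cube and put it in the bin."}
    {"task_index": 1, "task": "Open the drawer."}

Note that the file keeps its tasks.json name even though its content is now JSON Lines; load_metadata above correspondingly switches to jsonlines.open for tasks_path.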