add new video decoder method

Jade Choghari 2025-02-20 21:13:49 +01:00
parent c6bcfb3539
commit cae49528ee
2 changed files with 96 additions and 5 deletions


@@ -652,6 +652,45 @@ class LeRobotDataset(torch.utils.data.Dataset):
        item = self.hf_dataset[idx]
        ep_idx = item["episode_index"].item()

        query_indices = None
        if self.delta_indices is not None:
            current_ep_idx = self.episodes.index(ep_idx) if self.episodes is not None else ep_idx
            query_indices, padding = self._get_query_indices(idx, current_ep_idx)
            query_result = self._query_hf_dataset(query_indices)
            item = {**item, **padding}
            for key, val in query_result.items():
                item[key] = val

        if len(self.meta.video_keys) > 0:
            current_ts = item["timestamp"].item()
            query_timestamps = self._get_query_timestamps(current_ts, query_indices)
            # Instead of eagerly decoding video frames here (which is CPU/GPU
            # heavy), return only the video paths and query timestamps so the
            # frames can be decoded later, e.g. in the dataloader's collate fn.
            # video_frames = self._query_videos(query_timestamps, ep_idx)
            # item = {**video_frames, **item}
item["video_paths"] = {
vid_key: self.root / self.meta.get_video_file_path(ep_idx, vid_key)
for vid_key in query_timestamps.keys()
}
item["query_timestamps"] = query_timestamps
        if self.image_transforms is not None:
            image_keys = self.meta.camera_keys
            for cam in image_keys:
                # Frames are no longer decoded here, so only transform camera
                # keys that are actually present in the item.
                if cam in item:
                    item[cam] = self.image_transforms(item[cam])
        # Add task as a string
        task_idx = item["task_index"].item()
        item["task"] = self.meta.tasks[task_idx]

        return item

    # Previous __getitem__, kept for reference while the collate-based
    # decoding path is validated.
    def __getitem2__(self, idx) -> dict:
        item = self.hf_dataset[idx]
        ep_idx = item["episode_index"].item()
        query_indices = None
        if self.delta_indices is not None:
            current_ep_idx = self.episodes.index(ep_idx) if self.episodes is not None else ep_idx
@@ -677,7 +716,6 @@ class LeRobotDataset(torch.utils.data.Dataset):
        item["task"] = self.meta.tasks[task_idx]

        return item

    def __repr__(self):
        feature_keys = list(self.features)
        return (
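
For context, this is how a consumer of the new `__getitem__` could decode frames on demand from the metadata-only item; a minimal sketch, assuming an already-constructed LeRobotDataset named `dataset` and a single "observation.images.top" video key:

from pathlib import Path
from lerobot.common.datasets.video_utils import decode_video_frames_torchvision

item = dataset[0]
# The item now carries pointers to the frames instead of decoded tensors, e.g.:
#   item["video_paths"]      -> {"observation.images.top": Path("videos/... .mp4")}
#   item["query_timestamps"] -> {"observation.images.top": [1.23]}
for vid_key, video_path in item["video_paths"].items():
    item[vid_key] = decode_video_frames_torchvision(
        video_path=Path(video_path),
        timestamps=item["query_timestamps"][vid_key],
        tolerance_s=0.02,
        backend="pyav",
    )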


@@ -23,7 +23,7 @@ import torch
from termcolor import colored
from torch.amp import GradScaler
from torch.optim import Optimizer
from pathlib import Path
from lerobot.common.datasets.factory import make_dataset
from lerobot.common.datasets.sampler import EpisodeAwareSampler
from lerobot.common.datasets.utils import cycle
@@ -51,6 +51,60 @@ from lerobot.common.utils.wandb_utils import WandBLogger
from lerobot.configs import parser
from lerobot.configs.train import TrainPipelineConfig
from lerobot.scripts.eval import eval_policy
from lerobot.common.datasets.video_utils import decode_video_frames_torchvision

# Alternative torchcodec backend, kept for experimentation (see the
# commented-out lines in custom_collate_fn below).
# from torchcodec.decoders import VideoDecoder


def custom_collate_fn(batch):
    """
    Custom collate function that decodes video frames on CPU inside the
    dataloader workers, while keeping the rest of the batch format unchanged.
    """
    batched_frames = {}  # Decoded video tensors, keyed by video/camera key.
    final_batch = {}  # Everything else in the batch.

    # Initialize final_batch with all original keys except the video metadata.
    for key in batch[0].keys():
        if key not in ["video_paths", "query_timestamps"]:
            final_batch[key] = [item[key] for item in batch]

    # Decode the requested frames for each item in the batch.
    for item in batch:
        if "video_paths" in item and "query_timestamps" in item:
            for vid_key, video_path in item["video_paths"].items():
                timestamps = item["query_timestamps"][vid_key]
                # torchcodec alternative:
                # decoder = VideoDecoder(str(video_path), device="cpu")
                # frames = decoder.get_frames_played_at(timestamps).data.float() / 255
                frames = decode_video_frames_torchvision(
                    video_path=Path(video_path),
                    timestamps=timestamps,
                    tolerance_s=0.02,  # Adjust tolerance if needed.
                    backend="pyav",  # Default backend (modify if needed).
                    log_loaded_timestamps=False,
                )
                batched_frames.setdefault(vid_key, []).append(frames)

    # Stack the per-key lists into batch tensors.
    for key in batched_frames:
        batched_frames[key] = torch.stack(batched_frames[key])
    for key in final_batch:
        if isinstance(final_batch[key][0], torch.Tensor):
            final_batch[key] = torch.stack(final_batch[key])

    # Merge decoded frames back into the batch under their own keys
    # (e.g. "observation.images.top") instead of hard-coding a single key.
    for vid_key, frames in batched_frames.items():
        final_batch[vid_key] = frames

    return final_batch
def update_policy(
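
As an aside, the torchcodec path that is commented out inside custom_collate_fn would look roughly like the following helper; a sketch, assuming torchcodec's VideoDecoder and its get_frames_played_at method, which returns a frame batch with uint8 data:

from torchcodec.decoders import VideoDecoder

def decode_with_torchcodec(video_path, timestamps):
    # Hypothetical helper mirroring the commented-out lines above.
    decoder = VideoDecoder(str(video_path), device="cpu")  # CPU decoding
    frames = decoder.get_frames_played_at(timestamps)
    # Convert uint8 frames in [0, 255] to float32 in [0, 1], matching
    # what decode_video_frames_torchvision returns.
    return frames.data.float() / 255
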
@@ -182,12 +236,11 @@ def train(cfg: TrainPipelineConfig):
        shuffle=shuffle,
        sampler=sampler,
        pin_memory=device.type != "cpu",
        collate_fn=custom_collate_fn,
        drop_last=False,
    )
    dl_iter = cycle(dataloader)

    policy.train()

    train_metrics = {
        "loss": AverageMeter("loss", ":.3f"),
        "grad_norm": AverageMeter("grdn", ":.3f"),
@@ -205,7 +258,6 @@ def train(cfg: TrainPipelineConfig):
        start_time = time.perf_counter()
        batch = next(dl_iter)
        train_tracker.dataloading_s = time.perf_counter() - start_time

        for key in batch:
            if isinstance(batch[key], torch.Tensor):
                batch[key] = batch[key].to(device, non_blocking=True)
@@ -231,6 +283,7 @@ def train(cfg: TrainPipelineConfig):
        if is_log_step:
            logging.info(train_tracker)

            if wandb_logger:
                wandb_log_dict = train_tracker.to_dict()
                if output_dict:
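
Putting it together, the collate function plugs into the dataloader like this; a minimal sketch, assuming `dataset` yields the metadata-only items from the first file:

import torch
from torch.utils.data import DataLoader

dataloader = DataLoader(
    dataset,
    batch_size=8,
    shuffle=True,
    num_workers=4,  # video decoding now runs in the worker processes
    pin_memory=torch.cuda.is_available(),
    collate_fn=custom_collate_fn,
    drop_last=False,
)

batch = next(iter(dataloader))
# batch["observation.images.top"] is a stacked float tensor of decoded frames.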