diff --git a/benchmarks/video/run_video_benchmark.py b/benchmarks/video/run_video_benchmark.py index e9066487..92b9df9e 100644 --- a/benchmarks/video/run_video_benchmark.py +++ b/benchmarks/video/run_video_benchmark.py @@ -39,6 +39,7 @@ from lerobot.common.datasets.lerobot_dataset import LeRobotDataset from lerobot.common.datasets.video_utils import ( decode_video_frames_torchvision, encode_video_frames, + decode_video_frames_torchcodec, ) from lerobot.common.utils.benchmark import TimeBenchmark @@ -67,10 +68,6 @@ def parse_int_or_none(value) -> int | None: def check_datasets_formats(repo_ids: list) -> None: for repo_id in repo_ids: dataset = LeRobotDataset(repo_id) - if dataset.video: - raise ValueError( - f"Use only image dataset for running this benchmark. Video dataset provided: {repo_id}" - ) def get_directory_size(directory: Path) -> int: @@ -155,6 +152,10 @@ def decode_video_frames( ) -> torch.Tensor: if backend in ["pyav", "video_reader"]: return decode_video_frames_torchvision(video_path, timestamps, tolerance_s, backend) + elif backend in ["torchcodec-cpu", "torchcodec-gpu"]: + # Only pass device once depending on the backend + device = "cpu" if backend == "torchcodec-cpu" else "cuda" + return decode_video_frames_torchcodec(video_path, timestamps, tolerance_s, device=device) else: raise NotImplementedError(backend) @@ -188,7 +189,7 @@ def benchmark_decoding( original_frames = load_original_frames(imgs_dir, timestamps, fps) result["load_time_images_ms"] = time_benchmark.result_ms / num_frames - frames_np, original_frames_np = frames.numpy(), original_frames.numpy() + frames_np, original_frames_np = frames.cpu().numpy(), original_frames.cpu().numpy() for i in range(num_frames): result["mse_values"].append(mean_squared_error(original_frames_np[i], frames_np[i])) result["psnr_values"].append( diff --git a/lerobot/scripts/train.py b/lerobot/scripts/train.py index 2b6f7d85..59744c7e 100644 --- a/lerobot/scripts/train.py +++ b/lerobot/scripts/train.py @@ -52,10 +52,9 @@ from lerobot.configs import parser from lerobot.configs.train import TrainPipelineConfig from lerobot.scripts.eval import eval_policy from lerobot.common.datasets.video_utils import ( - decode_video_frames_torchvision + decode_video_frames_torchvision, decode_video_frames_torchcodec ) # let's define a custom fn -from torchcodec.decoders import VideoDecoder def custom_collate_fn(batch): """ @@ -81,7 +80,7 @@ def custom_collate_fn(batch): ) # stack frames for this video key and add directly to the item - item[vid_key] = torch.stack(frames) + item[vid_key] = frames # add item data (both video and non-video) to final_batch for key, value in item.items():