diff --git a/lerobot/common/datasets/_video_benchmark/README.md b/lerobot/common/datasets/_video_benchmark/README.md
index 95636d21..10e8d12f 100644
--- a/lerobot/common/datasets/_video_benchmark/README.md
+++ b/lerobot/common/datasets/_video_benchmark/README.md
@@ -45,8 +45,6 @@ In this benchmark, we focus on the loading time of random access, so we are not
 - `2_frames`: 2 consecutive frames (e.g. `[t, t + 1 / fps]`),
 - `2_frames_4_space`: 2 consecutive frames with 4 frames of spacing (e.g. `[t, t + 4 / fps]`),
 
-
-
 **Data augmentations**
 We might revisit this benchmark and find better settings if we train our policies with various data augmentations to make them more robust (e.g. robust to color changes, compression, etc.).
 
diff --git a/lerobot/common/datasets/_video_benchmark/_video_utils.py b/lerobot/common/datasets/_video_benchmark/_video_utils.py
deleted file mode 100644
index 59247fb0..00000000
--- a/lerobot/common/datasets/_video_benchmark/_video_utils.py
+++ /dev/null
@@ -1,230 +0,0 @@
-"""This file contains work-in-progress alternative to default decoding strategy."""
-
-import einops
-import torch
-
-
-def decode_video_frames_ffmpegio(video_path, timestamps, device="cpu"):
-    # assert device == "cpu", f"Only CPU decoding is supported with ffmpegio, but device is {device}"
-    import einops
-    import ffmpegio
-
-    num_contiguous_frames = 1  # noqa: F841
-    image_format = "rgb24"
-
-    list_frames = []
-    for timestamp in timestamps:
-        kwargs = {
-            "ss": str(timestamp),
-            # vframes=num_contiguous_frames,
-            "pix_fmt": image_format,
-            # hwaccel=None if device == "cpu" else device,  # ,
-            "show_log": True,
-        }
-
-        if device == "cuda":
-            kwargs["hwaccel_in"] = "cuda"
-            kwargs["hwaccel_output_format_in"] = "cuda"
-
-        fs, frames = ffmpegio.video.read(str(video_path), **kwargs)
-        list_frames.append(torch.from_numpy(frames))
-    frames = torch.cat(list_frames)
-
-    frames = einops.rearrange(frames, "b h w c -> b c h w")
-    frames = frames.type(torch.float32) / 255
-    return frames
-
-
-def yuv_to_rgb(frames):
-    assert frames.dtype == torch.uint8
-    assert frames.ndim == 4
-    assert frames.shape[1] == 3
-
-    frames = frames.cpu().to(torch.float)
-    y = frames[..., 0, :, :]
-    u = frames[..., 1, :, :]
-    v = frames[..., 2, :, :]
-
-    y /= 255
-    u = u / 255 - 0.5
-    v = v / 255 - 0.5
-
-    r = y + 1.13983 * v
-    g = y + -0.39465 * u - 0.58060 * v
-    b = y + 2.03211 * u
-
-    rgb = torch.stack([r, g, b], 1)
-    rgb = (rgb * 255).clamp(0, 255).to(torch.uint8)
-    return rgb
-
-
-def yuv_to_rgb_cv2(frames, return_hwc=True):
-    assert frames.dtype == torch.uint8
-    assert frames.ndim == 4
-    assert frames.shape[1] == 3
-    frames = frames.cpu()
-    import cv2
-
-    frames = einops.rearrange(frames, "b c h w -> b h w c")
-    frames = frames.numpy()
-    frames = [cv2.cvtColor(frame, cv2.COLOR_YUV2RGB) for frame in frames]
-    frames = [torch.from_numpy(frame) for frame in frames]
-    frames = torch.stack(frames)
-    if not return_hwc:
-        frames = einops.rearrange(frames, "b h w c -> b c h w")
-    return frames
-
-
-def decode_video_frames_torchaudio(video_path, timestamps, device="cpu"):
-    num_contiguous_frames = 1
-    width = None
-    height = None
-    # image_format = "rgb"  # or "yuv"
-    # image_format = None
-    image_format = "yuv444p"
-    # image_format = "yuv444p"
-    # image_format = "rgb24"
-    frame_rate = None
-
-    scale_full_range_filter = False
-
-    filter_desc = []
-
-    video_stream_kwgs = {
-        "frames_per_chunk": num_contiguous_frames,
-        # "buffer_chunk_size": num_contiguous_frames,
-    }
-
-    # choice of decoder
-    if device == "cuda":
video_stream_kwgs["hw_accel"] = "cuda:0" - video_stream_kwgs["decoder"] = "h264_cuvid" - # video_stream_kwgs["decoder"] = "hevc_cuvid" - # video_stream_kwgs["decoder"] = "av1_cuvid" - # video_stream_kwgs["decoder"] = "ffv1_cuvid" - else: - video_stream_kwgs["decoder"] = "h264" - # video_stream_kwgs["decoder"] = "hevc" - # video_stream_kwgs["decoder"] = "av1" - # video_stream_kwgs["decoder"] = "ffv1" - - # resize - resize_width = width is not None - resize_height = height is not None - if resize_width or resize_height: - if device == "cuda": - assert resize_width and resize_height - video_stream_kwgs["decoder_option"] = {"resize": f"{width}x{height}"} - else: - scales = [] - if resize_width: - scales.append(f"width={width}") - if resize_height: - scales.append(f"height={height}") - filter_desc.append(f"scale={':'.join(scales)}") - - # choice of format - if image_format is not None: - if device == "cuda": - # TODO(rcadene): rebuild ffmpeg with --enable-cuda-nvcc, --enable-cuvid, and --enable-libnpp - # filter_desc.append(f"scale=format={image_format}") - # filter_desc.append(f"scale_cuda=format={image_format}") - # filter_desc.append(f"scale_npp=format={image_format}") - filter_desc.append(f"format=pix_fmts={image_format}") - else: - filter_desc.append(f"format=pix_fmts={image_format}") - - # choice of frame rate - if frame_rate is not None: - filter_desc.append(f"fps={frame_rate}") - - # to set output scale [0-255] instead of [16-235] - if scale_full_range_filter: - filter_desc.append("scale=in_range=limited:out_range=full") - - if len(filter_desc) > 0: - video_stream_kwgs["filter_desc"] = ",".join(filter_desc) - - # create a stream and load a certain number of frame at a certain frame rate - # TODO(rcadene): make sure it's the most optimal way to do it - from torchaudio.io import StreamReader - - print(video_stream_kwgs) - - list_frames = [] - for timestamp in timestamps: - s = StreamReader(str(video_path)) - s.seek(timestamp) - s.add_video_stream(**video_stream_kwgs) - s.fill_buffer() - (frames,) = s.pop_chunks() - - if "yuv" in image_format: - frames = yuv_to_rgb(frames) - - assert frames.dtype == torch.uint8 - frames = frames.type(torch.float32) - - # if device == "cuda": - # The original data had limited range, which is 16-235, and torchaudio does not convert, - # while FFmpeg converts it to full range 0-255. So you can apply a linear transformation. 
-        if not scale_full_range_filter:
-            frames -= 16
-            frames *= 255 / (235 - 16)
-
-        frames /= 255
-
-        frames = frames.clip(0, 1)
-        list_frames.append(frames)
-
-    frames = torch.cat(list_frames)
-    return frames
-
-
-# def _decode_frames_decord(video_path, timestamp):
-#     num_contiguous_frames = 1  # noqa: F841 TODO(rcadene): remove
-#     device = "cpu"
-
-#     from decord import VideoReader, cpu, gpu
-
-#     with open(str(video_path), "rb") as f:
-#         ctx = gpu if device == "cuda" else cpu
-#         vr = VideoReader(f, ctx=ctx(0))  # noqa: F841
-#     raise NotImplementedError("Convert `timestamp` into frame_id")
-#     # frame_id = frame_ids[0].item()
-#     # frames = vr.get_batch([frame_id])
-#     # frames = torch.from_numpy(frames.asnumpy())
-#     # frames = einops.rearrange(frames, "b h w c -> b c h w")
-#     # return frames
-
-
-# def decode_frames_nvc(video_path, timestamps, device="cuda"):
-#     assert device == "cuda"
-
-#     import PyNvCodec as nvc
-#     import PytorchNvCodec as pnvc
-
-#     gpuID = 0
-
-#     nvDec = nvc.PyNvDecoder('path_to_video_file', gpuID)
-#     to_rgb = nvc.PySurfaceConverter(nvDec.Width(), nvDec.Height(), nvc.PixelFormat.NV12, nvc.PixelFormat.RGB, gpuID)
-#     to_planar = nvc.PySurfaceConverter(nvDec.Width(), nvDec.Height(), nvc.PixelFormat.RGB, nvc.PixelFormat.RGB_PLANAR, gpuID)
-
-#     while True:
-#         # Obtain NV12 decoded surface from decoder;
-#         rawSurface = nvDec.DecodeSingleSurface()
-#         if (rawSurface.Empty()):
-#             break
-
-#         # Convert to RGB interleaved;
-#         rgb_byte = to_rgb.Execute(rawSurface)
-
-#         # Convert to RGB planar because that's what to_tensor + normalize are doing;
-#         rgb_planar = to_planar.Execute(rgb_byte)
-
-#         # Create torch tensor from it and reshape because
-#         # pnvc.makefromDevicePtrUint8 creates just a chunk of CUDA memory
-#         # and then copies data from plane pointer to allocated chunk;
-#         surfPlane = rgb_planar.PlanePtr()
-#         surface_tensor = pnvc.makefromDevicePtrUint8(surfPlane.GpuMem(), surfPlane.Width(), surfPlane.Height(), surfPlane.Pitch(), surfPlane.ElemSize())
-#         surface_tensor.resize_(3, target_h, target_w)
diff --git a/lerobot/common/datasets/_video_benchmark/run_video_benchmark.py b/lerobot/common/datasets/_video_benchmark/run_video_benchmark.py
index 9c687a36..b6e83a0c 100644
--- a/lerobot/common/datasets/_video_benchmark/run_video_benchmark.py
+++ b/lerobot/common/datasets/_video_benchmark/run_video_benchmark.py
@@ -11,10 +11,6 @@
 import numpy
 import PIL
 import torch
 
-from lerobot.common.datasets._video_benchmark._video_utils import (
-    decode_video_frames_ffmpegio,
-    decode_video_frames_torchaudio,
-)
 from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
 from lerobot.common.datasets.video_utils import (
     decode_video_frames_torchvision,
@@ -101,11 +97,7 @@ def run_video_benchmark(
     decoder_kwgs = cfg["decoder_kwgs"]
     device = cfg["device"]
 
-    if decoder == "torchaudio":
-        decode_frames_fn = decode_video_frames_torchaudio
-    elif decoder == "ffmpegio":
-        decode_frames_fn = decode_video_frames_ffmpegio
-    elif decoder == "torchvision":
+    if decoder == "torchvision":
         decode_frames_fn = decode_video_frames_torchvision
     else:
         raise ValueError(decoder)
@@ -231,6 +223,7 @@ def load_info(out_dir):
 
 
 def main():
+    out_dir = Path("tmp/run_video_benchmark")
     dry_run = False
     repo_ids = ["lerobot/pusht", "lerobot/umi_cup_in_the_wild"]
     timestamps_modes = [
         "1_frame",
         "2_frames",
         "2_frames_4_space",
@@ -240,32 +233,11 @@
         "6_frames",
     ]
     for timestamps_mode in timestamps_modes:
-        bench_dir = Path(f"tmp/2024_05_01_{timestamps_mode}")
+        bench_dir = out_dir / timestamps_mode
 
         print(f"### `{timestamps_mode}`")
         print()
 
-        # print("**`decoder`**")
print("**`decoder`**") - # headers = ["repo_id", "decoder", "load_time_factor", "avg_per_pixel_l2_error"] - # rows = [] - # for repo_id in repo_ids: - # for decoder in ["torchvision", "ffmpegio", "torchaudio"]: - # cfg = { - # "repo_id": repo_id, - # # video encoding - # "pix_fmt": "yuv444p", - # # video decoding - # "device": "cpu", - # "decoder": decoder, - # "decoder_kwgs": {}, - # } - - # if not dry_run: - # run_video_benchmark(bench_dir / repo_id / decoder, cfg, timestamps_mode) - # info = load_info(bench_dir / repo_id / decoder) - # rows.append([repo_id, decoder, info["load_time_factor"], info["avg_per_pixel_l2_error"]]) - # display_markdown_table(headers, rows) - print("**`pix_fmt`**") headers = ["repo_id", "pix_fmt", "compression_factor", "load_time_factor", "avg_per_pixel_l2_error"] rows = [] diff --git a/lerobot/common/datasets/lerobot_dataset.py b/lerobot/common/datasets/lerobot_dataset.py index 220ea39b..faa19ac4 100644 --- a/lerobot/common/datasets/lerobot_dataset.py +++ b/lerobot/common/datasets/lerobot_dataset.py @@ -49,6 +49,9 @@ class LeRobotDataset(torch.utils.data.Dataset): @property def video(self) -> int: + """Returns True if this dataset loads video frames from mp4 files. + Returns False if it only loads images from png files. + """ return self.info.get("video", False) @property diff --git a/lerobot/common/datasets/video_utils.py b/lerobot/common/datasets/video_utils.py index 3861f574..294f39f9 100644 --- a/lerobot/common/datasets/video_utils.py +++ b/lerobot/common/datasets/video_utils.py @@ -55,7 +55,7 @@ def decode_video_frames_torchvision( Note: Video benefits from inter-frame compression. Instead of storing every frame individually, the encoder stores a reference frame (or a key frame) and subsequent frames as differences relative to - that key frame. As a consequence, to access a requested frame, we need to load the preceeding key frame, + that key frame. As a consequence, to access a requested frame, we need to load the preceding key frame, and all subsequent frames until reaching the requested frame. The number of key frames in a video can be adjusted during encoding to take into account decoding time and video size in bytes. """ @@ -73,7 +73,9 @@ def decode_video_frames_torchvision( # torchvision.set_video_backend("video_reader") # requires installing torchvision from source, see: https://github.com/pytorch/vision/blob/main/torchvision/csrc/io/decoder/gpu/README.rst # check possible bug: https://github.com/pytorch/vision/issues/7745 - raise NotImplementedError() + raise NotImplementedError( + "Video decoding on gpu with cuda is currently not supported. Use `device='cpu'`." + ) else: raise ValueError(device)