Address comments

parent edcbd4005c
commit 63cf6fadb2
@@ -45,8 +45,6 @@ In this benchmark, we focus on the loading time of random access, so we are not
 
 - `2_frames`: 2 consecutive frames (e.g. `[t, t + 1 / fps]`),
 - `2_frames_4_space`: 2 consecutive frames with 4 frames of spacing (e.g. `[t, t + 4 / fps]`),
 
 **Data augmentations**
 
 We might revisit this benchmark and find better settings if we train our policies with various data augmentations to make them more robust (e.g. robust to color changes, compression, etc.).
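
For reference, the kind of pipeline this paragraph alludes to might look like the sketch below; the transforms and parameter values are illustrative assumptions, not settings used by the benchmark.

```python
import torchvision.transforms as T

# Illustrative augmentations targeting color changes and compression-like degradation.
augment = T.Compose(
    [
        T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.05),
        T.GaussianBlur(kernel_size=5, sigma=(0.1, 2.0)),  # crude stand-in for compression artifacts
    ]
)

# frames: float tensor in [0, 1] with shape (b, c, h, w), as returned by the decoders
# augmented = augment(frames)
```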

@@ -1,230 +0,0 @@

"""This file contains work-in-progress alternative to default decoding strategy."""

import einops
import torch


def decode_video_frames_ffmpegio(video_path, timestamps, device="cpu"):
    # assert device == "cpu", f"Only CPU decoding is supported with ffmpegio, but device is {device}"
    import einops
    import ffmpegio

    num_contiguous_frames = 1  # noqa: F841
    image_format = "rgb24"

    list_frames = []
    for timestamp in timestamps:
        kwargs = {
            "ss": str(timestamp),
            # vframes=num_contiguous_frames,
            "pix_fmt": image_format,
            # hwaccel=None if device == "cpu" else device,  # ,
            "show_log": True,
        }

        if device == "cuda":
            kwargs["hwaccel_in"] = "cuda"
            kwargs["hwaccel_output_format_in"] = "cuda"

        fs, frames = ffmpegio.video.read(str(video_path), **kwargs)
        list_frames.append(torch.from_numpy(frames))
    frames = torch.cat(list_frames)

    frames = einops.rearrange(frames, "b h w c -> b c h w")
    frames = frames.type(torch.float32) / 255
    return frames


def yuv_to_rgb(frames):
    assert frames.dtype == torch.uint8
    assert frames.ndim == 4
    assert frames.shape[1] == 3

    frames = frames.cpu().to(torch.float)
    y = frames[..., 0, :, :]
    u = frames[..., 1, :, :]
    v = frames[..., 2, :, :]

    y /= 255
    u = u / 255 - 0.5
    v = v / 255 - 0.5

    r = y + 1.13983 * v
    g = y + -0.39465 * u - 0.58060 * v
    b = y + 2.03211 * u

    rgb = torch.stack([r, g, b], 1)
    rgb = (rgb * 255).clamp(0, 255).to(torch.uint8)
    return rgb
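
# Sanity-check sketch (not part of the original file): the constants above are the
# standard analog YUV-to-RGB conversion coefficients, so a zero-chroma white frame
# should map back to near-white RGB.
frame = torch.empty(1, 3, 2, 2, dtype=torch.uint8)
frame[:, 0] = 255  # y: full luma
frame[:, 1] = 128  # u: zero chroma after the -0.5 shift
frame[:, 2] = 128  # v: zero chroma after the -0.5 shift
print(yuv_to_rgb(frame)[0, :, 0, 0])  # ~tensor([255, 254, 255], dtype=torch.uint8)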

def yuv_to_rgb_cv2(frames, return_hwc=True):
    assert frames.dtype == torch.uint8
    assert frames.ndim == 4
    assert frames.shape[1] == 3
    frames = frames.cpu()
    import cv2

    frames = einops.rearrange(frames, "b c h w -> b h w c")
    frames = frames.numpy()
    frames = [cv2.cvtColor(frame, cv2.COLOR_YUV2RGB) for frame in frames]
    frames = [torch.from_numpy(frame) for frame in frames]
    frames = torch.stack(frames)
    if not return_hwc:
        frames = einops.rearrange(frames, "b h w c -> b c h w")
    return frames


def decode_video_frames_torchaudio(video_path, timestamps, device="cpu"):
    num_contiguous_frames = 1
    width = None
    height = None
    # image_format = "rgb"  # or "yuv"
    # image_format = None
    image_format = "yuv444p"
    # image_format = "yuv444p"
    # image_format = "rgb24"
    frame_rate = None

    scale_full_range_filter = False

    filter_desc = []

    video_stream_kwgs = {
        "frames_per_chunk": num_contiguous_frames,
        # "buffer_chunk_size": num_contiguous_frames,
    }

    # choice of decoder
    if device == "cuda":
        video_stream_kwgs["hw_accel"] = "cuda:0"
        video_stream_kwgs["decoder"] = "h264_cuvid"
        # video_stream_kwgs["decoder"] = "hevc_cuvid"
        # video_stream_kwgs["decoder"] = "av1_cuvid"
        # video_stream_kwgs["decoder"] = "ffv1_cuvid"
    else:
        video_stream_kwgs["decoder"] = "h264"
        # video_stream_kwgs["decoder"] = "hevc"
        # video_stream_kwgs["decoder"] = "av1"
        # video_stream_kwgs["decoder"] = "ffv1"

    # resize
    resize_width = width is not None
    resize_height = height is not None
    if resize_width or resize_height:
        if device == "cuda":
            assert resize_width and resize_height
            video_stream_kwgs["decoder_option"] = {"resize": f"{width}x{height}"}
        else:
            scales = []
            if resize_width:
                scales.append(f"width={width}")
            if resize_height:
                scales.append(f"height={height}")
            filter_desc.append(f"scale={':'.join(scales)}")

    # choice of format
    if image_format is not None:
        if device == "cuda":
            # TODO(rcadene): rebuild ffmpeg with --enable-cuda-nvcc, --enable-cuvid, and --enable-libnpp
            # filter_desc.append(f"scale=format={image_format}")
            # filter_desc.append(f"scale_cuda=format={image_format}")
            # filter_desc.append(f"scale_npp=format={image_format}")
            filter_desc.append(f"format=pix_fmts={image_format}")
        else:
            filter_desc.append(f"format=pix_fmts={image_format}")

    # choice of frame rate
    if frame_rate is not None:
        filter_desc.append(f"fps={frame_rate}")

    # to set output scale [0-255] instead of [16-235]
    if scale_full_range_filter:
        filter_desc.append("scale=in_range=limited:out_range=full")

    if len(filter_desc) > 0:
        video_stream_kwgs["filter_desc"] = ",".join(filter_desc)

    # create a stream and load a certain number of frames at a certain frame rate
    # TODO(rcadene): make sure it's the most optimal way to do it
    from torchaudio.io import StreamReader

    print(video_stream_kwgs)

    list_frames = []
    for timestamp in timestamps:
        s = StreamReader(str(video_path))
        s.seek(timestamp)
        s.add_video_stream(**video_stream_kwgs)
        s.fill_buffer()
        (frames,) = s.pop_chunks()

        if "yuv" in image_format:
            frames = yuv_to_rgb(frames)

        assert frames.dtype == torch.uint8
        frames = frames.type(torch.float32)

        # if device == "cuda":
        # The original data had limited range, which is 16-235, and torchaudio does not convert,
        # while FFmpeg converts it to full range 0-255. So you can apply a linear transformation.
        if not scale_full_range_filter:
            frames -= 16
            frames *= 255 / (235 - 16)

        frames /= 255

        frames = frames.clip(0, 1)
        list_frames.append(frames)

    frames = torch.cat(list_frames)
    return frames
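
# Sanity-check sketch (not part of the original file): the limited-to-full range
# correction above is a linear map from the studio-swing interval [16, 235] to
# [0, 255]; checking the endpoints and the midpoint:
for x in (16.0, 125.5, 235.0):
    print(x, (x - 16) * 255 / (235 - 16))  # -> 0.0, 127.5, 255.0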

# def _decode_frames_decord(video_path, timestamp):
#     num_contiguous_frames = 1  # noqa: F841 TODO(rcadene): remove
#     device = "cpu"

#     from decord import VideoReader, cpu, gpu

#     with open(str(video_path), "rb") as f:
#         ctx = gpu if device == "cuda" else cpu
#         vr = VideoReader(f, ctx=ctx(0))  # noqa: F841
#     raise NotImplementedError("Convert `timestamp` into frame_id")
#     # frame_id = frame_ids[0].item()
#     # frames = vr.get_batch([frame_id])
#     # frames = torch.from_numpy(frames.asnumpy())
#     # frames = einops.rearrange(frames, "b h w c -> b c h w")
#     # return frames


# def decode_frames_nvc(video_path, timestamps, device="cuda"):
#     assert device == "cuda"

#     import PyNvCodec as nvc
#     import PytorchNvCodec as pnvc

#     gpuID = 0

#     nvDec = nvc.PyNvDecoder('path_to_video_file', gpuID)
#     to_rgb = nvc.PySurfaceConverter(nvDec.Width(), nvDec.Height(), nvc.PixelFormat.NV12, nvc.PixelFormat.RGB, gpuID)
#     to_planar = nvc.PySurfaceConverter(nvDec.Width(), nvDec.Height(), nvc.PixelFormat.RGB, nvc.PixelFormat.RGB_PLANAR, gpuID)

#     while True:
#         # Obtain NV12 decoded surface from decoder;
#         rawSurface = nvDec.DecodeSingleSurface()
#         if (rawSurface.Empty()):
#             break

#         # Convert to RGB interleaved;
#         rgb_byte = to_rgb.Execute(rawSurface)

#         # Convert to RGB planar because that's what to_tensor + normalize are doing;
#         rgb_planar = to_planar.Execute(rgb_byte)

#         # Create torch tensor from it and reshape because
#         # pnvc.makefromDevicePtrUint8 creates just a chunk of CUDA memory
#         # and then copies data from plane pointer to allocated chunk;
#         surfPlane = rgb_planar.PlanePtr()
#         surface_tensor = pnvc.makefromDevicePtrUint8(surfPlane.GpuMem(), surfPlane.Width(), surfPlane.Height(), surfPlane.Pitch(), surfPlane.ElemSize())
#         surface_tensor.resize_(3, target_h, target_w)

@@ -11,10 +11,6 @@ import numpy
 import PIL
 import torch
 
-from lerobot.common.datasets._video_benchmark._video_utils import (
-    decode_video_frames_ffmpegio,
-    decode_video_frames_torchaudio,
-)
 from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
 from lerobot.common.datasets.video_utils import (
     decode_video_frames_torchvision,

@@ -101,11 +97,7 @@ def run_video_benchmark(
     decoder_kwgs = cfg["decoder_kwgs"]
     device = cfg["device"]
 
-    if decoder == "torchaudio":
-        decode_frames_fn = decode_video_frames_torchaudio
-    elif decoder == "ffmpegio":
-        decode_frames_fn = decode_video_frames_ffmpegio
-    elif decoder == "torchvision":
+    if decoder == "torchvision":
         decode_frames_fn = decode_video_frames_torchvision
     else:
         raise ValueError(decoder)

@@ -231,6 +223,7 @@ def load_info(out_dir):
 
 
 def main():
+    out_dir = Path("tmp/run_video_benchmark")
     dry_run = False
     repo_ids = ["lerobot/pusht", "lerobot/umi_cup_in_the_wild"]
     timestamps_modes = [

@@ -240,32 +233,11 @@ def main():
         "6_frames",
     ]
     for timestamps_mode in timestamps_modes:
-        bench_dir = Path(f"tmp/2024_05_01_{timestamps_mode}")
+        bench_dir = out_dir / timestamps_mode
 
         print(f"### `{timestamps_mode}`")
         print()
 
-        # print("**`decoder`**")
-        # headers = ["repo_id", "decoder", "load_time_factor", "avg_per_pixel_l2_error"]
-        # rows = []
-        # for repo_id in repo_ids:
-        #     for decoder in ["torchvision", "ffmpegio", "torchaudio"]:
-        #         cfg = {
-        #             "repo_id": repo_id,
-        #             # video encoding
-        #             "pix_fmt": "yuv444p",
-        #             # video decoding
-        #             "device": "cpu",
-        #             "decoder": decoder,
-        #             "decoder_kwgs": {},
-        #         }
-
-        #         if not dry_run:
-        #             run_video_benchmark(bench_dir / repo_id / decoder, cfg, timestamps_mode)
-        #         info = load_info(bench_dir / repo_id / decoder)
-        #         rows.append([repo_id, decoder, info["load_time_factor"], info["avg_per_pixel_l2_error"]])
-        #         display_markdown_table(headers, rows)
-
         print("**`pix_fmt`**")
         headers = ["repo_id", "pix_fmt", "compression_factor", "load_time_factor", "avg_per_pixel_l2_error"]
         rows = []
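
`display_markdown_table` is presumably defined elsewhere in the benchmark script; a minimal sketch of what such a helper could look like (assumed shape, the actual implementation may differ):

```python
def display_markdown_table(headers, rows):
    # Print a GitHub-flavored markdown table from header names and row values.
    print("| " + " | ".join(headers) + " |")
    print("|" + "|".join(["---"] * len(headers)) + "|")
    for row in rows:
        print("| " + " | ".join(str(cell) for cell in row) + " |")
    print()
```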

@@ -49,6 +49,9 @@ class LeRobotDataset(torch.utils.data.Dataset):
 
     @property
     def video(self) -> int:
+        """Returns True if this dataset loads video frames from mp4 files.
+        Returns False if it only loads images from png files.
+        """
         return self.info.get("video", False)
 
     @property
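
Hypothetical usage of the property documented above (the constructor call is an assumption for illustration):

```python
dataset = LeRobotDataset("lerobot/pusht")
if dataset.video:
    print("observations are decoded on the fly from mp4 files")
else:
    print("observations are loaded from individual png files")
```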

@@ -55,7 +55,7 @@ def decode_video_frames_torchvision(
 
     Note: Video benefits from inter-frame compression. Instead of storing every frame individually,
     the encoder stores a reference frame (or a key frame) and subsequent frames as differences relative to
-    that key frame. As a consequence, to access a requested frame, we need to load the preceeding key frame,
+    that key frame. As a consequence, to access a requested frame, we need to load the preceding key frame,
     and all subsequent frames until reaching the requested frame. The number of key frames in a video
     can be adjusted during encoding to take into account decoding time and video size in bytes.
     """
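
Since the docstring points out that the number of key frames is an encoding-time trade-off between decoding speed and file size, here is a sketch of tuning it with ffmpeg's `-g` (GOP size) option; the file names and the value 5 are illustrative.

```python
import subprocess

# Insert a key frame every 5 frames: faster random access, larger file.
subprocess.run(
    ["ffmpeg", "-i", "input.mp4", "-c:v", "libx264", "-g", "5", "output_g5.mp4"],
    check=True,
)
```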

@@ -73,7 +73,9 @@ def decode_video_frames_torchvision(
         # torchvision.set_video_backend("video_reader")
         # requires installing torchvision from source, see: https://github.com/pytorch/vision/blob/main/torchvision/csrc/io/decoder/gpu/README.rst
         # check possible bug: https://github.com/pytorch/vision/issues/7745
-        raise NotImplementedError()
+        raise NotImplementedError(
+            "Video decoding on gpu with cuda is currently not supported. Use `device='cpu'`."
+        )
     else:
         raise ValueError(device)
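
For context, a rough sketch of what seeking and decoding on cpu with torchvision's `VideoReader` looks like; the actual `decode_video_frames_torchvision` also handles timestamp tolerances and backend selection, so treat this as an approximation.

```python
import torch
import torchvision

def read_frames_at(video_path, timestamps):
    reader = torchvision.io.VideoReader(str(video_path), "video")
    frames = []
    for ts in timestamps:
        reader.seek(ts)
        frame = next(reader)  # dict with "data" (c, h, w uint8) and "pts" (seconds)
        frames.append(frame["data"])
    return torch.stack(frames).type(torch.float32) / 255
```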