Address comments

Cadene 2024-05-01 23:56:11 +00:00
parent edcbd4005c
commit 63cf6fadb2
5 changed files with 10 additions and 265 deletions

View File

@ -45,8 +45,6 @@ In this benchmark, we focus on the loading time of random access, so we are not
- `2_frames`: 2 consecutive frames (e.g. `[t, t + 1 / fps]`),
- `2_frames_4_space`: 2 consecutive frames with 4 frames of spacing (e.g. `[t, t + 4 / fps]`),
**Data augmentations**
We might revisit this benchmark and find better settings if we train our policies with various data augmentations to make them more robust (e.g. robust to color changes, compression, etc.).
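For reference, the query timestamps for each mode can be derived from a frame time `t` and the video `fps` along these lines (a minimal sketch; the helper name and the `1_frame` baseline mode are illustrative, not part of the benchmark code):

```python
def build_query_timestamps(t: float, fps: float, mode: str) -> list[float]:
    # Map a benchmark mode to the timestamps requested from the decoder.
    if mode == "1_frame":
        return [t]
    if mode == "2_frames":
        return [t, t + 1 / fps]
    if mode == "2_frames_4_space":
        return [t, t + 4 / fps]
    if mode == "6_frames":
        return [t + i / fps for i in range(6)]
    raise ValueError(mode)
```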

View File

@ -1,230 +0,0 @@
"""This file contains work-in-progress alternative to default decoding strategy."""
import einops
import torch

def decode_video_frames_ffmpegio(video_path, timestamps, device="cpu"):
    # assert device == "cpu", f"Only CPU decoding is supported with ffmpegio, but device is {device}"
    import ffmpegio

    num_contiguous_frames = 1  # noqa: F841
    image_format = "rgb24"

    list_frames = []
    for timestamp in timestamps:
        kwargs = {
            "ss": str(timestamp),
            # vframes=num_contiguous_frames,
            "pix_fmt": image_format,
            # hwaccel=None if device == "cpu" else device,
            "show_log": True,
        }
        if device == "cuda":
            kwargs["hwaccel_in"] = "cuda"
            kwargs["hwaccel_output_format_in"] = "cuda"

        fs, frames = ffmpegio.video.read(str(video_path), **kwargs)  # fs is the frame rate
        list_frames.append(torch.from_numpy(frames))

    frames = torch.cat(list_frames)
    frames = einops.rearrange(frames, "b h w c -> b c h w")
    frames = frames.type(torch.float32) / 255
    return frames

def yuv_to_rgb(frames):
    # Convert a batch of YUV frames (b, 3, h, w) to RGB using the standard
    # analog YUV -> RGB conversion coefficients.
    assert frames.dtype == torch.uint8
    assert frames.ndim == 4
    assert frames.shape[1] == 3

    frames = frames.cpu().to(torch.float)
    y = frames[..., 0, :, :]
    u = frames[..., 1, :, :]
    v = frames[..., 2, :, :]

    y /= 255
    u = u / 255 - 0.5
    v = v / 255 - 0.5

    r = y + 1.13983 * v
    g = y - 0.39465 * u - 0.58060 * v
    b = y + 2.03211 * u

    rgb = torch.stack([r, g, b], 1)
    rgb = (rgb * 255).clamp(0, 255).to(torch.uint8)
    return rgb

def yuv_to_rgb_cv2(frames, return_hwc=True):
    # Same conversion as yuv_to_rgb, delegated to OpenCV frame by frame.
    assert frames.dtype == torch.uint8
    assert frames.ndim == 4
    assert frames.shape[1] == 3

    import cv2

    frames = frames.cpu()
    frames = einops.rearrange(frames, "b c h w -> b h w c")
    frames = frames.numpy()
    frames = [cv2.cvtColor(frame, cv2.COLOR_YUV2RGB) for frame in frames]
    frames = [torch.from_numpy(frame) for frame in frames]
    frames = torch.stack(frames)
    if not return_hwc:
        frames = einops.rearrange(frames, "b h w c -> b c h w")
    return frames

def decode_video_frames_torchaudio(video_path, timestamps, device="cpu"):
    num_contiguous_frames = 1
    width = None
    height = None
    # image_format = "rgb"  # or "yuv"
    # image_format = None
    image_format = "yuv444p"
    # image_format = "rgb24"
    frame_rate = None
    scale_full_range_filter = False

    filter_desc = []

    video_stream_kwgs = {
        "frames_per_chunk": num_contiguous_frames,
        # "buffer_chunk_size": num_contiguous_frames,
    }

    # choice of decoder
    if device == "cuda":
        video_stream_kwgs["hw_accel"] = "cuda:0"
        video_stream_kwgs["decoder"] = "h264_cuvid"
        # video_stream_kwgs["decoder"] = "hevc_cuvid"
        # video_stream_kwgs["decoder"] = "av1_cuvid"
        # video_stream_kwgs["decoder"] = "ffv1_cuvid"
    else:
        video_stream_kwgs["decoder"] = "h264"
        # video_stream_kwgs["decoder"] = "hevc"
        # video_stream_kwgs["decoder"] = "av1"
        # video_stream_kwgs["decoder"] = "ffv1"

    # resize
    resize_width = width is not None
    resize_height = height is not None
    if resize_width or resize_height:
        if device == "cuda":
            assert resize_width and resize_height
            video_stream_kwgs["decoder_option"] = {"resize": f"{width}x{height}"}
        else:
            scales = []
            if resize_width:
                scales.append(f"width={width}")
            if resize_height:
                scales.append(f"height={height}")
            filter_desc.append(f"scale={':'.join(scales)}")

    # choice of format
    if image_format is not None:
        if device == "cuda":
            # TODO(rcadene): rebuild ffmpeg with --enable-cuda-nvcc, --enable-cuvid, and --enable-libnpp
            # filter_desc.append(f"scale=format={image_format}")
            # filter_desc.append(f"scale_cuda=format={image_format}")
            # filter_desc.append(f"scale_npp=format={image_format}")
            filter_desc.append(f"format=pix_fmts={image_format}")
        else:
            filter_desc.append(f"format=pix_fmts={image_format}")

    # choice of frame rate
    if frame_rate is not None:
        filter_desc.append(f"fps={frame_rate}")

    # to set output scale [0-255] instead of [16-235]
    if scale_full_range_filter:
        filter_desc.append("scale=in_range=limited:out_range=full")

    if len(filter_desc) > 0:
        video_stream_kwgs["filter_desc"] = ",".join(filter_desc)

    # create a stream and load a certain number of frames at a certain frame rate
    # TODO(rcadene): make sure it's the most optimal way to do it
    from torchaudio.io import StreamReader

    print(video_stream_kwgs)

    list_frames = []
    for timestamp in timestamps:
        s = StreamReader(str(video_path))
        s.seek(timestamp)
        s.add_video_stream(**video_stream_kwgs)
        s.fill_buffer()
        (frames,) = s.pop_chunks()

        if "yuv" in image_format:
            frames = yuv_to_rgb(frames)

        assert frames.dtype == torch.uint8
        frames = frames.type(torch.float32)

        # The original data has limited range (16-235) and torchaudio does not convert it,
        # while FFmpeg converts to full range (0-255), so apply the linear transformation here.
        if not scale_full_range_filter:
            frames -= 16
            frames *= 255 / (235 - 16)

        frames /= 255
        frames = frames.clip(0, 1)
        list_frames.append(frames)

    frames = torch.cat(list_frames)
    return frames

# def _decode_frames_decord(video_path, timestamp):
#     num_contiguous_frames = 1  # noqa: F841 TODO(rcadene): remove
#     device = "cpu"
#     from decord import VideoReader, cpu, gpu
#
#     with open(str(video_path), "rb") as f:
#         ctx = gpu if device == "cuda" else cpu
#         vr = VideoReader(f, ctx=ctx(0))  # noqa: F841
#         raise NotImplementedError("Convert `timestamp` into frame_id")
#     # frame_id = frame_ids[0].item()
#     # frames = vr.get_batch([frame_id])
#     # frames = torch.from_numpy(frames.asnumpy())
#     # frames = einops.rearrange(frames, "b h w c -> b c h w")
#     # return frames


# def decode_frames_nvc(video_path, timestamps, device="cuda"):
#     assert device == "cuda"
#
#     import PyNvCodec as nvc
#     import PytorchNvCodec as pnvc
#
#     gpuID = 0
#     nvDec = nvc.PyNvDecoder("path_to_video_file", gpuID)
#     to_rgb = nvc.PySurfaceConverter(nvDec.Width(), nvDec.Height(), nvc.PixelFormat.NV12, nvc.PixelFormat.RGB, gpuID)
#     to_planar = nvc.PySurfaceConverter(nvDec.Width(), nvDec.Height(), nvc.PixelFormat.RGB, nvc.PixelFormat.RGB_PLANAR, gpuID)
#
#     while True:
#         # Obtain NV12 decoded surface from decoder;
#         rawSurface = nvDec.DecodeSingleSurface()
#         if rawSurface.Empty():
#             break
#         # Convert to RGB interleaved;
#         rgb_byte = to_rgb.Execute(rawSurface)
#         # Convert to RGB planar because that's what to_tensor + normalize are doing;
#         rgb_planar = to_planar.Execute(rgb_byte)
#         # Create a torch tensor from it and reshape, because pnvc.makefromDevicePtrUint8
#         # creates just a chunk of CUDA memory and then copies data from the plane
#         # pointer to the allocated chunk;
#         surfPlane = rgb_planar.PlanePtr()
#         surface_tensor = pnvc.makefromDevicePtrUint8(surfPlane.GpuMem(), surfPlane.Width(), surfPlane.Height(), surfPlane.Pitch(), surfPlane.ElemSize())
#         surface_tensor.resize_(3, target_h, target_w)

View File

@ -11,10 +11,6 @@ import numpy
import PIL
import torch
-from lerobot.common.datasets._video_benchmark._video_utils import (
-    decode_video_frames_ffmpegio,
-    decode_video_frames_torchaudio,
-)
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
from lerobot.common.datasets.video_utils import (
    decode_video_frames_torchvision,
@ -101,11 +97,7 @@ def run_video_benchmark(
    decoder_kwgs = cfg["decoder_kwgs"]
    device = cfg["device"]

-    if decoder == "torchaudio":
-        decode_frames_fn = decode_video_frames_torchaudio
-    elif decoder == "ffmpegio":
-        decode_frames_fn = decode_video_frames_ffmpegio
-    elif decoder == "torchvision":
+    if decoder == "torchvision":
        decode_frames_fn = decode_video_frames_torchvision
    else:
        raise ValueError(decoder)
@ -231,6 +223,7 @@ def load_info(out_dir):
def main():
+    out_dir = Path("tmp/run_video_benchmark")
    dry_run = False
    repo_ids = ["lerobot/pusht", "lerobot/umi_cup_in_the_wild"]
    timestamps_modes = [
@ -240,32 +233,11 @@ def main():
"6_frames",
]
    for timestamps_mode in timestamps_modes:
-        bench_dir = Path(f"tmp/2024_05_01_{timestamps_mode}")
+        bench_dir = out_dir / timestamps_mode

        print(f"### `{timestamps_mode}`")
        print()

-        # print("**`decoder`**")
-        # headers = ["repo_id", "decoder", "load_time_factor", "avg_per_pixel_l2_error"]
-        # rows = []
-        # for repo_id in repo_ids:
-        #     for decoder in ["torchvision", "ffmpegio", "torchaudio"]:
-        #         cfg = {
-        #             "repo_id": repo_id,
-        #             # video encoding
-        #             "pix_fmt": "yuv444p",
-        #             # video decoding
-        #             "device": "cpu",
-        #             "decoder": decoder,
-        #             "decoder_kwgs": {},
-        #         }
-        #         if not dry_run:
-        #             run_video_benchmark(bench_dir / repo_id / decoder, cfg, timestamps_mode)
-        #         info = load_info(bench_dir / repo_id / decoder)
-        #         rows.append([repo_id, decoder, info["load_time_factor"], info["avg_per_pixel_l2_error"]])
-        # display_markdown_table(headers, rows)

        print("**`pix_fmt`**")
        headers = ["repo_id", "pix_fmt", "compression_factor", "load_time_factor", "avg_per_pixel_l2_error"]
        rows = []

View File

@ -49,6 +49,9 @@ class LeRobotDataset(torch.utils.data.Dataset):
    @property
    def video(self) -> bool:
+        """Returns True if this dataset loads video frames from mp4 files.
+        Returns False if it only loads images from png files.
+        """
        return self.info.get("video", False)

    @property
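For illustration, the flag could be read like this (hypothetical usage; assumes the `lerobot/pusht` repo id used in the benchmark above):

```python
from lerobot.common.datasets.lerobot_dataset import LeRobotDataset

dataset = LeRobotDataset("lerobot/pusht")
print(dataset.video)  # True if frames are decoded from mp4, False if loaded from png
```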

View File

@ -55,7 +55,7 @@ def decode_video_frames_torchvision(
    Note: Video benefits from inter-frame compression. Instead of storing every frame individually,
    the encoder stores a reference frame (or a key frame) and subsequent frames as differences relative to
-    that key frame. As a consequence, to access a requested frame, we need to load the preceeding key frame,
+    that key frame. As a consequence, to access a requested frame, we need to load the preceding key frame,
    and all subsequent frames until reaching the requested frame. The number of key frames in a video
    can be adjusted during encoding to take into account decoding time and video size in bytes.
    """
@ -73,7 +73,9 @@ def decode_video_frames_torchvision(
        # torchvision.set_video_backend("video_reader")
        # requires installing torchvision from source, see: https://github.com/pytorch/vision/blob/main/torchvision/csrc/io/decoder/gpu/README.rst
        # check possible bug: https://github.com/pytorch/vision/issues/7745
-        raise NotImplementedError()
+        raise NotImplementedError(
+            "Video decoding on gpu with cuda is currently not supported. Use `device='cpu'`."
+        )
    else:
        raise ValueError(device)