#!/usr/bin/env python

# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Assess the performance of video decoding in various configurations.

This script will benchmark different video encoding and decoding parameters.
See the provided README.md or run `python benchmark/video/run_video_benchmark.py --help` for usage info.
"""
import argparse
import datetime as dt
import random
import shutil
from collections import OrderedDict
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import einops
import numpy as np
import pandas as pd
import PIL
import torch
from skimage.metrics import mean_squared_error, peak_signal_noise_ratio, structural_similarity
from tqdm import tqdm

from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
from lerobot.common.datasets.video_utils import (
    decode_video_frames_torchvision,
    encode_video_frames,
)
from lerobot.common.utils.benchmark import TimeBenchmark
# Default encoding parameters; individual benchmark runs override one key at a time.
BASE_ENCODING = OrderedDict(
    vcodec="libx264",
    pix_fmt="yuv444p",
    g=2,
    crf=None,
    # TODO(aliberts): Add fastdecode
    # fastdecode=0,
)
# TODO(rcadene, aliberts): move to `utils.py` folder when we want to refactor
|
|
def parse_int_or_none(value) -> int | None:
|
|
if value.lower() == "none":
|
|
return None
|
|
try:
|
|
return int(value)
|
|
except ValueError as e:
|
|
raise argparse.ArgumentTypeError(f"Invalid int or None: {value}") from e
|
|
|
|
|
|
def check_datasets_formats(repo_ids: list) -> None:
    """Ensure every repo is an image dataset; video datasets cannot be re-encoded here.

    Raises:
        ValueError: if any of the given repos contains video keys.
    """
    for repo_id in repo_ids:
        ds = LeRobotDataset(repo_id)
        has_videos = len(ds.meta.video_keys) > 0
        if has_videos:
            raise ValueError(
                f"Use only image dataset for running this benchmark. Video dataset provided: {repo_id}"
            )
def get_directory_size(directory: Path) -> int:
    """Return the total size in bytes of all files under `directory`, recursively."""
    return sum(entry.stat().st_size for entry in directory.rglob("*") if entry.is_file())
def load_original_frames(imgs_dir: Path, timestamps: list[float], fps: int) -> torch.Tensor:
    """Load the PNG frames matching `timestamps` from `imgs_dir`.

    Returns a float32 tensor of shape (n, c, h, w) with values scaled to [0, 1].
    """
    loaded = []
    for ts in timestamps:
        frame_idx = int(ts * fps)
        img = PIL.Image.open(imgs_dir / f"frame_{frame_idx:06d}.png")
        tensor_hwc = torch.from_numpy(np.array(img)).type(torch.float32) / 255
        # (h, w, c) -> (c, h, w)
        loaded.append(tensor_hwc.permute(2, 0, 1))
    return torch.stack(loaded)
def save_decoded_frames(
    imgs_dir: Path, save_dir: Path, frames: torch.Tensor, timestamps: list[float], fps: int
) -> None:
    """Dump decoded frames next to their original PNGs in `save_dir` for visual inspection."""
    # Skip work if a previous run already saved every frame.
    already_saved = save_dir.exists() and len(list(save_dir.glob("frame_*.png"))) == len(timestamps)
    if already_saved:
        return

    save_dir.mkdir(parents=True, exist_ok=True)
    for i, ts in enumerate(timestamps):
        frame_idx = int(ts * fps)
        # (c, h, w) float in [0, 1] -> (h, w, c) uint8 for PIL.
        decoded_hwc = (frames[i].permute((1, 2, 0)) * 255).type(torch.uint8).cpu().numpy()
        PIL.Image.fromarray(decoded_hwc).save(save_dir / f"frame_{frame_idx:06d}_decoded.png")
        shutil.copyfile(
            imgs_dir / f"frame_{frame_idx:06d}.png", save_dir / f"frame_{frame_idx:06d}_original.png"
        )
def save_first_episode(imgs_dir: Path, dataset: LeRobotDataset) -> None:
    """Export every frame of the dataset's first episode as PNG files into `imgs_dir`."""
    ep_num_images = dataset.episode_data_index["to"][0].item()
    # Skip work if a previous run already exported the full episode.
    if imgs_dir.exists() and len(list(imgs_dir.glob("frame_*.png"))) == ep_num_images:
        return

    imgs_dir.mkdir(parents=True, exist_ok=True)
    hf_dataset = dataset.hf_dataset.with_format(None)

    # We only save images from the first camera
    img_keys = [key for key in hf_dataset.features if key.startswith("observation.image")]
    first_key = img_keys[0]
    imgs_dataset = hf_dataset.select_columns(first_key)

    progress = tqdm(imgs_dataset, desc=f"saving {dataset.repo_id} first episode images", leave=False)
    for i, item in enumerate(progress):
        item[first_key].save(str(imgs_dir / f"frame_{i:06d}.png"), quality=100)

        # Only the first episode is needed; stop once it is fully written.
        if i >= ep_num_images - 1:
            break
def sample_timestamps(timestamps_mode: str, ep_num_images: int, fps: int) -> list[float]:
    """Draw a random set of frame timestamps (in seconds) according to `timestamps_mode`.

    Raises:
        ValueError: if `timestamps_mode` is not a known mode.
    """
    # Anchor at index >= 5 so modes reaching up to 5 frames back stay in range
    # (2_frames_4_space and 6_frames).
    anchor = random.randint(5, ep_num_images - 1)
    if timestamps_mode == "1_frame":
        frame_indexes = [anchor]
    elif timestamps_mode == "2_frames":
        frame_indexes = [anchor - 1, anchor]
    elif timestamps_mode == "2_frames_4_space":
        frame_indexes = [anchor - 5, anchor]
    elif timestamps_mode == "6_frames":
        frame_indexes = list(range(anchor - 5, anchor + 1))
    else:
        raise ValueError(timestamps_mode)

    return [i / fps for i in frame_indexes]
def decode_video_frames(
    video_path: str,
    timestamps: list[float],
    tolerance_s: float,
    backend: str,
) -> torch.Tensor:
    """Decode the frames at `timestamps` from `video_path` with the requested backend.

    Raises:
        NotImplementedError: for backends other than 'pyav' or 'video_reader'.
    """
    if backend not in ("pyav", "video_reader"):
        raise NotImplementedError(backend)
    return decode_video_frames_torchvision(video_path, timestamps, tolerance_s, backend)
def benchmark_decoding(
    imgs_dir: Path,
    video_path: Path,
    timestamps_mode: str,
    backend: str,
    ep_num_images: int,
    fps: int,
    num_samples: int = 50,
    num_workers: int = 4,
    save_frames: bool = False,
) -> dict:
    """Benchmark decoding `num_samples` random timestamp sets from `video_path`.

    For each sample, decoded frames are timed and compared (MSE / PSNR / SSIM)
    against the original PNG frames in `imgs_dir`. Samples run in parallel threads.

    Returns:
        A dict with per-frame average load times for video and images, their ratio,
        and the average quality metrics across all sampled frames.
    """

    def process_sample(sample: int) -> dict:
        # Measure one sample: decode a timestamp set, time it, and score quality.
        time_benchmark = TimeBenchmark()
        timestamps = sample_timestamps(timestamps_mode, ep_num_images, fps)
        num_frames = len(timestamps)
        result = {
            "psnr_values": [],
            "ssim_values": [],
            "mse_values": [],
        }

        # Time the video-decoding path (reported as per-frame average).
        with time_benchmark:
            frames = decode_video_frames(video_path, timestamps=timestamps, tolerance_s=5e-1, backend=backend)
        result["load_time_video_ms"] = time_benchmark.result_ms / num_frames

        # Baseline: time loading the same frames from the original PNG images.
        with time_benchmark:
            original_frames = load_original_frames(imgs_dir, timestamps, fps)
        result["load_time_images_ms"] = time_benchmark.result_ms / num_frames

        # Quality metrics per frame. Frames are float tensors scaled to [0, 1]
        # (see load_original_frames), hence data_range=1.0; channel_axis=0 because
        # frames are channel-first (c, h, w).
        frames_np, original_frames_np = frames.numpy(), original_frames.numpy()
        for i in range(num_frames):
            result["mse_values"].append(mean_squared_error(original_frames_np[i], frames_np[i]))
            result["psnr_values"].append(
                peak_signal_noise_ratio(original_frames_np[i], frames_np[i], data_range=1.0)
            )
            result["ssim_values"].append(
                structural_similarity(original_frames_np[i], frames_np[i], data_range=1.0, channel_axis=0)
            )

        # Optionally dump the first sample's frames for visual inspection.
        if save_frames and sample == 0:
            save_dir = video_path.with_suffix("") / f"{timestamps_mode}_{backend}"
            save_decoded_frames(imgs_dir, save_dir, frames, timestamps, fps)

        return result

    load_times_video_ms = []
    load_times_images_ms = []
    mse_values = []
    psnr_values = []
    ssim_values = []

    # A sample is a single set of decoded frames specified by timestamps_mode (e.g. a single frame, 2 frames, etc.).
    # For each sample, we record metrics (loading time and quality metrics) which are then averaged over all samples.
    # As these samples are independent, we run them in parallel threads to speed up the benchmark.
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = [executor.submit(process_sample, i) for i in range(num_samples)]
        for future in tqdm(as_completed(futures), total=num_samples, desc="samples", leave=False):
            result = future.result()
            load_times_video_ms.append(result["load_time_video_ms"])
            load_times_images_ms.append(result["load_time_images_ms"])
            psnr_values.extend(result["psnr_values"])
            ssim_values.extend(result["ssim_values"])
            mse_values.extend(result["mse_values"])

    avg_load_time_video_ms = float(np.array(load_times_video_ms).mean())
    avg_load_time_images_ms = float(np.array(load_times_images_ms).mean())
    # Ratio < 1 means decoding from video was faster than reading the PNGs.
    video_images_load_time_ratio = avg_load_time_video_ms / avg_load_time_images_ms

    return {
        "avg_load_time_video_ms": avg_load_time_video_ms,
        "avg_load_time_images_ms": avg_load_time_images_ms,
        "video_images_load_time_ratio": video_images_load_time_ratio,
        "avg_mse": float(np.mean(mse_values)),
        "avg_psnr": float(np.mean(psnr_values)),
        "avg_ssim": float(np.mean(ssim_values)),
    }
def benchmark_encoding_decoding(
    dataset: LeRobotDataset,
    video_path: Path,
    imgs_dir: Path,
    encoding_cfg: dict,
    decoding_cfg: dict,
    num_samples: int,
    num_workers: int,
    save_frames: bool,
    overwrite: bool = False,
    seed: int = 1337,
) -> list[dict]:
    """Encode the first-episode images into a video and benchmark decoding it.

    Encodes `imgs_dir` into `video_path` with `encoding_cfg` (skipped when the file
    already exists unless `overwrite`), then runs `benchmark_decoding` for every
    (timestamps_mode, backend) pair in `decoding_cfg`.

    Returns:
        One result row (dict) per decoding configuration, each augmented with
        dataset metadata (resolution, sizes) and the encoding parameters.
    """
    fps = dataset.fps

    if overwrite or not video_path.is_file():
        tqdm.write(f"encoding {video_path}")
        encode_video_frames(
            imgs_dir=imgs_dir,
            video_path=video_path,
            fps=fps,
            vcodec=encoding_cfg["vcodec"],
            pix_fmt=encoding_cfg["pix_fmt"],
            g=encoding_cfg.get("g"),
            crf=encoding_cfg.get("crf"),
            # fast_decode=encoding_cfg.get("fastdecode"),
            overwrite=True,
        )

    ep_num_images = dataset.episode_data_index["to"][0].item()
    # Frames in this benchmark are channel-first (c, h, w) — see load_original_frames,
    # which rearranges "h w c -> c h w" — so the last two dims are (height, width).
    # Bugfix: they were previously unpacked as (width, height), swapping the reported
    # resolution (num_pixels was unaffected).
    height, width = tuple(dataset[0][dataset.meta.camera_keys[0]].shape[-2:])
    num_pixels = width * height
    video_size_bytes = video_path.stat().st_size
    images_size_bytes = get_directory_size(imgs_dir)
    video_images_size_ratio = video_size_bytes / images_size_bytes

    # Re-seed so sample_timestamps draws comparable samples across configurations.
    random.seed(seed)
    benchmark_table = []
    for timestamps_mode in tqdm(
        decoding_cfg["timestamps_modes"], desc="decodings (timestamps_modes)", leave=False
    ):
        for backend in tqdm(decoding_cfg["backends"], desc="decodings (backends)", leave=False):
            benchmark_row = benchmark_decoding(
                imgs_dir,
                video_path,
                timestamps_mode,
                backend,
                ep_num_images,
                fps,
                num_samples,
                num_workers,
                save_frames,
            )
            # Attach shared metadata and the encoding parameters to each row.
            benchmark_row.update(
                **{
                    "repo_id": dataset.repo_id,
                    "resolution": f"{width} x {height}",
                    "num_pixels": num_pixels,
                    "video_size_bytes": video_size_bytes,
                    "images_size_bytes": images_size_bytes,
                    "video_images_size_ratio": video_images_size_ratio,
                    "timestamps_mode": timestamps_mode,
                    "backend": backend,
                },
                **encoding_cfg,
            )
            benchmark_table.append(benchmark_row)

    return benchmark_table
def main(
    output_dir: Path,
    repo_ids: list[str],
    vcodec: list[str],
    pix_fmt: list[str],
    g: list[int],
    crf: list[int],
    # fastdecode: list[int],
    timestamps_modes: list[str],
    backends: list[str],
    num_samples: int,
    num_workers: int,
    save_frames: bool,
):
    """Run the full encoding x decoding benchmark grid and write CSV results.

    For each (vcodec, pix_fmt) pair, each dataset's first episode is encoded while
    varying one encoding parameter (g or crf) at a time around BASE_ENCODING, and
    decoding is benchmarked for every (timestamps_mode, backend) combination.
    One intermediate CSV is written per (vcodec, pix_fmt) pair, then all CSVs are
    concatenated into a single summary file in `output_dir`.
    """
    check_datasets_formats(repo_ids)
    # One encoding parameter is swept at a time; the others stay at BASE_ENCODING.
    encoding_benchmarks = {
        "g": g,
        "crf": crf,
        # "fastdecode": fastdecode,
    }
    decoding_benchmarks = {
        "timestamps_modes": timestamps_modes,
        "backends": backends,
    }
    # Column order for the output CSVs.
    headers = ["repo_id", "resolution", "num_pixels"]
    headers += list(BASE_ENCODING.keys())
    headers += [
        "timestamps_mode",
        "backend",
        "video_size_bytes",
        "images_size_bytes",
        "video_images_size_ratio",
        "avg_load_time_video_ms",
        "avg_load_time_images_ms",
        "video_images_load_time_ratio",
        "avg_mse",
        "avg_psnr",
        "avg_ssim",
    ]
    file_paths = []
    for video_codec in tqdm(vcodec, desc="encodings (vcodec)"):
        for pixel_format in tqdm(pix_fmt, desc="encodings (pix_fmt)", leave=False):
            benchmark_table = []
            for repo_id in tqdm(repo_ids, desc="encodings (datasets)", leave=False):
                dataset = LeRobotDataset(repo_id)
                imgs_dir = output_dir / "images" / dataset.repo_id.replace("/", "_")
                # We only use the first episode
                save_first_episode(imgs_dir, dataset)
                for key, values in tqdm(encoding_benchmarks.items(), desc="encodings (g, crf)", leave=False):
                    for value in tqdm(values, desc=f"encodings ({key})", leave=False):
                        # Start from the base config and override exactly one parameter.
                        encoding_cfg = BASE_ENCODING.copy()
                        encoding_cfg["vcodec"] = video_codec
                        encoding_cfg["pix_fmt"] = pixel_format
                        encoding_cfg[key] = value
                        # Encode each parameter combination into its own directory.
                        args_path = Path("_".join(str(value) for value in encoding_cfg.values()))
                        video_path = output_dir / "videos" / args_path / f"{repo_id.replace('/', '_')}.mp4"
                        benchmark_table += benchmark_encoding_decoding(
                            dataset,
                            video_path,
                            imgs_dir,
                            encoding_cfg,
                            decoding_benchmarks,
                            num_samples,
                            num_workers,
                            save_frames,
                        )

            # Save intermediate results
            benchmark_df = pd.DataFrame(benchmark_table, columns=headers)
            now = dt.datetime.now()
            csv_path = (
                output_dir
                / f"{now:%Y-%m-%d}_{now:%H-%M-%S}_{video_codec}_{pixel_format}_{num_samples}-samples.csv"
            )
            benchmark_df.to_csv(csv_path, header=True, index=False)
            file_paths.append(csv_path)
            del benchmark_df

    # Concatenate all results
    # NOTE(review): `now` below is the timestamp of the last intermediate CSV; this
    # (and pd.concat) assumes `vcodec` and `pix_fmt` are non-empty — confirm callers
    # never pass empty lists.
    df_list = [pd.read_csv(csv_path) for csv_path in file_paths]
    concatenated_df = pd.concat(df_list, ignore_index=True)
    concatenated_path = output_dir / f"{now:%Y-%m-%d}_{now:%H-%M-%S}_all_{num_samples}-samples.csv"
    concatenated_df.to_csv(concatenated_path, header=True, index=False)
if __name__ == "__main__":
    # CLI entry point: every list-valued flag defines one axis of the benchmark grid.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("outputs/video_benchmark"),
        help="Directory where the video benchmark outputs are written.",
    )
    parser.add_argument(
        "--repo-ids",
        type=str,
        nargs="*",
        default=[
            "lerobot/pusht_image",
            "aliberts/aloha_mobile_shrimp_image",
            "aliberts/paris_street",
            "aliberts/kitchen",
        ],
        help="Datasets repo-ids to test against. First episodes only are used. Must be images.",
    )
    parser.add_argument(
        "--vcodec",
        type=str,
        nargs="*",
        default=["libx264", "libx265", "libsvtav1"],
        help="Video codecs to be tested",
    )
    parser.add_argument(
        "--pix-fmt",
        type=str,
        nargs="*",
        default=["yuv444p", "yuv420p"],
        help="Pixel formats (chroma subsampling) to be tested",
    )
    # "none" (any case) maps to None via parse_int_or_none for the two flags below.
    parser.add_argument(
        "--g",
        type=parse_int_or_none,
        nargs="*",
        default=[1, 2, 3, 4, 5, 6, 10, 15, 20, 40, 100, None],
        help="Group of pictures sizes to be tested.",
    )
    parser.add_argument(
        "--crf",
        type=parse_int_or_none,
        nargs="*",
        default=[0, 5, 10, 15, 20, 25, 30, 40, 50, None],
        help="Constant rate factors to be tested.",
    )
    # parser.add_argument(
    #     "--fastdecode",
    #     type=int,
    #     nargs="*",
    #     default=[0, 1],
    #     help="Use the fastdecode tuning option. 0 disables it. "
    #     "For libx264 and libx265, only 1 is possible. "
    #     "For libsvtav1, 1, 2 or 3 are possible values with a higher number meaning a faster decoding optimization",
    # )
    parser.add_argument(
        "--timestamps-modes",
        type=str,
        nargs="*",
        default=[
            "1_frame",
            "2_frames",
            "2_frames_4_space",
            "6_frames",
        ],
        help="Timestamps scenarios to be tested.",
    )
    parser.add_argument(
        "--backends",
        type=str,
        nargs="*",
        default=["pyav", "video_reader"],
        help="Torchvision decoding backend to be tested.",
    )
    parser.add_argument(
        "--num-samples",
        type=int,
        default=50,
        help="Number of samples for each encoding x decoding config.",
    )
    parser.add_argument(
        "--num-workers",
        type=int,
        default=10,
        help="Number of processes for parallelized sample processing.",
    )
    parser.add_argument(
        "--save-frames",
        type=int,
        default=0,
        help="Whether to save decoded frames or not. Enter a non-zero number for true.",
    )
    args = parser.parse_args()
    # Flag names (with dashes converted to underscores) match main()'s parameters.
    main(**vars(args))