From e1732b4954fe48b09ec748ea9a6b031abf154807 Mon Sep 17 00:00:00 2001
From: Jade Choghari <chogharijade@gmail.com>
Date: Tue, 4 Mar 2025 13:27:34 +0300
Subject: [PATCH] Update lerobot/common/datasets/video_utils.py

Co-authored-by: Remi <re.cadene@gmail.com>
---
 lerobot/common/datasets/video_utils.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/lerobot/common/datasets/video_utils.py b/lerobot/common/datasets/video_utils.py
index cb81a25f..39c93c4e 100644
--- a/lerobot/common/datasets/video_utils.py
+++ b/lerobot/common/datasets/video_utils.py
@@ -161,7 +161,16 @@ def decode_video_frames_torchcodec(
     device: str = "cpu",
     log_loaded_timestamps: bool = False,
 ) -> torch.Tensor:
-    """Loads frames associated with the requested timestamps of a video using torchcodec."""
+    """Loads frames associated with the requested timestamps of a video using torchcodec.
+
+    Note: Setting device="cuda" outside the main process, e.g. in data loader workers, will lead to CUDA initialization errors.
+
+    Note: Videos benefit from inter-frame compression. Instead of storing every frame individually,
+    the encoder stores a reference frame (a key frame) and subsequent frames as differences relative to
+    that key frame. As a consequence, to access a requested frame, we need to load the preceding key frame
+    and all subsequent frames up to the requested frame. The number of key frames in a video
+    can be adjusted during encoding to trade off decoding time against video size in bytes.
+    """
     video_path = str(video_path)
     # initialize video decoder
     decoder = VideoDecoder(video_path, device=device)