From b9b880bd8bf6a940fa887414dfc707b2342e7ac6 Mon Sep 17 00:00:00 2001 From: Remi Cadene Date: Mon, 21 Apr 2025 12:59:35 +0000 Subject: [PATCH] fix get_parquet_file_size_in_mb + DEFAULT_FILE_SIZE_IN_MB=100 --- lerobot/common/datasets/utils.py | 11 ++++++++--- lerobot/common/datasets/video_utils.py | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/lerobot/common/datasets/utils.py b/lerobot/common/datasets/utils.py index 48c3dacc..f404113e 100644 --- a/lerobot/common/datasets/utils.py +++ b/lerobot/common/datasets/utils.py @@ -50,7 +50,7 @@ from lerobot.common.utils.utils import is_valid_numpy_dtype_string from lerobot.configs.types import FeatureType, PolicyFeature DEFAULT_CHUNK_SIZE = 1000 # Max number of files per chunk -DEFAULT_FILE_SIZE_IN_MB = 500.0 # Max size per file +DEFAULT_FILE_SIZE_IN_MB = 100.0 # Max size per file INFO_PATH = "meta/info.json" STATS_PATH = "meta/stats.json" @@ -87,8 +87,13 @@ DEFAULT_FEATURES = { def get_parquet_file_size_in_mb(parquet_path): metadata = pq.read_metadata(parquet_path) - uncompressed_size = metadata.num_rows * metadata.row_group(0).total_byte_size - return uncompressed_size / (1024**2) + total_uncompressed_size = 0 + for row_group in range(metadata.num_row_groups): + rg_metadata = metadata.row_group(row_group) + for column in range(rg_metadata.num_columns): + col_metadata = rg_metadata.column(column) + total_uncompressed_size += col_metadata.total_uncompressed_size + return total_uncompressed_size / (1024**2) def get_hf_dataset_size_in_mb(hf_ds: Dataset) -> int: diff --git a/lerobot/common/datasets/video_utils.py b/lerobot/common/datasets/video_utils.py index 82beb876..a0dd2544 100644 --- a/lerobot/common/datasets/video_utils.py +++ b/lerobot/common/datasets/video_utils.py @@ -264,7 +264,7 @@ def encode_video_frames( [ ("-f", "image2"), ("-r", str(fps)), - ("-i", str(imgs_dir / "frame_%06d.png")), + ("-i", str(imgs_dir / "frame-%06d.png")), ("-vcodec", vcodec), ("-pix_fmt", pix_fmt), ]