fix get_parquet_file_size_in_mb + DEFAULT_FILE_SIZE_IN_MB=100

This commit is contained in:
Remi Cadene 2025-04-21 12:59:35 +00:00
parent 4375a05a9f
commit b9b880bd8b
2 changed files with 9 additions and 4 deletions

View File

@@ -50,7 +50,7 @@ from lerobot.common.utils.utils import is_valid_numpy_dtype_string
from lerobot.configs.types import FeatureType, PolicyFeature from lerobot.configs.types import FeatureType, PolicyFeature
DEFAULT_CHUNK_SIZE = 1000 # Max number of files per chunk DEFAULT_CHUNK_SIZE = 1000 # Max number of files per chunk
DEFAULT_FILE_SIZE_IN_MB = 500.0 # Max size per file DEFAULT_FILE_SIZE_IN_MB = 100.0 # Max size per file
INFO_PATH = "meta/info.json" INFO_PATH = "meta/info.json"
STATS_PATH = "meta/stats.json" STATS_PATH = "meta/stats.json"
@@ -87,8 +87,13 @@ DEFAULT_FEATURES = {
def get_parquet_file_size_in_mb(parquet_path): def get_parquet_file_size_in_mb(parquet_path):
metadata = pq.read_metadata(parquet_path) metadata = pq.read_metadata(parquet_path)
uncompressed_size = metadata.num_rows * metadata.row_group(0).total_byte_size total_uncompressed_size = 0
return uncompressed_size / (1024**2) for row_group in range(metadata.num_row_groups):
rg_metadata = metadata.row_group(row_group)
for column in range(rg_metadata.num_columns):
col_metadata = rg_metadata.column(column)
total_uncompressed_size += col_metadata.total_uncompressed_size
return total_uncompressed_size / (1024**2)
def get_hf_dataset_size_in_mb(hf_ds: Dataset) -> int: def get_hf_dataset_size_in_mb(hf_ds: Dataset) -> int:

View File

@@ -264,7 +264,7 @@ def encode_video_frames(
[ [
("-f", "image2"), ("-f", "image2"),
("-r", str(fps)), ("-r", str(fps)),
("-i", str(imgs_dir / "frame_%06d.png")), ("-i", str(imgs_dir / "frame-%06d.png")),
("-vcodec", vcodec), ("-vcodec", vcodec),
("-pix_fmt", pix_fmt), ("-pix_fmt", pix_fmt),
] ]