From 8f821ecad0d5adca09ff1f75e933cf923feb9f25 Mon Sep 17 00:00:00 2001 From: Simon Alibert <75076266+aliberts@users.noreply.github.com> Date: Wed, 8 Jan 2025 13:35:11 +0100 Subject: [PATCH 001/109] Fix Quality workflow (#622) --- .github/workflows/quality.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/quality.yml b/.github/workflows/quality.yml index 851869a0..c245345f 100644 --- a/.github/workflows/quality.yml +++ b/.github/workflows/quality.yml @@ -50,7 +50,7 @@ jobs: uses: actions/checkout@v3 - name: Install poetry - run: pipx install poetry + run: pipx install "poetry<2.0.0" - name: Poetry check run: poetry check @@ -64,7 +64,7 @@ jobs: uses: actions/checkout@v3 - name: Install poetry - run: pipx install poetry + run: pipx install "poetry<2.0.0" - name: Install poetry-relax run: poetry self add poetry-relax From bc16e1b497f37c5926f43c7b5ff49220a3021480 Mon Sep 17 00:00:00 2001 From: CharlesCNorton <135471798+CharlesCNorton@users.noreply.github.com> Date: Thu, 9 Jan 2025 03:35:27 -0500 Subject: [PATCH 002/109] fix(docs): typos in benchmark readme.md (#614) Co-authored-by: Simon Alibert <75076266+aliberts@users.noreply.github.com> --- benchmarks/video/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarks/video/README.md b/benchmarks/video/README.md index 890c1142..56cd1d1e 100644 --- a/benchmarks/video/README.md +++ b/benchmarks/video/README.md @@ -21,7 +21,7 @@ How to decode videos? ## Variables **Image content & size** -We don't expect the same optimal settings for a dataset of images from a simulation, or from real-world in an appartment, or in a factory, or outdoor, or with lots of moving objects in the scene, etc. Similarly, loading times might not vary linearly with the image size (resolution). +We don't expect the same optimal settings for a dataset of images from a simulation, or from real-world in an apartment, or in a factory, or outdoor, or with lots of moving objects in the scene, etc. Similarly, loading times might not vary linearly with the image size (resolution). For these reasons, we run this benchmark on four representative datasets: - `lerobot/pusht_image`: (96 x 96 pixels) simulation with simple geometric shapes, fixed camera. - `aliberts/aloha_mobile_shrimp_image`: (480 x 640 pixels) real-world indoor, moving camera. @@ -63,7 +63,7 @@ This of course is affected by the `-g` parameter during encoding, which specifie Note that this differs significantly from a typical use case like watching a movie, in which every frame is loaded sequentially from the beginning to the end and it's acceptable to have big values for `-g`. -Additionally, because some policies might request single timestamps that are a few frames appart, we also have the following scenario: +Additionally, because some policies might request single timestamps that are a few frames apart, we also have the following scenario: - `2_frames_4_space`: 2 frames with 4 consecutive frames of spacing in between (e.g `[t, t + 5 / fps]`), However, due to how video decoding is implemented with `pyav`, we don't have access to an accurate seek so in practice this scenario is essentially the same as `6_frames` since all 6 frames between `t` and `t + 5 / fps` will be decoded. @@ -85,8 +85,8 @@ However, due to how video decoding is implemented with `pyav`, we don't have acc **Average Structural Similarity Index Measure (higher is better)** `avg_ssim` evaluates the perceived quality of images by comparing luminance, contrast, and structure. SSIM values range from -1 to 1, where 1 indicates perfect similarity. -One aspect that can't be measured here with those metrics is the compatibility of the encoding accross platforms, in particular on web browser, for visualization purposes. -h264, h265 and AV1 are all commonly used codecs and should not be pose an issue. However, the chroma subsampling (`pix_fmt`) format might affect compatibility: +One aspect that can't be measured here with those metrics is the compatibility of the encoding across platforms, in particular on web browser, for visualization purposes. +h264, h265 and AV1 are all commonly used codecs and should not pose an issue. However, the chroma subsampling (`pix_fmt`) format might affect compatibility: - `yuv420p` is more widely supported across various platforms, including web browsers. - `yuv444p` offers higher color fidelity but might not be supported as broadly. @@ -116,7 +116,7 @@ Additional encoding parameters exist that are not included in this benchmark. In - `-preset` which allows for selecting encoding presets. This represents a collection of options that will provide a certain encoding speed to compression ratio. By leaving this parameter unspecified, it is considered to be `medium` for libx264 and libx265 and `8` for libsvtav1. - `-tune` which allows to optimize the encoding for certains aspects (e.g. film quality, fast decoding, etc.). -See the documentation mentioned above for more detailled info on these settings and for a more comprehensive list of other parameters. +See the documentation mentioned above for more detailed info on these settings and for a more comprehensive list of other parameters. Similarly on the decoding side, other decoders exist but are not implemented in our current benchmark. To name a few: - `torchaudio` From 5097cd900e8a471f1224ed1e2302093f40d5ed79 Mon Sep 17 00:00:00 2001 From: Ville Kuosmanen Date: Thu, 9 Jan 2025 08:39:48 +0000 Subject: [PATCH 003/109] fix(visualise): use correct language description for each episode id (#604) Co-authored-by: Simon Alibert <75076266+aliberts@users.noreply.github.com> --- lerobot/scripts/visualize_dataset_html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lerobot/scripts/visualize_dataset_html.py b/lerobot/scripts/visualize_dataset_html.py index ec6eca22..39b4c27d 100644 --- a/lerobot/scripts/visualize_dataset_html.py +++ b/lerobot/scripts/visualize_dataset_html.py @@ -177,7 +177,7 @@ def run_server( {"url": url_for("static", filename=video_path), "filename": video_path.parent.name} for video_path in video_paths ] - tasks = dataset.meta.episodes[0]["tasks"] + tasks = dataset.meta.episodes[episode_id]["tasks"] else: video_keys = [key for key, ft in dataset.features.items() if ft["dtype"] == "video"] videos_info = [ From b8b368310cf47433ef583957aeda525755c0e9ab Mon Sep 17 00:00:00 2001 From: CharlesCNorton <135471798+CharlesCNorton@users.noreply.github.com> Date: Thu, 9 Jan 2025 03:57:45 -0500 Subject: [PATCH 004/109] typo fix: batch_convert_dataset_v1_to_v2.py (#615) Co-authored-by: Simon Alibert <75076266+aliberts@users.noreply.github.com> --- lerobot/common/datasets/v2/batch_convert_dataset_v1_to_v2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lerobot/common/datasets/v2/batch_convert_dataset_v1_to_v2.py b/lerobot/common/datasets/v2/batch_convert_dataset_v1_to_v2.py index c8da2fe1..eeeb8fe7 100644 --- a/lerobot/common/datasets/v2/batch_convert_dataset_v1_to_v2.py +++ b/lerobot/common/datasets/v2/batch_convert_dataset_v1_to_v2.py @@ -159,11 +159,11 @@ DATASETS = { **ALOHA_STATIC_INFO, }, "aloha_static_vinh_cup": { - "single_task": "Pick up the platic cup with the right arm, then pop its lid open with the left arm.", + "single_task": "Pick up the plastic cup with the right arm, then pop its lid open with the left arm.", **ALOHA_STATIC_INFO, }, "aloha_static_vinh_cup_left": { - "single_task": "Pick up the platic cup with the left arm, then pop its lid open with the right arm.", + "single_task": "Pick up the plastic cup with the left arm, then pop its lid open with the right arm.", **ALOHA_STATIC_INFO, }, "aloha_static_ziploc_slide": {"single_task": "Slide open the ziploc bag.", **ALOHA_STATIC_INFO}, From 25a8597680fb46e53fbc7c962cd2e84c8beacc3f Mon Sep 17 00:00:00 2001 From: Mishig Date: Thu, 9 Jan 2025 11:39:54 +0100 Subject: [PATCH 005/109] [viz] Fixes & updates to html visualizer (#617) --- lerobot/scripts/visualize_dataset_html.py | 71 +++++++------------ .../templates/visualize_dataset_template.html | 31 +++++++- 2 files changed, 56 insertions(+), 46 deletions(-) diff --git a/lerobot/scripts/visualize_dataset_html.py b/lerobot/scripts/visualize_dataset_html.py index 39b4c27d..cc3f3930 100644 --- a/lerobot/scripts/visualize_dataset_html.py +++ b/lerobot/scripts/visualize_dataset_html.py @@ -232,69 +232,54 @@ def get_episode_data(dataset: LeRobotDataset | IterableNamespace, episode_index) """Get a csv str containing timeseries data of an episode (e.g. state and action). This file will be loaded by Dygraph javascript to plot data in real time.""" columns = [] - has_state = "observation.state" in dataset.features - has_action = "action" in dataset.features + + selected_columns = [col for col, ft in dataset.features.items() if ft["dtype"] == "float32"] + selected_columns.remove("timestamp") # init header of csv with state and action names header = ["timestamp"] - if has_state: + + for column_name in selected_columns: dim_state = ( - dataset.meta.shapes["observation.state"][0] + dataset.meta.shapes[column_name][0] if isinstance(dataset, LeRobotDataset) - else dataset.features["observation.state"].shape[0] + else dataset.features[column_name].shape[0] ) - header += [f"state_{i}" for i in range(dim_state)] - column_names = dataset.features["observation.state"]["names"] - while not isinstance(column_names, list): - column_names = list(column_names.values())[0] - columns.append({"key": "state", "value": column_names}) - if has_action: - dim_action = ( - dataset.meta.shapes["action"][0] - if isinstance(dataset, LeRobotDataset) - else dataset.features.action.shape[0] - ) - header += [f"action_{i}" for i in range(dim_action)] - column_names = dataset.features["action"]["names"] - while not isinstance(column_names, list): - column_names = list(column_names.values())[0] - columns.append({"key": "action", "value": column_names}) + header += [f"{column_name}_{i}" for i in range(dim_state)] + + if "names" in dataset.features[column_name] and dataset.features[column_name]["names"]: + column_names = dataset.features[column_name]["names"] + while not isinstance(column_names, list): + column_names = list(column_names.values())[0] + else: + column_names = [f"motor_{i}" for i in range(dim_state)] + columns.append({"key": column_name, "value": column_names}) + + selected_columns.insert(0, "timestamp") if isinstance(dataset, LeRobotDataset): from_idx = dataset.episode_data_index["from"][episode_index] to_idx = dataset.episode_data_index["to"][episode_index] - selected_columns = ["timestamp"] - if has_state: - selected_columns += ["observation.state"] - if has_action: - selected_columns += ["action"] data = ( dataset.hf_dataset.select(range(from_idx, to_idx)) .select_columns(selected_columns) - .with_format("numpy") + .with_format("pandas") ) - rows = np.hstack( - (np.expand_dims(data["timestamp"], axis=1), *[data[col] for col in selected_columns[1:]]) - ).tolist() else: repo_id = dataset.repo_id - selected_columns = ["timestamp"] - if "observation.state" in dataset.features: - selected_columns.append("observation.state") - if "action" in dataset.features: - selected_columns.append("action") url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/" + dataset.data_path.format( episode_chunk=int(episode_index) // dataset.chunks_size, episode_index=episode_index ) df = pd.read_parquet(url) data = df[selected_columns] # Select specific columns - rows = np.hstack( - ( - np.expand_dims(data["timestamp"], axis=1), - *[np.vstack(data[col]) for col in selected_columns[1:]], - ) - ).tolist() + + rows = np.hstack( + ( + np.expand_dims(data["timestamp"], axis=1), + *[np.vstack(data[col]) for col in selected_columns[1:]], + ) + ).tolist() # Convert data to CSV string csv_buffer = StringIO() @@ -379,10 +364,6 @@ def visualize_dataset_html( template_folder=template_dir, ) else: - image_keys = dataset.meta.image_keys if isinstance(dataset, LeRobotDataset) else [] - if len(image_keys) > 0: - raise NotImplementedError(f"Image keys ({image_keys=}) are currently not supported.") - # Create a simlink from the dataset video folder containg mp4 files to the output directory # so that the http server can get access to the mp4 files. if isinstance(dataset, LeRobotDataset): diff --git a/lerobot/templates/visualize_dataset_template.html b/lerobot/templates/visualize_dataset_template.html index 12d6e991..3c93d2d6 100644 --- a/lerobot/templates/visualize_dataset_template.html +++ b/lerobot/templates/visualize_dataset_template.html @@ -98,9 +98,34 @@ +
+
+ filter videos +
🔽
+
+ +
+
+ +
+
+
+
{% for video_info in videos_info %} -
+

{{ video_info.filename }}

- -