Adding dora-record script

2024-05-18 12:41:10 +02:00 · 2024-05-18 12:41:10 +02:00 · d39a254d4d
parent 096149b118
commit d39a254d4d
1 changed files with 82 additions and 0 deletions
--- a/lerobot/common/datasets/push_dataset_to_hub/dora_record_format.py
+++ b/lerobot/common/datasets/push_dataset_to_hub/dora_record_format.py
@ -0,0 +1,82 @@
+#!/usr/bin/env python
+"""
+Utilities for processing the raw data format produced by dora-record.
+"""
+
+from pathlib import Path
+
+from datasets import Dataset
+import pandas as pd
+
+
+def check_format(raw_dir) -> bool:
+    """Check that ``raw_dir`` holds exactly one ``*_leader.parquet`` file.
+
+    Args:
+        raw_dir: Directory to inspect, as a ``Path`` or a plain string path.
+
+    Returns:
+        ``True`` when exactly one leader file is found.
+
+    Raises:
+        ValueError: If zero or more than one leader file is found.
+    """
+    # Coerce so that a string path works as well as a Path object.
+    leader_files = list(Path(raw_dir).glob("*_leader.parquet"))
+
+    if len(leader_files) != 1:
+        raise ValueError(
+            f"Issues with leader file in {raw_dir}. Make sure there is one and only one leader file"
+        )
+    return True
+
+
+def load_from_raw(raw_dir: Path, out_dir=None, fps=30, video=True, debug=False):
+    """Load a dora-record directory into one table plus episode boundaries.
+
+    The ``*_leader.parquet`` file provides the reference rows. Every other
+    ``*.parquet`` file in ``raw_dir`` is joined onto it with ``pd.merge_asof``
+    on ``timestamp_utc`` using ``direction="backward"``. Each file is expected
+    to contain a ``timestamp_utc`` column and a column named after the file
+    stem; one of the files must provide an ``episode_index`` column.
+
+    Args:
+        raw_dir: Directory holding the parquet files (``Path`` or string).
+        out_dir: Accepted for interface compatibility; not used here.
+        fps: Accepted for interface compatibility; not used here.
+        video: Accepted for interface compatibility; not used here.
+        debug: Accepted for interface compatibility; not used here.
+
+    Returns:
+        A ``(data_df, episode_data_index)`` tuple. ``episode_data_index`` is a
+        dict of lists with keys ``"episode_index"``, ``"from"`` and ``"to"``:
+        ``"from"`` is the first row of each episode and ``"to"`` is the first
+        row of the next one (``len(data_df)`` for the last episode).
+    """
+    raw_dir = Path(raw_dir)
+
+    # Sort the globs so the merged column order does not depend on the
+    # directory listing order of the filesystem.
+    leader_file = sorted(raw_dir.glob("*_leader.parquet"))[0]
+    other_files = sorted(
+        path for path in raw_dir.glob("*.parquet") if path != leader_file
+    )
+
+    ## Load leader data
+    data_df = pd.read_parquet(leader_file)[["timestamp_utc", leader_file.stem]]
+
+    ## Merge all data using nearest backward strategy
+    for path in other_files:
+        df = pd.read_parquet(path)
+        data_df = pd.merge_asof(
+            data_df,
+            df[["timestamp_utc", path.stem]],
+            on="timestamp_utc",
+            direction="backward",
+        )
+
+    # Each episode_index cell is a sequence; keep only its first element.
+    # NOTE(review): a leader row older than the first episode_index sample
+    # gets NaN from the backward merge and x[0] would raise on it -- confirm
+    # that recordings cannot start that way.
+    data_df["episode_index"] = data_df["episode_index"].map(lambda x: x[0])
+
+    # Find the first row of every distinct episode index
+    episode_data_index = data_df["episode_index"].drop_duplicates().reset_index()
+    episode_data_index["from"] = episode_data_index["index"]
+    # An episode ends where the next one starts. The last episode has no
+    # successor, so it ends at the total row count instead of NaN, which
+    # also keeps the column integer-typed.
+    episode_data_index["to"] = episode_data_index["index"].shift(
+        -1, fill_value=len(data_df)
+    )
+
+    # Drop the helper column and convert to a dict of lists
+    episode_data_index = episode_data_index.drop(columns=["index"]).to_dict(
+        orient="list"
+    )
+
+    return data_df, episode_data_index
+
+
+def to_hf_dataset(df, video) -> Dataset:
+    """Wrap a pandas DataFrame in a Hugging Face ``Dataset``.
+
+    Args:
+        df: Table to convert; passed straight to ``Dataset.from_pandas``.
+        video: Accepted for interface compatibility; not used here.
+
+    Returns:
+        The ``Dataset`` built from ``df``.
+    """
+    return Dataset.from_pandas(df)
+
+
+def from_raw_to_lerobot_format(
+    raw_dir: Path, out_dir: Path, fps=None, video=True, debug=False
+):
+    """Convert a dora-record directory into a dataset, episode index and info.
+
+    Args:
+        raw_dir: Directory holding the recorded parquet files.
+        out_dir: Forwarded unchanged to ``load_from_raw``.
+        fps: Frame rate stored in the returned info; ``None`` means 30.
+        video: Stored in the returned info and forwarded to the helpers.
+        debug: Forwarded unchanged to ``load_from_raw``.
+
+    Returns:
+        A ``(hf_dataset, episode_data_index, info)`` tuple, where ``info`` is
+        a dict with the keys ``"fps"`` and ``"video"``.
+
+    Raises:
+        ValueError: Propagated from ``check_format`` when ``raw_dir`` does not
+            hold exactly one leader file.
+    """
+    # Fail early on a malformed recording directory.
+    check_format(raw_dir)
+
+    fps = 30 if fps is None else fps
+
+    frames, episode_data_index = load_from_raw(raw_dir, out_dir, fps, video, debug)
+    hf_dataset = to_hf_dataset(frames, video)
+
+    info = {"fps": fps, "video": video}
+    return hf_dataset, episode_data_index, info