Adding dora-record script

This commit is contained in:
haixuantao 2024-05-18 12:41:10 +02:00
parent 096149b118
commit d39a254d4d
1 changed files with 82 additions and 0 deletions

View File

@@ -0,0 +1,82 @@
#!/usr/bin/env python
"""
Contains utilities to process raw data format from dora-record
"""
from pathlib import Path
from datasets import Dataset
import pandas as pd
def check_format(raw_dir) -> bool:
    """Validate that ``raw_dir`` holds exactly one ``*_leader.parquet`` file.

    Args:
        raw_dir: Directory object exposing ``glob`` (e.g. a ``pathlib.Path``).

    Returns:
        ``True`` when exactly one leader file matches.

    Raises:
        ValueError: If zero or several leader files match.
    """
    leader_count = sum(1 for _ in raw_dir.glob("*_leader.parquet"))
    if leader_count == 1:
        return True
    raise ValueError(
        f"Issues with leader file in {raw_dir}. Make sure there is one and only one leader file"
    )
def load_from_raw(raw_dir: Path, out_dir=None, fps=30, video=True, debug=False):
    """Merge the dora-record parquet files of ``raw_dir`` into one table.

    The ``*_leader.parquet`` file defines the rows. Every other ``*.parquet``
    file is joined onto it with ``pd.merge_asof(direction="backward")``, so each
    leader row receives the most recent sample whose ``timestamp_utc`` is not
    later than its own. Every file is read as a ``timestamp_utc`` column plus a
    value column named after the file stem.

    Args:
        raw_dir: Directory containing the parquet files.
        out_dir: Not used by this function; kept for interface compatibility.
        fps: Not used by this function.
        video: Not used by this function.
        debug: Not used by this function.

    Returns:
        A ``(data_df, episode_data_index)`` tuple. ``data_df`` is the merged
        DataFrame. ``episode_data_index`` is a dict of lists with keys
        ``"episode_index"``, ``"from"`` and ``"to"``: for each distinct episode,
        the row where it first appears and the row where the next one starts
        (the total row count for the last episode).

    Raises:
        IndexError: If ``raw_dir`` contains no ``*_leader.parquet`` file.
        KeyError: If no merged file provides an ``episode_index`` column.
    """
    leader_file = sorted(raw_dir.glob("*_leader.parquet"))[0]
    # Sorted so the merged column order does not depend on directory listing order.
    other_files = sorted(p for p in raw_dir.glob("*.parquet") if p != leader_file)

    data_df = pd.read_parquet(leader_file)[["timestamp_utc", leader_file.stem]]
    # merge_asof rejects unsorted keys; a stable sort leaves already-sorted data
    # untouched, including the relative order of equal timestamps.
    data_df = data_df.sort_values("timestamp_utc", kind="stable", ignore_index=True)

    for parquet_file in other_files:
        samples = pd.read_parquet(parquet_file)[["timestamp_utc", parquet_file.stem]]
        data_df = pd.merge_asof(
            data_df,
            samples.sort_values("timestamp_utc", kind="stable"),
            on="timestamp_utc",
            direction="backward",
        )

    if "episode_index" not in data_df.columns:
        raise KeyError(
            f"'episode_index' column missing: expected an episode_index.parquet file in {raw_dir}"
        )

    # Samples are indexable containers; keep only their first element.
    # NOTE(review): leader rows older than the first episode sample get no match
    # from the backward merge and would fail here - presumably recordings always
    # start with an episode marker; confirm against dora-record output.
    data_df["episode_index"] = data_df["episode_index"].map(lambda x: x[0])

    # First row of every distinct episode value.
    episode_data_index = data_df["episode_index"].drop_duplicates().reset_index()
    episode_data_index["from"] = episode_data_index["index"]
    # An episode ends where the next one begins; the last one ends at the final
    # row. fill_value avoids the NaN (and float upcast) a plain shift leaves.
    episode_data_index["to"] = episode_data_index["index"].shift(
        -1, fill_value=len(data_df)
    )
    episode_data_index = episode_data_index.drop(columns=["index"])

    return data_df, episode_data_index.to_dict(orient="list")
def to_hf_dataset(df, video) -> Dataset:
    """Convert a pandas DataFrame into a ``datasets.Dataset``.

    Args:
        df: DataFrame handed to ``Dataset.from_pandas``.
        video: Not used by this function.

    Returns:
        The ``Dataset`` built from ``df``.
    """
    return Dataset.from_pandas(df)
def from_raw_to_lerobot_format(
    raw_dir: Path, out_dir: Path, fps=None, video=True, debug=False
):
    """Validate ``raw_dir``, load its recordings and package them as a dataset.

    Args:
        raw_dir: Directory holding the dora-record parquet files.
        out_dir: Forwarded to ``load_from_raw``.
        fps: Frame rate stored in the returned info; ``None`` means 30.
        video: Forwarded to the loaders and stored in the returned info.
        debug: Forwarded to ``load_from_raw``.

    Returns:
        A ``(hf_dataset, episode_data_index, info)`` tuple, where ``info`` is a
        dict with the keys ``"fps"`` and ``"video"``.

    Raises:
        ValueError: Propagated from ``check_format`` when ``raw_dir`` does not
            contain exactly one leader file.
    """
    # Fail early on a malformed directory before reading anything.
    check_format(raw_dir)

    fps = 30 if fps is None else fps

    frames, episode_data_index = load_from_raw(raw_dir, out_dir, fps, video, debug)
    dataset = to_hf_dataset(frames, video)

    return dataset, episode_data_index, {"fps": fps, "video": video}