From 609531677b5cb5311ebb3f7e480d8a3dba034a68 Mon Sep 17 00:00:00 2001
From: Simon Alibert
Date: Thu, 1 Aug 2024 11:50:40 +0200
Subject: [PATCH] WIP

---
 dev.py                            | 46 +++++++++++++++++
 tests/scripts/upload_artifacts.py | 84 +++++++++++++++++++++++++++++++
 2 files changed, 130 insertions(+)
 create mode 100644 dev.py
 create mode 100644 tests/scripts/upload_artifacts.py

diff --git a/dev.py b/dev.py
new file mode 100644
index 00000000..ad0486cc
--- /dev/null
+++ b/dev.py
@@ -0,0 +1,46 @@
+import os
+import time
+
+import numpy as np
+import torch
+
+# Parameters
+filename = "benchmark_data.dat"
+shape = (10000, 10000)  # Large array
+dtype = np.float32
+torch_dtype = torch.float32
+
+# Calculate file size
+element_size = np.dtype(dtype).itemsize
+file_size = shape[0] * shape[1] * element_size
+
+# Create a large file and write random data to it
+if not os.path.exists(filename) or os.path.getsize(filename) != file_size:
+    data = np.random.rand(*shape).astype(dtype)
+    with open(filename, "wb") as f:
+        f.write(data.tobytes())
+
+# Benchmark numpy.memmap
+start_time = time.time()
+data_np = np.memmap(filename, dtype=dtype, mode="r", shape=shape)
+tensor_np = torch.from_numpy(data_np)
+np_load_time = time.time() - start_time
+print(f"np.memmap load time: {np_load_time:.4f} seconds")
+
+# Benchmark torch.UntypedStorage
+start_time = time.time()
+storage = torch.UntypedStorage.from_file(filename, shared=True, nbytes=file_size)
+tensor = torch.FloatTensor(storage).reshape(shape)
+torch_load_time = time.time() - start_time
+print(f"torch.UntypedStorage load time: {torch_load_time:.4f} seconds")
+
+# Set NumPy print precision
+# np.set_printoptions(precision=4)
+
+# Print part of the arrays to compare precision
+print("NumPy memmap array sample:\n", data_np[:5, :5])
+print("PyTorch tensor sample:\n", tensor[:5, :5].numpy())
+
+# Output the results
+print(f"Numpy memmap load time: {np_load_time:.4f} seconds")
+print(f"Torch UntypedStorage load time: {torch_load_time:.4f} seconds")
diff --git a/tests/scripts/upload_artifacts.py b/tests/scripts/upload_artifacts.py
new file mode 100644
index 00000000..82bc7f9a
--- /dev/null
+++ b/tests/scripts/upload_artifacts.py
@@ -0,0 +1,84 @@
+import json
+from pathlib import Path
+
+from huggingface_hub import HfApi
+
+from lerobot import available_datasets
+from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION
+
+api = HfApi()
+LOCAL_DIR = Path("outputs/test_artifacts/")
+# LOCAL_DIR = Path("tests/data/")
+
+datasets_info = api.list_datasets(author="lerobot")
+hub_available_datasets = [info.id for info in datasets_info if info.id in available_datasets]
+
+for repo_id in hub_available_datasets:
+    print(repo_id)
+
+    dataset_info = api.list_repo_refs(repo_id, repo_type="dataset")
+    branches = [b.name for b in dataset_info.branches]
+    if CODEBASE_VERSION in branches:
+        # if "_image" not in repo_id:
+        #     print(f"{repo_id} already @{CODEBASE_VERSION}, skipping.")
+        continue
+    else:
+        # Check if meta_data/info.json exists in the main branch
+        files = api.list_repo_files(repo_id, repo_type="dataset", revision="main")
+        info_file_path = "meta_data/info.json"
+
+        if info_file_path in files:
+            local_dir = LOCAL_DIR / repo_id
+            local_dir.mkdir(exist_ok=True, parents=True)
+            # Download the meta_data/info.json file from the main branch
+            local_info_file_path = api.hf_hub_download(
+                repo_id=repo_id,
+                filename=info_file_path,
+                revision="main",
+                repo_type="dataset",
+                local_dir=local_dir,
+            )
+        else:
+            continue
+
+        with open(local_info_file_path) as f:
+            info_data = json.load(f)
+
+        # Update the JSON data
+        new_info_data = {}
+        new_info_data["codebase_version"] = CODEBASE_VERSION
+        for k, v in info_data.items():
+            if k != "codebase_version":
+                new_info_data[k] = v
+
+        # Save the updated JSON file
+        with open(local_info_file_path, "w") as f:
+            json.dump(new_info_data, f, indent=4)
+
+        # Upload the modified file to the new branch
+        api.upload_file(
+            path_or_fileobj=local_info_file_path,
+            path_in_repo=info_file_path,
+            repo_id=repo_id,
+            repo_type="dataset",
+            commit_message=f"Update meta_data/info.json for {CODEBASE_VERSION}",
+            revision="main",
+        )
+        print(f"{repo_id} meta_data/info.json updated with new codebase version")
+
+        # Now create a branch named after the new version by branching out from "main"
+        # which is expected to be the preceding version
+        api.create_branch(repo_id, repo_type="dataset", branch=CODEBASE_VERSION, revision="main")
+        print(f"{repo_id} successfully updated @{CODEBASE_VERSION}")
+
+
+def main():
+    # TODO: from list of repos, download:
+    # - data/
+    # - meta_data/
+    # - video/{key}_episode_000001.mp4
+    ...
+
+
+if __name__ == "__main__":
+    main()
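
Note on the benchmark in dev.py: both np.memmap and torch.UntypedStorage.from_file map the file lazily, so the timed sections above mostly measure mmap() setup rather than actual disk I/O. A minimal sketch of how the comparison could be extended to force a full read (an illustration, not part of the patch; it assumes benchmark_data.dat was already produced by dev.py):

import time

import numpy as np
import torch

filename = "benchmark_data.dat"
shape = (10000, 10000)
file_size = shape[0] * shape[1] * np.dtype(np.float32).itemsize

start = time.time()
storage = torch.UntypedStorage.from_file(filename, shared=True, nbytes=file_size)
tensor = torch.FloatTensor(storage).reshape(shape)
checksum = tensor.sum().item()  # touching every element forces the pages in from disk
print(f"torch map + full read: {time.time() - start:.4f} seconds (sum={checksum:.2f})")

start = time.time()
data_np = np.memmap(filename, dtype=np.float32, mode="r", shape=shape)
checksum_np = float(data_np.sum())  # same forced read for the memmap path
print(f"numpy map + full read: {time.time() - start:.4f} seconds (sum={checksum_np:.2f})")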
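
The trailing main() in tests/scripts/upload_artifacts.py is still a TODO. One possible shape for it, sketched under the assumption that huggingface_hub.snapshot_download with allow_patterns is the intended download mechanism; the patterns below simply mirror the paths listed in the TODO and are not confirmed by the patch:

from pathlib import Path

from huggingface_hub import snapshot_download

from lerobot import available_datasets
from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION

LOCAL_DIR = Path("outputs/test_artifacts/")


def main():
    for repo_id in available_datasets:
        # Hypothetical: fetch only the artifacts named in the TODO for each repo.
        snapshot_download(
            repo_id,
            repo_type="dataset",
            revision=CODEBASE_VERSION,
            local_dir=LOCAL_DIR / repo_id,
            allow_patterns=["data/*", "meta_data/*", "video/*_episode_000001.mp4"],
        )


if __name__ == "__main__":
    main()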