Simon Alibert 2024-08-01 11:50:40 +02:00
parent f8a6574698
commit 609531677b
2 changed files with 130 additions and 0 deletions

dev.py (new file)

@@ -0,0 +1,46 @@
import os
import time

import numpy as np
import torch

# Parameters
filename = "benchmark_data.dat"
shape = (10000, 10000)  # Large array
dtype = np.float32
torch_dtype = torch.float32

# Calculate file size
element_size = np.dtype(dtype).itemsize
file_size = shape[0] * shape[1] * element_size

# Create a large file and write random data to it
if not os.path.exists(filename) or os.path.getsize(filename) != file_size:
    data = np.random.rand(*shape).astype(dtype)
    with open(filename, "wb") as f:
        f.write(data.tobytes())

# Benchmark numpy.memmap
start_time = time.time()
data_np = np.memmap(filename, dtype=dtype, mode="r", shape=shape)
tensor_np = torch.from_numpy(data_np)
np_load_time = time.time() - start_time
print(f"np.memmap load time: {np_load_time:.4f} seconds")

# Benchmark torch.UntypedStorage
start_time = time.time()
storage = torch.UntypedStorage.from_file(filename, shared=True, nbytes=file_size)
tensor = torch.FloatTensor(storage).reshape(shape)
torch_load_time = time.time() - start_time
print(f"torch.UntypedStorage load time: {torch_load_time:.4f} seconds")

# Set NumPy print precision
# np.set_printoptions(precision=4)

# Print part of the arrays to compare precision
print("NumPy memmap array sample:\n", data_np[:5, :5])
print("PyTorch tensor sample:\n", tensor[:5, :5].numpy())

# Output the results
print(f"Numpy memmap load time: {np_load_time:.4f} seconds")
print(f"Torch UntypedStorage load time: {torch_load_time:.4f} seconds")

@@ -0,0 +1,84 @@
import json
from pathlib import Path

from huggingface_hub import HfApi

from lerobot import available_datasets
from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION

api = HfApi()

LOCAL_DIR = Path("outputs/test_artifacts/")
# LOCAL_DIR = Path("tests/data/")

datasets_info = api.list_datasets(author="lerobot")
hub_available_datasets = [info.id for info in datasets_info if info.id in available_datasets]

for repo_id in hub_available_datasets:
    print(repo_id)
    dataset_info = api.list_repo_refs(repo_id, repo_type="dataset")
    branches = [b.name for b in dataset_info.branches]
    if CODEBASE_VERSION in branches:
        # if "_image" not in repo_id:
        #     print(f"{repo_id} already @{CODEBASE_VERSION}, skipping.")
        continue
    else:
        # Check if meta_data/info.json exists in the main branch
        files = api.list_repo_files(repo_id, repo_type="dataset", revision="main")
        info_file_path = "meta_data/info.json"
        if info_file_path in files:
            local_dir = LOCAL_DIR / repo_id
            local_dir.mkdir(exist_ok=True, parents=True)

            # Download the meta_data/info.json file from the main branch
            local_info_file_path = api.hf_hub_download(
                repo_id=repo_id,
                filename=info_file_path,
                revision="main",
                repo_type="dataset",
                local_dir=local_dir,
            )
        else:
            continue

        with open(local_info_file_path) as f:
            info_data = json.load(f)

        # Update the JSON data
        new_info_data = {}
        new_info_data["codebase_version"] = CODEBASE_VERSION
        for k, v in info_data.items():
            if k != "codebase_version":
                new_info_data[k] = v

        # Save the updated JSON file
        with open(local_info_file_path, "w") as f:
            json.dump(new_info_data, f, indent=4)

        # Upload the modified file to the new branch
        api.upload_file(
            path_or_fileobj=local_info_file_path,
            path_in_repo=info_file_path,
            repo_id=repo_id,
            repo_type="dataset",
            commit_message=f"Update meta_data/info.json for {CODEBASE_VERSION}",
            revision="main",
        )
        print(f"{repo_id} meta_data/info.json updated with new codebase version")

    # Now create a branch named after the new version by branching out from "main"
    # which is expected to be the preceding version
    api.create_branch(repo_id, repo_type="dataset", branch=CODEBASE_VERSION, revision="main")
    print(f"{repo_id} successfully updated @{CODEBASE_VERSION}")


def main():
    # TODO: from list of repos, download:
    # - data/
    # - meta_data/
    # - video/{key}_episode_000001.mp4
    ...


if __name__ == "__main__":
    main()
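One possible sketch of the TODO in main(), assuming huggingface_hub.snapshot_download with allow_patterns is an acceptable way to fetch just those artifacts; the download_test_artifacts name and the exact video glob are illustrative, not part of the commit:

    from huggingface_hub import snapshot_download

    def download_test_artifacts(repo_id: str) -> None:
        # Fetch only the artifacts listed in the TODO: data/, meta_data/,
        # and the first episode video per camera key (glob is a guess).
        snapshot_download(
            repo_id,
            repo_type="dataset",
            revision=CODEBASE_VERSION,
            local_dir=LOCAL_DIR / repo_id,
            allow_patterns=["data/*", "meta_data/*", "video/*_episode_000001.mp4"],
        )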