From 609531677b5cb5311ebb3f7e480d8a3dba034a68 Mon Sep 17 00:00:00 2001
From: Simon Alibert
Date: Thu, 1 Aug 2024 11:50:40 +0200
Subject: [PATCH] WIP

---
 dev.py                            | 46 +++++++++++++++++
 tests/scripts/upload_artifacts.py | 84 +++++++++++++++++++++++++++++++
 2 files changed, 130 insertions(+)
 create mode 100644 dev.py
 create mode 100644 tests/scripts/upload_artifacts.py

diff --git a/dev.py b/dev.py
new file mode 100644
index 00000000..ad0486cc
--- /dev/null
+++ b/dev.py
@@ -0,0 +1,46 @@
+import os
+import time
+
+import numpy as np
+import torch
+
+# Parameters
+filename = "benchmark_data.dat"
+shape = (10000, 10000)  # Large array
+dtype = np.float32
+torch_dtype = torch.float32
+
+# Calculate file size
+element_size = np.dtype(dtype).itemsize
+file_size = shape[0] * shape[1] * element_size
+
+# Create a large file and write random data to it
+if not os.path.exists(filename) or os.path.getsize(filename) != file_size:
+    data = np.random.rand(*shape).astype(dtype)
+    with open(filename, "wb") as f:
+        f.write(data.tobytes())
+
+# Benchmark numpy.memmap
+start_time = time.time()
+data_np = np.memmap(filename, dtype=dtype, mode="r", shape=shape)
+tensor_np = torch.from_numpy(data_np)
+np_load_time = time.time() - start_time
+print(f"np.memmap load time: {np_load_time:.4f} seconds")
+
+# Benchmark torch.UntypedStorage
+start_time = time.time()
+storage = torch.UntypedStorage.from_file(filename, shared=True, nbytes=file_size)
+tensor = torch.FloatTensor(storage).reshape(shape)
+torch_load_time = time.time() - start_time
+print(f"torch.UntypedStorage load time: {torch_load_time:.4f} seconds")
+
+# Set NumPy print precision
+# np.set_printoptions(precision=4)
+
+# Print part of the arrays to compare precision
+print("NumPy memmap array sample:\n", data_np[:5, :5])
+print("PyTorch tensor sample:\n", tensor[:5, :5].numpy())
+
+# Output the results
+print(f"Numpy memmap load time: {np_load_time:.4f} seconds")
+print(f"Torch UntypedStorage load time: {torch_load_time:.4f} seconds")
diff --git a/tests/scripts/upload_artifacts.py b/tests/scripts/upload_artifacts.py
new file mode 100644
index 00000000..82bc7f9a
--- /dev/null
+++ b/tests/scripts/upload_artifacts.py
@@ -0,0 +1,84 @@
+import json
+from pathlib import Path
+
+from huggingface_hub import HfApi
+
+from lerobot import available_datasets
+from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION
+
+api = HfApi()
+LOCAL_DIR = Path("outputs/test_artifacts/")
+# LOCAL_DIR = Path("tests/data/")
+
+datasets_info = api.list_datasets(author="lerobot")
+hub_available_datasets = [info.id for info in datasets_info if info.id in available_datasets]
+
+for repo_id in hub_available_datasets:
+    print(repo_id)
+
+    dataset_info = api.list_repo_refs(repo_id, repo_type="dataset")
+    branches = [b.name for b in dataset_info.branches]
+    if CODEBASE_VERSION in branches:
+        # if "_image" not in repo_id:
+        #     print(f"{repo_id} already @{CODEBASE_VERSION}, skipping.")
+        continue
+    else:
+        # Check if meta_data/info.json exists in the main branch
+        files = api.list_repo_files(repo_id, repo_type="dataset", revision="main")
+        info_file_path = "meta_data/info.json"
+
+        if info_file_path in files:
+            local_dir = LOCAL_DIR / repo_id
+            local_dir.mkdir(exist_ok=True, parents=True)
+            # Download the meta_data/info.json file from the main branch
+            local_info_file_path = api.hf_hub_download(
+                repo_id=repo_id,
+                filename=info_file_path,
+                revision="main",
+                repo_type="dataset",
+                local_dir=local_dir,
+            )
+        else:
+            continue
+
+        with open(local_info_file_path) as f:
+            info_data = json.load(f)
+
+        # Update the JSON data
+        new_info_data = {}
+        new_info_data["codebase_version"] = CODEBASE_VERSION
+        for k, v in info_data.items():
+            if k != "codebase_version":
+                new_info_data[k] = v
+
+        # Save the updated JSON file
+        with open(local_info_file_path, "w") as f:
+            json.dump(new_info_data, f, indent=4)
+
+        # Upload the modified file to the new branch
+        api.upload_file(
+            path_or_fileobj=local_info_file_path,
+            path_in_repo=info_file_path,
+            repo_id=repo_id,
+            repo_type="dataset",
+            commit_message=f"Update meta_data/info.json for {CODEBASE_VERSION}",
+            revision="main",
+        )
+        print(f"{repo_id} meta_data/info.json updated with new codebase version")
+
+        # Now create a branch named after the new version by branching out from "main"
+        # which is expected to be the preceding version
+        api.create_branch(repo_id, repo_type="dataset", branch=CODEBASE_VERSION, revision="main")
+        print(f"{repo_id} successfully updated @{CODEBASE_VERSION}")
+
+
+def main():
+    # TODO: from list of repos, download:
+    # - data/
+    # - meta_data/
+    # - video/{key}_episode_000001.mp4
+    ...
+
+
+if __name__ == "__main__":
+    main()
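
Note on the benchmark in dev.py: both np.memmap and torch.UntypedStorage.from_file map the file lazily, so the timed sections above mostly measure mmap() setup rather than actual disk I/O. A minimal sketch of how the comparison could be extended to force a full read (an illustration, not part of the patch; it assumes benchmark_data.dat was already produced by dev.py):

import time

import numpy as np
import torch

filename = "benchmark_data.dat"
shape = (10000, 10000)
file_size = shape[0] * shape[1] * np.dtype(np.float32).itemsize

start = time.time()
storage = torch.UntypedStorage.from_file(filename, shared=True, nbytes=file_size)
tensor = torch.FloatTensor(storage).reshape(shape)
checksum = tensor.sum().item()  # touching every element forces the pages in from disk
print(f"torch map + full read: {time.time() - start:.4f} seconds (sum={checksum:.2f})")

start = time.time()
data_np = np.memmap(filename, dtype=np.float32, mode="r", shape=shape)
checksum_np = float(data_np.sum())  # same forced read for the memmap path
print(f"numpy map + full read: {time.time() - start:.4f} seconds (sum={checksum_np:.2f})")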
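
The trailing main() in tests/scripts/upload_artifacts.py is still a TODO. One possible shape for it, sketched under the assumption that huggingface_hub.snapshot_download with allow_patterns is the intended download mechanism; the patterns below simply mirror the paths listed in the TODO and are not confirmed by the patch:

from pathlib import Path

from huggingface_hub import snapshot_download

from lerobot import available_datasets
from lerobot.common.datasets.lerobot_dataset import CODEBASE_VERSION

LOCAL_DIR = Path("outputs/test_artifacts/")


def main():
    for repo_id in available_datasets:
        # Hypothetical: fetch only the artifacts named in the TODO for each repo.
        snapshot_download(
            repo_id,
            repo_type="dataset",
            revision=CODEBASE_VERSION,
            local_dir=LOCAL_DIR / repo_id,
            allow_patterns=["data/*", "meta_data/*", "video/*_episode_000001.mp4"],
        )


if __name__ == "__main__":
    main()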