diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
new file mode 100644
index 00000000..645f8a78
--- /dev/null
+++ b/.devcontainer/devcontainer.json
@@ -0,0 +1,20 @@
+{
+    "name": "ManiSkill Dev Container",
+    "image": "maniskill",
+    "workspaceFolder": "/lerobot",
+    "mounts": [
+        "source=${localWorkspaceFolder},target=/lerobot,type=bind,consistency=cached"
+    ],
+    "runArgs": [
+        "--network", "host",
+        "--gpus", "all",
+        "--runtime=nvidia"
+    ],
+    "settings": {
+        "terminal.integrated.defaultProfile.linux": "bash"
+    },
+    "extensions": [
+        "ms-python.python",
+        "ms-vscode.cpptools"
+    ]
+}
\ No newline at end of file
diff --git a/docker/manyskill-lerobot-gpu/10_nvidia.json b/docker/manyskill-lerobot-gpu/10_nvidia.json
new file mode 100644
index 00000000..2eac23bb
--- /dev/null
+++ b/docker/manyskill-lerobot-gpu/10_nvidia.json
@@ -0,0 +1,6 @@
+{
+    "file_format_version" : "1.0.0",
+    "ICD" : {
+        "library_path" : "libEGL_nvidia.so.0"
+    }
+}
\ No newline at end of file
diff --git a/docker/manyskill-lerobot-gpu/Dockerfile b/docker/manyskill-lerobot-gpu/Dockerfile
new file mode 100644
index 00000000..cca4a248
--- /dev/null
+++ b/docker/manyskill-lerobot-gpu/Dockerfile
@@ -0,0 +1,65 @@
+# Use the NVIDIA base image
+FROM nvidia/cudagl:11.3.1-devel-ubuntu20.04
+ENV NVIDIA_DRIVER_CAPABILITIES=all
+ARG PYTHON_VERSION=3.10
+
+# Install OS-level packages (sorted alphabetically, duplicates removed)
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    bash-completion \
+    build-essential \
+    ca-certificates \
+    cmake \
+    curl \
+    ffmpeg \
+    git \
+    git-lfs \
+    htop \
+    libegl1 \
+    libegl1-mesa \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    libjpeg-dev \
+    libpng-dev \
+    libvulkan1 \
+    libxext6 \
+    portaudio19-dev \
+    rsync \
+    tmux \
+    unzip \
+    vim \
+    vulkan-utils \
+    wget \
+    xvfb \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install (mini)conda
+RUN curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
+    chmod +x ~/miniconda.sh && \
+    ~/miniconda.sh -b -p /opt/conda && \
+    rm ~/miniconda.sh && \
+    /opt/conda/bin/conda init && \
+    /opt/conda/bin/conda install -y python="$PYTHON_VERSION" && \
+    /opt/conda/bin/conda clean -ya
+
+ENV PATH=/opt/conda/bin:$PATH
+SHELL ["/bin/bash", "-c"]
+
+# https://github.com/haosulab/ManiSkill/issues/9
+# Install Poetry
+RUN curl -sSL https://install.python-poetry.org | python3 -
+ENV PATH="/root/.local/bin:${PATH}"
+
+# Copy Vulkan JSON files
+COPY docker/manyskill-lerobot-gpu/nvidia_icd.json /usr/share/vulkan/icd.d/nvidia_icd.json
+COPY docker/manyskill-lerobot-gpu/nvidia_layers.json /etc/vulkan/implicit_layer.d/nvidia_layers.json
+
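+# The image is expected to be built from the repository root so that the
+# `COPY . /lerobot` step below captures the checkout, e.g. (the `maniskill`
+# tag matches the devcontainer config; the run flags mirror its runArgs):
+#   docker build -f docker/manyskill-lerobot-gpu/Dockerfile -t maniskill .
+#   docker run --gpus all --network host -it maniskill bash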
+
+# Install LeRobot
+COPY . /lerobot
+WORKDIR /lerobot
+RUN poetry install --sync --all-extras
+RUN pip install --upgrade mani-skill==3.0.0.b15 && pip cache purge
+
+# Download the PhysX GPU binary; enable_gpu() is expected to fail in a
+# GPU-less build environment, so the exit status is ignored
+RUN python -c "import sapien.physx as physx; physx.enable_gpu()" || true
diff --git a/docker/manyskill-lerobot-gpu/nvidia_icd.json b/docker/manyskill-lerobot-gpu/nvidia_icd.json
new file mode 100644
index 00000000..e7d75b24
--- /dev/null
+++ b/docker/manyskill-lerobot-gpu/nvidia_icd.json
@@ -0,0 +1,7 @@
+{
+    "file_format_version" : "1.0.0",
+    "ICD": {
+        "library_path": "libGLX_nvidia.so.0",
+        "api_version" : "1.2.155"
+    }
+}
\ No newline at end of file
diff --git a/docker/manyskill-lerobot-gpu/nvidia_layers.json b/docker/manyskill-lerobot-gpu/nvidia_layers.json
new file mode 100644
index 00000000..220cb42e
--- /dev/null
+++ b/docker/manyskill-lerobot-gpu/nvidia_layers.json
@@ -0,0 +1,21 @@
+{
+    "file_format_version" : "1.0.0",
+    "layer": {
+        "name": "VK_LAYER_NV_optimus",
+        "type": "INSTANCE",
+        "library_path": "libGLX_nvidia.so.0",
+        "api_version" : "1.2.155",
+        "implementation_version" : "1",
+        "description" : "NVIDIA Optimus layer",
+        "functions": {
+            "vkGetInstanceProcAddr": "vk_optimusGetInstanceProcAddr",
+            "vkGetDeviceProcAddr": "vk_optimusGetDeviceProcAddr"
+        },
+        "enable_environment": {
+            "__NV_PRIME_RENDER_OFFLOAD": "1"
+        },
+        "disable_environment": {
+            "DISABLE_LAYER_NV_OPTIMUS_1": ""
+        }
+    }
+}
\ No newline at end of file
diff --git a/lerobot/scripts/server/benchmark_buffer.py b/lerobot/scripts/server/benchmark_buffer.py
new file mode 100644
index 00000000..c83f422f
--- /dev/null
+++ b/lerobot/scripts/server/benchmark_buffer.py
@@ -0,0 +1,327 @@
+#!/usr/bin/env python
+"""
+Benchmark ReplayBuffer performance across device configurations.
+
+This script compares the performance of ReplayBuffer under different
+configurations:
+1. Pure GPU mode (storage and computation on GPU)
+2. Mixed mode (storage on CPU, computation on GPU)
+
+For each configuration, it benchmarks:
+- the add() method (adding transitions to the buffer)
+- the sample() method (batch size of 512 by default)
+"""
+
+import argparse
+import time
+from collections import defaultdict
+from typing import Dict, List, Optional, Tuple
+
+import matplotlib.pyplot as plt
+import numpy as np
+import torch
+
+from lerobot.scripts.server.buffer import ReplayBuffer
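+
+# NOTE: this benchmark assumes the ReplayBuffer interface exercised below:
+# add(state, action, reward, next_state, done, truncated), sample(batch_size),
+# and the constructor arguments passed in benchmark_buffer(). Adjust here if
+# the buffer API differs.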
+
+
+def generate_random_transition(
+    image_shape: Tuple[int, int, int] = (3, 224, 224),
+    state_shape: Tuple[int, ...] = (10,),
+    action_shape: Tuple[int, ...] = (6,),
+    device: str = "cpu",
+) -> Tuple[Dict[str, torch.Tensor], torch.Tensor, float, Dict[str, torch.Tensor], bool, bool]:
+    """Generate a random transition for testing."""
+    # Create a state with both image and vector components
+    state = {
+        "observation.image": torch.randn(1, *image_shape, device=device),
+        "observation.vector": torch.randn(1, *state_shape, device=device),
+    }
+
+    # Create a next_state with the same structure
+    next_state = {
+        "observation.image": torch.randn(1, *image_shape, device=device),
+        "observation.vector": torch.randn(1, *state_shape, device=device),
+    }
+
+    # Create a random action, reward and termination flags
+    action = torch.randn(1, *action_shape, device=device)
+    reward = float(torch.rand(1).item())
+    done = bool(torch.rand(1) > 0.9)  # 10% chance of being done
+    truncated = bool(torch.rand(1) > 0.95)  # 5% chance of being truncated
+
+    return state, action, reward, next_state, done, truncated
+
+
+def warm_up_gpu():
+    """Warm up the GPU to ensure consistent benchmarking."""
+    if torch.cuda.is_available():
+        # Run some operations to warm up the GPU
+        print("Warming up GPU...")
+        x = torch.randn(1000, 1000, device="cuda")
+        for _ in range(10):
+            x = torch.matmul(x, x)
+        # Clear the cache
+        torch.cuda.empty_cache()
+        print("GPU warm-up complete")
+
+
+def benchmark_buffer(
+    capacity: int = 100000,
+    add_count: int = 10000,
+    sample_count: int = 100,
+    batch_size: int = 512,
+    image_shape: Tuple[int, int, int] = (3, 224, 224),
+    state_shape: Tuple[int, ...] = (10,),
+    action_shape: Tuple[int, ...] = (6,),
+    configs: Optional[List[Dict]] = None,
+) -> Dict[str, Dict[str, float]]:
+    """
+    Benchmark ReplayBuffer with different configurations.
+
+    Args:
+        capacity: Buffer capacity
+        add_count: Number of transitions to add during the benchmark
+        sample_count: Number of sampling operations to benchmark
+        batch_size: Batch size for sampling
+        image_shape: Shape of image observations
+        state_shape: Shape of vector observations
+        action_shape: Shape of actions
+        configs: List of configurations to benchmark
+
+    Returns:
+        Dictionary with benchmark results
+    """
+    if configs is None:
+        configs = [
+            {
+                "name": "Pure GPU",
+                "device": "cuda:0",
+                "storage_device": "cuda:0",
+                "use_pinned_memory": False,
+                "async_transfer": False,
+            },
+            {
+                "name": "CPU Storage + GPU Compute",
+                "device": "cuda:0",
+                "storage_device": "cpu",
+                "use_pinned_memory": True,
+                "async_transfer": True,
+            },
+            {
+                "name": "CPU Storage + GPU Compute (No Pinned Memory)",
+                "device": "cuda:0",
+                "storage_device": "cpu",
+                "use_pinned_memory": False,
+                "async_transfer": False,
+            },
+            {
+                "name": "Pure CPU",
+                "device": "cpu",
+                "storage_device": "cpu",
+                "use_pinned_memory": False,
+                "async_transfer": False,
+            },
+        ]
+
+    results = defaultdict(dict)
+
+    for config in configs:
+        if not torch.cuda.is_available() and "cuda" in config["device"]:
+            print(f"Skipping {config['name']} as CUDA is not available")
+            continue
+
+        print(f"\nBenchmarking configuration: {config['name']}")
+        print(f"  - Compute device: {config['device']}")
+        print(f"  - Storage device: {config['storage_device']}")
+        print(f"  - Pinned memory: {config['use_pinned_memory']}")
+        print(f"  - Async transfer: {config['async_transfer']}")
+
+        # Create a buffer with this configuration
+        buffer = ReplayBuffer(
+            capacity=capacity,
+            device=config["device"],
+            storage_device=config["storage_device"],
+            use_pinned_memory=config["use_pinned_memory"],
+            async_transfer=config["async_transfer"],
+            optimize_memory=False,  # Keep it simple for benchmarking
+        )
+
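+        # Pinned (page-locked) host memory lets Tensor.to(device, non_blocking=True)
+        # overlap host-to-device copies with compute; the "async transfer"
+        # configurations are meant to exercise exactly that path.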
+        # Benchmark the add() operation.
+        # Generate source transitions on the compute device so that the pure-GPU
+        # config measures device-side adds rather than host-to-device copies.
+        initial_device = "cuda:0" if config["device"] == "cuda:0" else "cpu"
+        add_times = []
+        for i in range(add_count):
+            state, action, reward, next_state, done, truncated = generate_random_transition(
+                image_shape=image_shape,
+                state_shape=state_shape,
+                action_shape=action_shape,
+                device=initial_device,
+            )
+
+            # Measure the add time
+            start_time = time.perf_counter()
+            buffer.add(state, action, reward, next_state, done, truncated)
+            end_time = time.perf_counter()
+
+            add_times.append(end_time - start_time)
+
+            # Print progress (guard the modulus for small add_count)
+            if (i + 1) % max(1, add_count // 10) == 0:
+                print(f"  Added {i + 1}/{add_count} transitions")
+
+        # Ensure the buffer has enough samples for the sampling benchmark
+        while len(buffer) < batch_size:
+            state, action, reward, next_state, done, truncated = generate_random_transition(
+                image_shape=image_shape,
+                state_shape=state_shape,
+                action_shape=action_shape,
+                device=initial_device,
+            )
+            buffer.add(state, action, reward, next_state, done, truncated)
+
+        # Benchmark the sample() operation
+        sample_times = []
+        for i in range(sample_count):
+            # Synchronize the GPU before timing (for a fair comparison)
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
+
+            start_time = time.perf_counter()
+            batch = buffer.sample(batch_size)
+
+            # Ensure computation is complete before stopping the timer
+            if torch.cuda.is_available() and "cuda" in config["device"]:
+                torch.cuda.synchronize()
+
+            end_time = time.perf_counter()
+            sample_times.append(end_time - start_time)
+
+            # Print progress
+            if (i + 1) % max(1, sample_count // 5) == 0:
+                print(f"  Sampled {i + 1}/{sample_count} batches")
+
+        # Record the results, skipping the first add() calls as warm-up
+        # (capped so the slice is never empty for short runs)
+        warmup = min(100, len(add_times) // 2)
+        results[config["name"]]["add_avg_ms"] = np.mean(add_times[warmup:]) * 1000
+        results[config["name"]]["add_min_ms"] = np.min(add_times[warmup:]) * 1000
+        results[config["name"]]["add_max_ms"] = np.max(add_times[warmup:]) * 1000
+        results[config["name"]]["sample_avg_ms"] = np.mean(sample_times) * 1000
+        results[config["name"]]["sample_min_ms"] = np.min(sample_times) * 1000
+        results[config["name"]]["sample_max_ms"] = np.max(sample_times) * 1000
+
+        # Force cleanup
+        del buffer
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+    return results
+
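+
+# A minimal programmatic run (bypassing the CLI in main() below), handy when
+# iterating on buffer changes; the small shapes keep it fast:
+#   results = benchmark_buffer(capacity=1000, add_count=200, sample_count=10,
+#                              batch_size=32, image_shape=(3, 64, 64))
+#   print_results(results)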
+
+
+def plot_results(results: Dict[str, Dict[str, float]], output_path: Optional[str] = None):
+    """Plot benchmark results."""
+    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
+
+    # Extract the data for plotting
+    configs = list(results.keys())
+    add_times = [results[config]["add_avg_ms"] for config in configs]
+    add_mins = [results[config]["add_min_ms"] for config in configs]
+    add_maxs = [results[config]["add_max_ms"] for config in configs]
+
+    sample_times = [results[config]["sample_avg_ms"] for config in configs]
+    sample_mins = [results[config]["sample_min_ms"] for config in configs]
+    sample_maxs = [results[config]["sample_max_ms"] for config in configs]
+
+    # add() operation plot
+    bar_width = 0.5
+    x = np.arange(len(configs))
+    bars1 = ax1.bar(x, add_times, bar_width, label='Average', color='skyblue')
+
+    # Add asymmetric error bars (min/max around the average)
+    ax1.errorbar(x, add_times, yerr=[
+        np.array(add_times) - np.array(add_mins),
+        np.array(add_maxs) - np.array(add_times)
+    ], fmt='none', ecolor='black', capsize=5)
+
+    ax1.set_xlabel('Configuration')
+    ax1.set_ylabel('Time (ms)')
+    ax1.set_title('add() Operation Performance')
+    ax1.set_xticks(x)
+    ax1.set_xticklabels(configs, rotation=45, ha='right')
+
+    # Annotate the bars with values
+    for bar in bars1:
+        height = bar.get_height()
+        ax1.text(bar.get_x() + bar.get_width() / 2., height + 0.1,
+                 f'{height:.2f}ms', ha='center', va='bottom', rotation=0)
+
+    # sample() operation plot
+    bars2 = ax2.bar(x, sample_times, bar_width, label='Average', color='lightgreen')
+
+    # Add asymmetric error bars (min/max around the average)
+    ax2.errorbar(x, sample_times, yerr=[
+        np.array(sample_times) - np.array(sample_mins),
+        np.array(sample_maxs) - np.array(sample_times)
+    ], fmt='none', ecolor='black', capsize=5)
+
+    ax2.set_xlabel('Configuration')
+    ax2.set_ylabel('Time (ms)')
+    ax2.set_title('sample() Operation Performance')
+    ax2.set_xticks(x)
+    ax2.set_xticklabels(configs, rotation=45, ha='right')
+
+    # Annotate the bars with values
+    for bar in bars2:
+        height = bar.get_height()
+        ax2.text(bar.get_x() + bar.get_width() / 2., height + 0.1,
+                 f'{height:.2f}ms', ha='center', va='bottom', rotation=0)
+
+    plt.tight_layout()
+
+    if output_path:
+        plt.savefig(output_path)
+        print(f"Results saved to {output_path}")
+
+    plt.show()
+
+
+def print_results(results: Dict[str, Dict[str, float]]):
+    """Print benchmark results in a formatted table."""
+    print("\n=== BENCHMARK RESULTS ===")
+
+    # Header
+    print(f"{'Configuration':<40} | {'add() avg':<10} | {'add() min':<10} | {'add() max':<10} | "
+          f"{'sample() avg':<12} | {'sample() min':<12} | {'sample() max':<12}")
+    print("-" * 120)
+
+    # Data rows (all times in milliseconds)
+    for config, metrics in results.items():
+        print(f"{config:<40} | "
+              f"{metrics['add_avg_ms']:<10.2f} | "
+              f"{metrics['add_min_ms']:<10.2f} | "
+              f"{metrics['add_max_ms']:<10.2f} | "
+              f"{metrics['sample_avg_ms']:<12.2f} | "
+              f"{metrics['sample_min_ms']:<12.2f} | "
+              f"{metrics['sample_max_ms']:<12.2f}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Benchmark ReplayBuffer performance")
+    parser.add_argument("--capacity", type=int, default=50000, help="Buffer capacity")
+    parser.add_argument("--add-count", type=int, default=10000, help="Number of add operations to benchmark")
+    parser.add_argument("--sample-count", type=int, default=100, help="Number of sample operations to benchmark")
+    parser.add_argument("--batch-size", type=int, default=512, help="Batch size for sampling")
+    parser.add_argument("--output", type=str, default=None, help="Path to save the results plot")
+
+    args = parser.parse_args()
+
+    # Check whether CUDA is available
+    if not torch.cuda.is_available():
+        print("WARNING: CUDA is not available, only CPU benchmarks will be run")
+    else:
+        warm_up_gpu()
+
+    # Run the benchmark
+    results = benchmark_buffer(
+        capacity=args.capacity,
+        add_count=args.add_count,
+        sample_count=args.sample_count,
+        batch_size=args.batch_size,
+    )
+
+    # Print and plot the results
+    print_results(results)
+    plot_results(results, args.output)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/lerobot/scripts/server/load_dataset.py b/lerobot/scripts/server/load_dataset.py
new file mode 100644
index 00000000..2db1beb8
--- /dev/null
+++ b/lerobot/scripts/server/load_dataset.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+
+import logging
+
+import torch
+
+from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def main():
+    # Initialize the dataset
+    dataset = LeRobotDataset(
+        repo_id="aractingi/pushcube_gamepad",
+        download_videos=True,  # Set to False if you don't need video data
+    )
+
+    # Print dataset information
+    logger.info("Dataset loaded successfully!")
+    logger.info(f"Number of episodes: {dataset.num_episodes}")
+    logger.info(f"Number of frames: {dataset.num_frames}")
+    logger.info(f"FPS: {dataset.fps}")
+    logger.info(f"Features: {list(dataset.features.keys())}")
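+
+    # LeRobotDataset behaves like a regular torch-style dataset: integer
+    # indexing returns a dict of tensors for one frame, keyed by feature name.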
+
+    # Get a sample frame
+    sample = dataset[0]
+    logger.info(f"\nSample frame keys: {list(sample.keys())}")
+
+    # Print the shapes of some key features
+    for key in ["observation.images.laptop", "observation.images.phone"]:
+        if key in sample:
+            logger.info(f"Shape of {key}: {sample[key].shape}")
+
+    # Print task information
+    logger.info(f"\nTotal tasks: {dataset.meta.total_tasks}")
+    logger.info("Tasks:")
+    for task_idx, task in dataset.meta.tasks.items():
+        logger.info(f"  {task_idx}: {task}")
+
+    # Scan the whole dataset for the global min/max of the state observations
+    global_min = float("inf")
+    global_max = float("-inf")
+    for frame in dataset:
+        state = frame.get("observation.state")
+        if isinstance(state, torch.Tensor):
+            global_min = min(global_min, torch.min(state).item())
+            global_max = max(global_max, torch.max(state).item())
+    logger.info(f"observation.state range: [{global_min}, {global_max}]")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/lerobot/scripts/server/load_dataset_and_buffer.py b/lerobot/scripts/server/load_dataset_and_buffer.py
new file mode 100644
index 00000000..ff2959be
--- /dev/null
+++ b/lerobot/scripts/server/load_dataset_and_buffer.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+
+import logging
+
+import torch
+
+from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
+from lerobot.scripts.server.buffer import ReplayBuffer
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def main():
+    # Initialize the dataset
+    logger.info("Loading LeRobotDataset...")
+    dataset = LeRobotDataset(
+        repo_id="aractingi/pushcube_gamepad",
+        download_videos=True,  # Set to False if you don't need video data
+    )
+
+    # Print dataset information
+    logger.info("Dataset loaded successfully!")
+    logger.info(f"Number of episodes: {dataset.num_episodes}")
+    logger.info(f"Number of frames: {dataset.num_frames}")
+    logger.info(f"FPS: {dataset.fps}")
+    logger.info(f"Features: {list(dataset.features.keys())}")
+
+    # Convert the dataset to a ReplayBuffer
+    logger.info("Converting dataset to ReplayBuffer...")
+
+    # Use every observation key from the first sample as the state
+    sample = dataset[0]
+    state_keys = [key for key in sample.keys() if "observation" in key]
+    logger.info(f"Using observation keys: {state_keys}")
+
+    # Create a ReplayBuffer from the dataset
+    buffer = ReplayBuffer.from_lerobot_dataset(
+        lerobot_dataset=dataset,
+        device="cuda:0" if torch.cuda.is_available() else "cpu",
+        state_keys=state_keys,
+        capacity=None,  # Use all data from the dataset
+        use_drq=True,
+        optimize_memory=False,
+    )
+
+    logger.info(f"ReplayBuffer created with {len(buffer)} transitions")
+
+    # Sample from the buffer and display some information
+    if len(buffer) > 0:
+        batch_size = min(5, len(buffer))
+        logger.info(f"Sampling {batch_size} transitions from the buffer...")
+
+        batch = buffer.sample(batch_size)
+
+        logger.info(f"Batch keys: {list(batch.keys())}")
+
+        # Print the shapes of the state tensors
+        logger.info("State shapes:")
+        for key, tensor in batch["state"].items():
+            logger.info(f"  {key}: {tensor.shape}")
+
+        # Print action and reward information
+        logger.info(f"Action shape: {batch['action'].shape}")
+        logger.info(f"Reward shape: {batch['reward'].shape}")
+        logger.info(f"Sample rewards: {batch['reward']}")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file