From 4fb01aef68d1de62a1ceb62eb9ee567262024959 Mon Sep 17 00:00:00 2001 From: ivelin Date: Mon, 23 Dec 2024 15:01:43 -0600 Subject: [PATCH 1/5] feat: smolvlm model with pusht train notebook Signed-off-by: ivelin --- .../common/policies/smolvlm/Smol_VLM_FT.ipynb | 413 +++++++++++++++ .../policies/smolvlm/Smol_VLM_lerobot.ipynb | 499 ++++++++++++++++++ .../common/policies/smolvlm/requirements.txt | 7 + 3 files changed, 919 insertions(+) create mode 100644 lerobot/common/policies/smolvlm/Smol_VLM_FT.ipynb create mode 100644 lerobot/common/policies/smolvlm/Smol_VLM_lerobot.ipynb create mode 100644 lerobot/common/policies/smolvlm/requirements.txt diff --git a/lerobot/common/policies/smolvlm/Smol_VLM_FT.ipynb b/lerobot/common/policies/smolvlm/Smol_VLM_FT.ipynb new file mode 100644 index 00000000..93907ef1 --- /dev/null +++ b/lerobot/common/policies/smolvlm/Smol_VLM_FT.ipynb @@ -0,0 +1,413 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nc0g2NLpUSGr" + }, + "source": [ + "# Fine-tune SmolVLM on Visual Question Answering using Consumer GPU with QLoRA\n", + "\n", + "In this notebook we will fine-tune SmolVLM VQAv2 dataset. With this notebook you can also fine-tune Idefics3, since both models have the same model class/architecture.\n", + "\n", + "We will use some techniques in this notebook that will let you fine-tune the model on L4 with batch size of 4 only using around 16.4 GB of VRAM. We ran this notebook in that setup to test, but because we were able to afford A100 this notebook was last ran on an A100." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WIhA1lQ7j0kw" + }, + "outputs": [], + "source": [ + "!pip install -q accelerate datasets peft bitsandbytes tensorboard" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XyJaqZZ3uYYl" + }, + "outputs": [], + "source": [ + "!pip install -q flash-attn --no-build-isolation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wAeMA0heVBjT" + }, + "source": [ + "We will push out model to Hub so we need to authenticate ourselves." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yKd5xtSGj7cm" + }, + "outputs": [], + "source": [ + "from huggingface_hub import notebook_login\n", + "\n", + "notebook_login()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WRq8ve-LVAzU" + }, + "source": [ + "In this notebook we will not do full fine-tuning but use QLoRA method, which loads an adapter to the quantized version of the model, saving space. If you want to do full fine-tuning, set `USE_LORA` and `USE_QLORA` to False. If you want to do LoRA, set `USE_QLORA`Β to False and `USE_LORA`Β to True." 
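Note: if you set both `USE_LORA` and `USE_QLORA` to False, the full fine-tuning branch in the model-loading cell below moves the model with `.to(DEVICE)`, but `DEVICE` is never defined in this notebook. A minimal sketch of the missing definition, assuming a single CUDA device (an editorial assumption, not part of the original cell):

```python
import torch

# Assumed definition, adjust to your setup; needed only for the full
# fine-tuning branch, which calls .to(DEVICE).
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
```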
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "referenced_widgets": [ + "23d3d175e6e642c7abc2bce09b73cf4d", + "db6ca8f47f274464b135909c907c946a", + "d05822c6293c424fbf9df6ec0f6b532b", + "05582fca18f443d6965776a721e30e9f", + "3d8974fd1ba9415c8070c1eab8ad75cb", + "648257c1b1c24e25a26355bddf75aa41", + "afa9a31c6b7f45e082ae07dea4a2600e", + "92232af543a4446cac53e4fcf3f4b6e1", + "a5f06e59634f4edf9f3d9409846a2b31", + "7ddfa8718bc24882ba2b50a899656107", + "5983728a1c1e43edb4d16bee6ad40171", + "dff574197f1f4466abb0eb46d36b8378" + ] + }, + "id": "b9CDMq0duYYn", + "outputId": "65a4a5fa-fe4d-4243-b2d7-405a8aa81c04" + }, + "outputs": [], + "source": [ + "import torch\n", + "from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model\n", + "from transformers import AutoProcessor, BitsAndBytesConfig, Idefics3ForConditionalGeneration\n", + "\n", + "USE_LORA = False\n", + "USE_QLORA = True\n", + "SMOL = True\n", + "\n", + "model_id = \"HuggingFaceTB/SmolVLM-Instruct\" if SMOL else \"HuggingFaceM4/Idefics3-8B-Llama3\"\n", + "\n", + "processor = AutoProcessor.from_pretrained(\n", + " model_id\n", + ")\n", + "\n", + "if USE_QLORA or USE_LORA:\n", + " lora_config = LoraConfig(\n", + " r=8,\n", + " lora_alpha=8,\n", + " lora_dropout=0.1,\n", + " target_modules=['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj'],\n", + " use_dora=False if USE_QLORA else True,\n", + " init_lora_weights=\"gaussian\"\n", + " )\n", + " lora_config.inference_mode = False\n", + " if USE_QLORA:\n", + " bnb_config = BitsAndBytesConfig(\n", + " load_in_4bit=True,\n", + " bnb_4bit_use_double_quant=True,\n", + " bnb_4bit_quant_type=\"nf4\",\n", + " bnb_4bit_compute_dtype=torch.bfloat16\n", + " )\n", + "\n", + " model = Idefics3ForConditionalGeneration.from_pretrained(\n", + " model_id,\n", + " quantization_config=bnb_config if USE_QLORA else None,\n", + " _attn_implementation=\"flash_attention_2\",\n", + " device_map=\"auto\"\n", + " )\n", + " model.add_adapter(lora_config)\n", + " model.enable_adapters()\n", + " model = prepare_model_for_kbit_training(model)\n", + " model = get_peft_model(model, lora_config)\n", + " print(model.get_nb_trainable_parameters())\n", + "else:\n", + " model = Idefics3ForConditionalGeneration.from_pretrained(\n", + " model_id,\n", + " torch_dtype=torch.bfloat16,\n", + " _attn_implementation=\"flash_attention_2\",\n", + " ).to(DEVICE)\n", + "\n", + " # if you'd like to only fine-tune LLM\n", + " for param in model.model.vision_model.parameters():\n", + " param.requires_grad = False" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WIVhpp0EyZO2" + }, + "source": [ + "The model as is is holding 2.7 GB of GPU RAM πŸ’—" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LMTtg3dl3NX2" + }, + "source": [ + "##Β Loading the dataset and Preprocessing" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pWHMWTSZ3Pyr" + }, + "source": [ + "We will load a small portion of the VQAv2 dataset. We are loading a small portion of the model for education purposes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "POOqKqYRka5O" + }, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "ds = load_dataset('merve/vqav2-small', trust_remote_code=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Znf9vMo5rnSd" + }, + "outputs": [], + "source": [ + "split_ds = ds[\"validation\"].train_test_split(test_size=0.5)\n", + "train_ds = split_ds[\"train\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FIDioFlRuYYn", + "outputId": "79b697a7-d245-4fdc-b0e8-d9ffa8627953" + }, + "outputs": [], + "source": [ + "train_ds" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5nwMO3n0X7Hv" + }, + "source": [ + "Let's write our data collating function. We will apply prompt template to have questions and answers together so model can learn to answer. Then we pass the formatted prompts and images to the processor which processes both." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "e0krVLZ-wNMl" + }, + "outputs": [], + "source": [ + "image_token_id = processor.tokenizer.additional_special_tokens_ids[\n", + " processor.tokenizer.additional_special_tokens.index(\"\")]\n", + "\n", + "def collate_fn(examples):\n", + " texts = []\n", + " images = []\n", + " for example in examples:\n", + " image = example[\"image\"]\n", + " if image.mode != 'RGB':\n", + " image = image.convert('RGB')\n", + " question = example[\"question\"]\n", + " answer = example[\"multiple_choice_answer\"]\n", + " messages = [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"type\": \"text\", \"text\": \"Answer briefly.\"},\n", + " {\"type\": \"image\"},\n", + " {\"type\": \"text\", \"text\": question}\n", + " ]\n", + " },\n", + " {\n", + " \"role\": \"assistant\",\n", + " \"content\": [\n", + " {\"type\": \"text\", \"text\": answer}\n", + " ]\n", + " }\n", + " ]\n", + " text = processor.apply_chat_template(messages, add_generation_prompt=False)\n", + " texts.append(text.strip())\n", + " images.append([image])\n", + "\n", + " batch = processor(text=texts, images=images, return_tensors=\"pt\", padding=True)\n", + " labels = batch[\"input_ids\"].clone()\n", + " labels[labels == processor.tokenizer.pad_token_id] = -100\n", + " labels[labels == image_token_id] = -100\n", + " batch[\"labels\"] = labels\n", + "\n", + " return batch" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kEYDjWpE3LD5" + }, + "source": [ + "## Training" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QvAs896cdwg8" + }, + "source": [ + "We can now initialize `Trainer`Β and initialize `TrainingArguments`Β to pass to `Trainer`.\n", + "\n", + "Some notes:\n", + "- If you use 8-bit QLoRA with the below setup it uses around 16.4 GB VRAM (beautiful, fits comfortably inside L4, Colab free tier)\n", + "- We use gradient accumulation to simulate a larger batch size.\n", + "- We also save up on memory from intermediate activations by using gradient checkpointing.\n", + "\n", + "**Disclaimer:**\n", + "The techniques here aren't free lunch. 
The latter two will add additional compute to the training, thus slow down a bit (for reference on two A100s with bsz of 16, we were able to train for 2 hrs 43 mins with the gradient accumulation steps of 4, disabling it reduced it with 2 hr 35 mins).\n", + "If you want to speed-up, you might play around, reduce to 4-bit precision and have a higher batch size. Note that 4-bit might result in model learning less." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QNE2yWAYrAhD" + }, + "outputs": [], + "source": [ + "from transformers import TrainingArguments, Trainer\n", + "\n", + "model_name = model_id.split(\"/\")[-1]\n", + "\n", + "training_args = TrainingArguments(\n", + " num_train_epochs=1,\n", + " per_device_train_batch_size=4,\n", + " gradient_accumulation_steps=4,\n", + " warmup_steps=50,\n", + " learning_rate=1e-4,\n", + " weight_decay=0.01,\n", + " logging_steps=25,\n", + " save_strategy=\"steps\",\n", + " save_steps=250,\n", + " save_total_limit=1,\n", + " optim=\"paged_adamw_8bit\", # for 8-bit, keep this, else adamw_hf\n", + " bf16=True, #Β underlying precision for 8bit\n", + " output_dir=f\"./{model_name}-vqav2\",\n", + " hub_model_id=f\"{model_name}-vqav2\",\n", + " report_to=\"wandb\",\n", + " remove_unused_columns=False,\n", + " gradient_checkpointing=True\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oBBSDpBhreJd", + "outputId": "071ed677-1d9f-4f98-9d19-64834440c9c4" + }, + "outputs": [], + "source": [ + "trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " data_collator=collate_fn,\n", + " train_dataset=train_ds,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_QOCpw_-uYYo", + "outputId": "7abb6937-c072-435a-c3f5-6dbb5b0b9eea" + }, + "outputs": [], + "source": [ + "trainer.train()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0hN0QD9_uYYo" + }, + "outputs": [], + "source": [ + "trainer.push_to_hub()" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "A100", + "include_colab_link": true, + "name": "Smol_VLM_FT.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "zk0", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/lerobot/common/policies/smolvlm/Smol_VLM_lerobot.ipynb b/lerobot/common/policies/smolvlm/Smol_VLM_lerobot.ipynb new file mode 100644 index 00000000..84de0d93 --- /dev/null +++ b/lerobot/common/policies/smolvlm/Smol_VLM_lerobot.ipynb @@ -0,0 +1,499 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nc0g2NLpUSGr" + }, + "source": [ + "# Fine-tune SmolVLM on LeRobot Dataset using Consumer GPU with QLoRA\n", + "\n", + "In this notebook we will fine-tune SmolVLM Instruct on LeRobot PushT dataset. 
It is based on the SmolVLM VQA2 fine tuning, which is in turn based on Idefics3 model class/architecture.\n", + "\n", + "We will use some techniques in this notebook that will let you fine-tune the model on L4 with batch size of 4 only using around 16.4 GB of VRAM. We ran this notebook in that setup to test, but because we were able to afford A100 this notebook was last ran on an A100." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WIhA1lQ7j0kw" + }, + "outputs": [], + "source": [ + "!pip install -q accelerate datasets peft bitsandbytes tensorboard" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XyJaqZZ3uYYl" + }, + "outputs": [], + "source": [ + "!pip install -q flash-attn --no-build-isolation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wAeMA0heVBjT" + }, + "source": [ + "We will push out model to Hub so we need to authenticate ourselves." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yKd5xtSGj7cm" + }, + "outputs": [], + "source": [ + "from huggingface_hub import notebook_login\n", + "\n", + "notebook_login()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WRq8ve-LVAzU" + }, + "source": [ + "In this notebook we will not do full fine-tuning but use QLoRA method, which loads an adapter to the quantized version of the model, saving space. If you want to do full fine-tuning, set `USE_LORA` and `USE_QLORA` to False. If you want to do LoRA, set `USE_QLORA`Β to False and `USE_LORA`Β to True." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "referenced_widgets": [ + "23d3d175e6e642c7abc2bce09b73cf4d", + "db6ca8f47f274464b135909c907c946a", + "d05822c6293c424fbf9df6ec0f6b532b", + "05582fca18f443d6965776a721e30e9f", + "3d8974fd1ba9415c8070c1eab8ad75cb", + "648257c1b1c24e25a26355bddf75aa41", + "afa9a31c6b7f45e082ae07dea4a2600e", + "92232af543a4446cac53e4fcf3f4b6e1", + "a5f06e59634f4edf9f3d9409846a2b31", + "7ddfa8718bc24882ba2b50a899656107", + "5983728a1c1e43edb4d16bee6ad40171", + "dff574197f1f4466abb0eb46d36b8378" + ] + }, + "id": "b9CDMq0duYYn", + "outputId": "65a4a5fa-fe4d-4243-b2d7-405a8aa81c04" + }, + "outputs": [], + "source": [ + "import torch\n", + "from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model\n", + "from transformers import AutoProcessor, BitsAndBytesConfig, Idefics3ForConditionalGeneration\n", + "\n", + "USE_LORA = False\n", + "USE_QLORA = True\n", + "\n", + "model_id = \"HuggingFaceTB/SmolVLM-Instruct\"\n", + "\n", + "processor = AutoProcessor.from_pretrained(\n", + " model_id\n", + ")\n", + "\n", + "if USE_QLORA or USE_LORA:\n", + " lora_config = LoraConfig(\n", + " r=8,\n", + " lora_alpha=8,\n", + " lora_dropout=0.1,\n", + " target_modules=['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj'],\n", + " use_dora=False if USE_QLORA else True,\n", + " init_lora_weights=\"gaussian\"\n", + " )\n", + " lora_config.inference_mode = False\n", + " if USE_QLORA:\n", + " bnb_config = BitsAndBytesConfig(\n", + " load_in_4bit=True,\n", + " bnb_4bit_use_double_quant=True,\n", + " bnb_4bit_quant_type=\"nf4\",\n", + " bnb_4bit_compute_dtype=torch.bfloat16\n", + " )\n", + "\n", + " model = Idefics3ForConditionalGeneration.from_pretrained(\n", + " model_id,\n", + " quantization_config=bnb_config if USE_QLORA else None,\n", + " _attn_implementation=\"flash_attention_2\",\n", + " device_map=\"auto\"\n", + " 
)\n", + " model.add_adapter(lora_config)\n", + " model.enable_adapters()\n", + " model = prepare_model_for_kbit_training(model)\n", + " model = get_peft_model(model, lora_config)\n", + " print(model.get_nb_trainable_parameters())\n", + "else:\n", + " model = Idefics3ForConditionalGeneration.from_pretrained(\n", + " model_id,\n", + " torch_dtype=torch.bfloat16,\n", + " _attn_implementation=\"flash_attention_2\",\n", + " device_map=\"auto\" \n", + " )\n", + "\n", + " # if you'd like to only fine-tune LLM\n", + " for param in model.model.vision_model.parameters():\n", + " param.requires_grad = False" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WIVhpp0EyZO2" + }, + "source": [ + "The model as is is holding 2.7 GB of GPU RAM πŸ’—" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LMTtg3dl3NX2" + }, + "source": [ + "##Β Loading the dataset and Preprocessing" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pWHMWTSZ3Pyr" + }, + "source": [ + "We will load a small portion of the PushT dataset. We are loading a small portion of the model for education purposes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from pathlib import Path\n", + "\n", + "from lerobot.common.datasets.lerobot_dataset import LeRobotDataset\n", + "from lerobot.common.policies.diffusion.configuration_diffusion import DiffusionConfig\n", + "from lerobot.common.policies.diffusion.modeling_diffusion import DiffusionPolicy\n", + "\n", + "# Create a directory to store the training checkpoint.\n", + "output_directory = Path(\"outputs/train/example_pusht_diffusion\")\n", + "output_directory.mkdir(parents=True, exist_ok=True)\n", + "\n", + "# Number of offline training steps (we'll only do offline training for this example.)\n", + "# Adjust as you prefer. 5000 steps are needed to get something worth evaluating.\n", + "training_steps = 5000\n", + "device = torch.device(\"cuda\")\n", + "log_freq = 250\n", + "\n", + "# Set up the dataset.\n", + "delta_timestamps = {\n", + " # Load the previous image and state at -0.1 seconds before current frame,\n", + " # then load current image and state corresponding to 0.0 second.\n", + " \"observation.image\": [-0.1, 0.0],\n", + " \"observation.state\": [-0.1, 0.0],\n", + " # Load the previous action (-0.1), the next action to be executed (0.0),\n", + " # and 14 future actions with a 0.1 seconds spacing. 
All these actions will be\n", + " # used to supervise the policy.\n", + " \"action\": [-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4],\n", + "}\n", + "dataset = LeRobotDataset(\"lerobot/pusht\", delta_timestamps=delta_timestamps)\n", + "\n", + "# Set up the the policy.\n", + "# Policies are initialized with a configuration class, in this case `DiffusionConfig`.\n", + "# For this example, no arguments need to be passed because the defaults are set up for PushT.\n", + "# If you're doing something different, you will likely need to change at least some of the defaults.\n", + "# cfg = DiffusionConfig()\n", + "# policy = DiffusionPolicy(cfg, dataset_stats=dataset.meta.stats)\n", + "# policy.train()\n", + "# policy.to(device)\n", + "# optimizer = torch.optim.Adam(policy.parameters(), lr=1e-4)\n", + "\n", + "# Create dataloader for offline training.\n", + "dataloader = torch.utils.data.DataLoader(\n", + " dataset,\n", + " num_workers=4,\n", + " batch_size=64,\n", + " shuffle=True,\n", + " pin_memory=device != torch.device(\"cpu\"),\n", + " drop_last=True,\n", + ")\n", + "\n", + "# Since pusht is evaluated via gym, we will use all pre-recorded data for training\n", + "# and generate the test/eval data later\n", + "\n", + "train_ds = dataloader" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5nwMO3n0X7Hv" + }, + "source": [ + "Let's write our data collating function. We will apply prompt template to have questions and answers together so model can learn to answer. Then we pass the formatted prompts and images to the processor which processes both." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "e0krVLZ-wNMl" + }, + "outputs": [], + "source": [ + "image_token_id = processor.tokenizer.additional_special_tokens_ids[\n", + " processor.tokenizer.additional_special_tokens.index(\"\")]\n", + "\n", + "def collate_fn(examples):\n", + " \"\"\"\n", + " Transform lerobot v2.0 formatted pusht dataset samples to smolvlm input format.\n", + " pusht dataset format spec: \n", + " https://huggingface.co/datasets/lerobot/pusht/blob/main/meta/info.json\n", + " smolvlm dataset format spec: \n", + " https://huggingface.co/datasets/merve/vqav2-small based on https://huggingface.co/merve/idefics3llama-vqav2\n", + " \"\"\"\n", + " tasks = task = dataset.meta[\"tasks\"]\n", + " texts = []\n", + " images = []\n", + " for example in examples:\n", + " # image = example[\"image\"]\n", + " # if image.mode != 'RGB':\n", + " # image = image.convert('RGB')\n", + " # question = example[\"question\"]\n", + " # answer = example[\"multiple_choice_answer\"]\n", + " observation_image = example[\"observation.image\"]\n", + " observation_state = example[\"observation.state\"]\n", + " action = example[\"action\"]\n", + " timestamp = example[\"timestamp\"]\n", + " task = tasks[example[\"task_index\"]]\n", + "\n", + " messages = [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " # {\"type\": \"text\", \"text\": \"Answer briefly.\"},\n", + " # {\"type\": \"text\", \"text\": question},\n", + " {\"type\": \"text\", \"text\": \"Answer with a sequence of up to 14 robot action states in json format.\"},\n", + " {\"type\": \"text\", \"text\": task},\n", + " {\"type\": \"image\"},\n", + " {\"type\": \"text\", \"text\": 
previous_action_state},\n", + " {\"type\": \"image\"},\n", + " {\"type\": \"text\", \"text\": current_action_state},\n", + " \n", + " ]\n", + " },\n", + " {\n", + " \"role\": \"assistant\",\n", + " \"content\": [\n", + " # {\"type\": \"text\", \"text\": answer}\n", + " {\"type\": \"text\", \"text\": predicted_action_state_sequence}\n", + " ]\n", + " }\n", + " ]\n", + " text = processor.apply_chat_template(messages, add_generation_prompt=False)\n", + " texts.append(text.strip())\n", + " images.append([image])\n", + "\n", + " batch = processor(text=texts, images=images, return_tensors=\"pt\", padding=True)\n", + " labels = batch[\"input_ids\"].clone()\n", + " labels[labels == processor.tokenizer.pad_token_id] = -100\n", + " labels[labels == image_token_id] = -100\n", + " batch[\"labels\"] = labels\n", + "\n", + " return batch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kEYDjWpE3LD5" + }, + "source": [ + "## Training" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QvAs896cdwg8" + }, + "source": [ + "We can now initialize `Trainer`Β and initialize `TrainingArguments`Β to pass to `Trainer`.\n", + "\n", + "Some notes:\n", + "- If you use 8-bit QLoRA with the below setup it uses around 16.4 GB VRAM (beautiful, fits comfortably inside L4, Colab free tier)\n", + "- We use gradient accumulation to simulate a larger batch size.\n", + "- We also save up on memory from intermediate activations by using gradient checkpointing.\n", + "\n", + "**Disclaimer:**\n", + "The techniques here aren't free lunch. The latter two will add additional compute to the training, thus slow down a bit (for reference on two A100s with bsz of 16, we were able to train for 2 hrs 43 mins with the gradient accumulation steps of 4, disabling it reduced it with 2 hr 35 mins).\n", + "If you want to speed-up, you might play around, reduce to 4-bit precision and have a higher batch size. Note that 4-bit might result in model learning less." 
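The `collate_fn` defined earlier in this notebook references `previous_action_state`, `current_action_state`, `predicted_action_state_sequence` and `image` without defining them yet. A minimal sketch of one way to derive them from a `LeRobotDataset` sample, assuming a plain JSON serialization of the state/action tensors and that torchvision is available for tensor-to-PIL conversion (the exact text format is an assumption, not a confirmed design):

```python
import json

from torchvision.transforms.functional import to_pil_image


def serialize_example(example):
    """Sketch: build the text/image fields that collate_fn leaves undefined."""
    # With the delta_timestamps above, each sample carries two frames (-0.1 s, 0.0 s)
    # and 16 actions (previous, current, and 14 future ones).
    states = example["observation.state"]   # (2, state_dim)
    actions = example["action"]             # (16, action_dim)
    frames = example["observation.image"]   # (2, C, H, W), float in [0, 1]

    previous_action_state = json.dumps(
        {"state": states[0].tolist(), "action": actions[0].tolist()}
    )
    current_action_state = json.dumps(
        {"state": states[1].tolist(), "action": actions[1].tolist()}
    )
    # The 14 future actions become the JSON sequence the model is asked to generate.
    predicted_action_state_sequence = json.dumps(actions[2:].tolist())

    # Two PIL images, one per {"type": "image"} slot in the chat template.
    example_images = [to_pil_image(frames[0]), to_pil_image(frames[1])]
    return previous_action_state, current_action_state, predicted_action_state_sequence, example_images


def parse_predicted_actions(generated_text):
    """Sketch of the inverse transform for the gym evaluation mentioned above."""
    return json.loads(generated_text)
```

With helpers like these, the `images.append([image])` line in `collate_fn` would append the two-image list for the example instead, so the number of images matches the two image placeholders in the prompt.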
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QNE2yWAYrAhD" + }, + "outputs": [], + "source": [ + "from transformers import TrainingArguments, Trainer\n", + "\n", + "model_name = \"smolvlm-lerobot\"\n", + "\n", + "training_args = TrainingArguments(\n", + " num_train_epochs=1,\n", + " per_device_train_batch_size=4,\n", + " gradient_accumulation_steps=4,\n", + " warmup_steps=50,\n", + " learning_rate=1e-4,\n", + " weight_decay=0.01,\n", + " logging_steps=25,\n", + " save_strategy=\"steps\",\n", + " save_steps=250,\n", + " save_total_limit=1,\n", + " optim=\"paged_adamw_8bit\", # for 8-bit, keep this, else adamw_hf\n", + " bf16=True, #Β underlying precision for 8bit\n", + " output_dir=f\"./{model_name}-pusht\",\n", + " hub_model_id=f\"{model_name}-pusht\",\n", + " report_to=\"wandb\",\n", + " remove_unused_columns=False,\n", + " gradient_checkpointing=True\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oBBSDpBhreJd", + "outputId": "071ed677-1d9f-4f98-9d19-64834440c9c4" + }, + "outputs": [], + "source": [ + "trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " data_collator=collate_fn,\n", + " train_dataset=train_ds,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_QOCpw_-uYYo", + "outputId": "7abb6937-c072-435a-c3f5-6dbb5b0b9eea" + }, + "outputs": [], + "source": [ + "trainer.train()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0hN0QD9_uYYo" + }, + "outputs": [], + "source": [ + "trainer.push_to_hub()" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "A100", + "include_colab_link": true, + "name": "Smol_VLM_FT.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "zk0", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/lerobot/common/policies/smolvlm/requirements.txt b/lerobot/common/policies/smolvlm/requirements.txt new file mode 100644 index 00000000..f4153b89 --- /dev/null +++ b/lerobot/common/policies/smolvlm/requirements.txt @@ -0,0 +1,7 @@ +transformers +trl +peft +accelerate +datasets +wandb +bitsandbytes \ No newline at end of file From 0825e2e14c5937f5f98ae2a60bb339317cfb5ec9 Mon Sep 17 00:00:00 2001 From: ivelin Date: Mon, 27 Jan 2025 14:54:44 -0600 Subject: [PATCH 2/5] feat: add scan_motors.py tool for troubleshooting Signed-off-by: ivelin --- .gitignore | 3 ++ examples/10_use_so100.md | 46 ++++++++++++++++ lerobot/scripts/scan_motors.py | 97 ++++++++++++++++++++++++++++++++++ 3 files changed, 146 insertions(+) create mode 100644 lerobot/scripts/scan_motors.py diff --git a/.gitignore b/.gitignore index 0e203a39..79fdc517 100644 --- a/.gitignore +++ b/.gitignore @@ -153,3 +153,6 @@ dmypy.json # Cython debug symbols cython_debug/ + +# lerobot calibration cache +.cache diff --git a/examples/10_use_so100.md b/examples/10_use_so100.md index 155bbe51..57fa832d 100644 --- a/examples/10_use_so100.md +++ b/examples/10_use_so100.md @@ -140,10 +140,54 @@ Try to avoid rotating the motor while doing so to keep position 2048 set during Follow step 4 of the [assembly video](https://youtu.be/FioA2oeFZ5I?t=610). 
The first arm should take a bit more than 1 hour to assemble, but once you get use to it, you can do it under 1 hour for the second arm. + +### c. Troubleshooting + +Sometimes during assembly, the cables connecting the motors or the power adapter cable may be accidentally disconnected. To be sure that the motors are properly connected and functioning after assembly, use the scan_motors tool to test each arm. +The output should look similar to the example below. If the list of motor IDs is shorter than 6, there is probably a poorly connected cable or a motor is misconfigured. + +```bash +lerobot$ python lerobot/scripts/scan_motors.py --port /dev/ttyACM0 --brand feetech --model sts3215 +Connected on port /dev/ttyACM0 +Scanning all baudrates and motor indices +100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 9/9 [00:00<00:00, 43.11it/s] +6 motor ID(s) detected for baud rate 1000000. Motor IDs: [1, 2, 3, 4, 5, 6]. +Present Position 1997 +Offset 0 +Present Position 2098 +Offset 0 +Present Position 1124 +Offset 0 +Present Position 253 +Offset 0 +Present Position 3769 +Offset 0 +Present Position 926 +Offset 0 +Setting bus baud rate to 128000. Previously 1000000. +100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 9/9 [00:00<00:00, 14.42it/s] +Setting bus baud rate to 500000. Previously 128000. +100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 9/9 [00:00<00:00, 14.50it/s] +Setting bus baud rate to 115200. Previously 500000. 
+100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 9/9 [00:00<00:00, 14.40it/s] +Setting bus baud rate to 57600. Previously 115200. +100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 9/9 [00:00<00:00, 14.08it/s] +Setting bus baud rate to 38400. Previously 57600. +100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 9/9 [00:00<00:00, 13.88it/s] +Setting bus baud rate to 19200. Previously 38400. +100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 9/9 [00:00<00:00, 13.24it/s] +Setting bus baud rate to 250000. Previously 19200. 
+100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 9/9 [00:00<00:00, 14.47it/s] +Scan finished. +Disconnected from motor bus. +``` + + ## E. Calibrate Next, you'll need to calibrate your SO-100 robot to ensure that the leader and follower arms have the same position values when they are in the same physical position. This calibration is essential because it allows a neural network trained on one SO-100 robot to work on another. + #### a. Manual calibration of follower arm /!\ Contrarily to step 6 of the [assembly video](https://youtu.be/FioA2oeFZ5I?t=724) which illustrates the auto calibration, we will actually do manual calibration of follower for now. @@ -160,6 +204,7 @@ python lerobot/scripts/control_robot.py calibrate \ --robot-overrides '~cameras' --arms main_follower ``` + #### b. Manual calibration of leader arm Follow step 6 of the [assembly video](https://youtu.be/FioA2oeFZ5I?t=724) which illustrates the manual calibration. You will need to move the leader arm to these positions sequentially: @@ -174,6 +219,7 @@ python lerobot/scripts/control_robot.py calibrate \ --robot-overrides '~cameras' --arms main_leader ``` + ## F. Teleoperate **Simple teleop** diff --git a/lerobot/scripts/scan_motors.py b/lerobot/scripts/scan_motors.py new file mode 100644 index 00000000..498bf8e5 --- /dev/null +++ b/lerobot/scripts/scan_motors.py @@ -0,0 +1,97 @@ +""" +This script is helpful to diagnose issues with motors during calibration or later operation. +It scans all motors on a given port and displays their settings. + +Example of usage: +```bash +python lerobot/scripts/scan_motors.py \ + --port /dev/tty.usbmodem585A0080521 \ + --brand feetech \ + --model sts3215 \ +``` +""" + +import argparse +import time + + +def scan_motors(port, brand, model): + if brand == "feetech": + from lerobot.common.robot_devices.motors.feetech import MODEL_BAUDRATE_TABLE + from lerobot.common.robot_devices.motors.feetech import ( + SCS_SERIES_BAUDRATE_TABLE as SERIES_BAUDRATE_TABLE, + ) + from lerobot.common.robot_devices.motors.feetech import FeetechMotorsBus as MotorsBusClass + elif brand == "dynamixel": + from lerobot.common.robot_devices.motors.dynamixel import MODEL_BAUDRATE_TABLE + from lerobot.common.robot_devices.motors.dynamixel import ( + X_SERIES_BAUDRATE_TABLE as SERIES_BAUDRATE_TABLE, + ) + from lerobot.common.robot_devices.motors.dynamixel import DynamixelMotorsBus as MotorsBusClass + else: + raise ValueError( + f"Currently we do not support this motor brand: {brand}. We currently support feetech and dynamixel motors." + ) + + # Check if the provided model exists in the model_baud_rate_table + if model not in MODEL_BAUDRATE_TABLE: + raise ValueError( + f"Invalid model '{model}' for brand '{brand}'. 
Supported models: {list(MODEL_BAUDRATE_TABLE.keys())}" + ) + + # Setup motor names, indices, and models + motor_name = "motor" + motor_index_arbitrary = -1 # Use an arbitrary out of range motor ID + motor_model = model # Use the motor model passed via argument + + # Initialize the MotorBus with the correct port and motor configurations + motor_bus = MotorsBusClass(port=port, motors={motor_name: (motor_index_arbitrary, motor_model)}) + + # Try to connect to the motor bus and handle any connection-specific errors + try: + motor_bus.connect() + print(f"Connected on port {motor_bus.port}") + except OSError as e: + print(f"Error occurred when connecting to the motor bus: {e}") + return + + # Motor bus is connected, proceed with the rest of the operations + try: + print("Scanning all baudrates and motor indices") + all_baudrates = set(SERIES_BAUDRATE_TABLE.values()) + motors_detected = False + + for baudrate in all_baudrates: + motor_bus.set_bus_baudrate(baudrate) + present_ids = motor_bus.find_motor_indices(list(range(1, 10))) + if len(present_ids) > 0: + print(f"{len(present_ids)} motor ID(s) detected for baud rate {baudrate}. Motor IDs: {present_ids}.") + motors_detected = True + + for motor_idx in present_ids: + present_idx = motor_bus.read_with_motor_ids(motor_bus.motor_models, motor_idx, "ID", num_retry=2) + if present_idx != motor_idx: + raise OSError(f"Failed to access motor index {motor_idx}.") + + if brand == "feetech": + print("Present Position", motor_bus.read_with_motor_ids(motor_bus.motor_models, motor_idx, "Present_Position", num_retry=2)) + print("Offset", motor_bus.read_with_motor_ids(motor_bus.motor_models, motor_idx, "Offset", num_retry=2)) + + if not motors_detected: + print("No motors detected.") + + print("Scan finished.") + + finally: + motor_bus.disconnect() + print("Disconnected from motor bus.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--port", type=str, required=True, help="Motors bus port (e.g. dynamixel,feetech)") + parser.add_argument("--brand", type=str, required=True, help="Motor brand (e.g. dynamixel,feetech)") + parser.add_argument("--model", type=str, required=True, help="Motor model (e.g. 
xl330-m077,sts3215)") + args = parser.parse_args() + + scan_motors(args.port, args.brand, args.model) From 843c0996976d3f8f3f086bedfb38c072a73ec643 Mon Sep 17 00:00:00 2001 From: ivelin Date: Mon, 27 Jan 2025 15:09:51 -0600 Subject: [PATCH 3/5] fix: remove work in progress files from branch Signed-off-by: ivelin --- .../common/policies/smolvlm/Smol_VLM_FT.ipynb | 413 --------------- .../policies/smolvlm/Smol_VLM_lerobot.ipynb | 499 ------------------ .../common/policies/smolvlm/requirements.txt | 7 - 3 files changed, 919 deletions(-) delete mode 100644 lerobot/common/policies/smolvlm/Smol_VLM_FT.ipynb delete mode 100644 lerobot/common/policies/smolvlm/Smol_VLM_lerobot.ipynb delete mode 100644 lerobot/common/policies/smolvlm/requirements.txt diff --git a/lerobot/common/policies/smolvlm/Smol_VLM_FT.ipynb b/lerobot/common/policies/smolvlm/Smol_VLM_FT.ipynb deleted file mode 100644 index 93907ef1..00000000 --- a/lerobot/common/policies/smolvlm/Smol_VLM_FT.ipynb +++ /dev/null @@ -1,413 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "view-in-github" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nc0g2NLpUSGr" - }, - "source": [ - "# Fine-tune SmolVLM on Visual Question Answering using Consumer GPU with QLoRA\n", - "\n", - "In this notebook we will fine-tune SmolVLM VQAv2 dataset. With this notebook you can also fine-tune Idefics3, since both models have the same model class/architecture.\n", - "\n", - "We will use some techniques in this notebook that will let you fine-tune the model on L4 with batch size of 4 only using around 16.4 GB of VRAM. We ran this notebook in that setup to test, but because we were able to afford A100 this notebook was last ran on an A100." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WIhA1lQ7j0kw" - }, - "outputs": [], - "source": [ - "!pip install -q accelerate datasets peft bitsandbytes tensorboard" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XyJaqZZ3uYYl" - }, - "outputs": [], - "source": [ - "!pip install -q flash-attn --no-build-isolation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wAeMA0heVBjT" - }, - "source": [ - "We will push out model to Hub so we need to authenticate ourselves." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "yKd5xtSGj7cm" - }, - "outputs": [], - "source": [ - "from huggingface_hub import notebook_login\n", - "\n", - "notebook_login()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WRq8ve-LVAzU" - }, - "source": [ - "In this notebook we will not do full fine-tuning but use QLoRA method, which loads an adapter to the quantized version of the model, saving space. If you want to do full fine-tuning, set `USE_LORA` and `USE_QLORA` to False. If you want to do LoRA, set `USE_QLORA`Β to False and `USE_LORA`Β to True." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "referenced_widgets": [ - "23d3d175e6e642c7abc2bce09b73cf4d", - "db6ca8f47f274464b135909c907c946a", - "d05822c6293c424fbf9df6ec0f6b532b", - "05582fca18f443d6965776a721e30e9f", - "3d8974fd1ba9415c8070c1eab8ad75cb", - "648257c1b1c24e25a26355bddf75aa41", - "afa9a31c6b7f45e082ae07dea4a2600e", - "92232af543a4446cac53e4fcf3f4b6e1", - "a5f06e59634f4edf9f3d9409846a2b31", - "7ddfa8718bc24882ba2b50a899656107", - "5983728a1c1e43edb4d16bee6ad40171", - "dff574197f1f4466abb0eb46d36b8378" - ] - }, - "id": "b9CDMq0duYYn", - "outputId": "65a4a5fa-fe4d-4243-b2d7-405a8aa81c04" - }, - "outputs": [], - "source": [ - "import torch\n", - "from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model\n", - "from transformers import AutoProcessor, BitsAndBytesConfig, Idefics3ForConditionalGeneration\n", - "\n", - "USE_LORA = False\n", - "USE_QLORA = True\n", - "SMOL = True\n", - "\n", - "model_id = \"HuggingFaceTB/SmolVLM-Instruct\" if SMOL else \"HuggingFaceM4/Idefics3-8B-Llama3\"\n", - "\n", - "processor = AutoProcessor.from_pretrained(\n", - " model_id\n", - ")\n", - "\n", - "if USE_QLORA or USE_LORA:\n", - " lora_config = LoraConfig(\n", - " r=8,\n", - " lora_alpha=8,\n", - " lora_dropout=0.1,\n", - " target_modules=['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj'],\n", - " use_dora=False if USE_QLORA else True,\n", - " init_lora_weights=\"gaussian\"\n", - " )\n", - " lora_config.inference_mode = False\n", - " if USE_QLORA:\n", - " bnb_config = BitsAndBytesConfig(\n", - " load_in_4bit=True,\n", - " bnb_4bit_use_double_quant=True,\n", - " bnb_4bit_quant_type=\"nf4\",\n", - " bnb_4bit_compute_dtype=torch.bfloat16\n", - " )\n", - "\n", - " model = Idefics3ForConditionalGeneration.from_pretrained(\n", - " model_id,\n", - " quantization_config=bnb_config if USE_QLORA else None,\n", - " _attn_implementation=\"flash_attention_2\",\n", - " device_map=\"auto\"\n", - " )\n", - " model.add_adapter(lora_config)\n", - " model.enable_adapters()\n", - " model = prepare_model_for_kbit_training(model)\n", - " model = get_peft_model(model, lora_config)\n", - " print(model.get_nb_trainable_parameters())\n", - "else:\n", - " model = Idefics3ForConditionalGeneration.from_pretrained(\n", - " model_id,\n", - " torch_dtype=torch.bfloat16,\n", - " _attn_implementation=\"flash_attention_2\",\n", - " ).to(DEVICE)\n", - "\n", - " # if you'd like to only fine-tune LLM\n", - " for param in model.model.vision_model.parameters():\n", - " param.requires_grad = False" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WIVhpp0EyZO2" - }, - "source": [ - "The model as is is holding 2.7 GB of GPU RAM πŸ’—" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LMTtg3dl3NX2" - }, - "source": [ - "##Β Loading the dataset and Preprocessing" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pWHMWTSZ3Pyr" - }, - "source": [ - "We will load a small portion of the VQAv2 dataset. We are loading a small portion of the model for education purposes." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "POOqKqYRka5O" - }, - "outputs": [], - "source": [ - "from datasets import load_dataset\n", - "ds = load_dataset('merve/vqav2-small', trust_remote_code=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Znf9vMo5rnSd" - }, - "outputs": [], - "source": [ - "split_ds = ds[\"validation\"].train_test_split(test_size=0.5)\n", - "train_ds = split_ds[\"train\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "FIDioFlRuYYn", - "outputId": "79b697a7-d245-4fdc-b0e8-d9ffa8627953" - }, - "outputs": [], - "source": [ - "train_ds" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5nwMO3n0X7Hv" - }, - "source": [ - "Let's write our data collating function. We will apply prompt template to have questions and answers together so model can learn to answer. Then we pass the formatted prompts and images to the processor which processes both." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "e0krVLZ-wNMl" - }, - "outputs": [], - "source": [ - "image_token_id = processor.tokenizer.additional_special_tokens_ids[\n", - " processor.tokenizer.additional_special_tokens.index(\"\")]\n", - "\n", - "def collate_fn(examples):\n", - " texts = []\n", - " images = []\n", - " for example in examples:\n", - " image = example[\"image\"]\n", - " if image.mode != 'RGB':\n", - " image = image.convert('RGB')\n", - " question = example[\"question\"]\n", - " answer = example[\"multiple_choice_answer\"]\n", - " messages = [\n", - " {\n", - " \"role\": \"user\",\n", - " \"content\": [\n", - " {\"type\": \"text\", \"text\": \"Answer briefly.\"},\n", - " {\"type\": \"image\"},\n", - " {\"type\": \"text\", \"text\": question}\n", - " ]\n", - " },\n", - " {\n", - " \"role\": \"assistant\",\n", - " \"content\": [\n", - " {\"type\": \"text\", \"text\": answer}\n", - " ]\n", - " }\n", - " ]\n", - " text = processor.apply_chat_template(messages, add_generation_prompt=False)\n", - " texts.append(text.strip())\n", - " images.append([image])\n", - "\n", - " batch = processor(text=texts, images=images, return_tensors=\"pt\", padding=True)\n", - " labels = batch[\"input_ids\"].clone()\n", - " labels[labels == processor.tokenizer.pad_token_id] = -100\n", - " labels[labels == image_token_id] = -100\n", - " batch[\"labels\"] = labels\n", - "\n", - " return batch" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kEYDjWpE3LD5" - }, - "source": [ - "## Training" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QvAs896cdwg8" - }, - "source": [ - "We can now initialize `Trainer`Β and initialize `TrainingArguments`Β to pass to `Trainer`.\n", - "\n", - "Some notes:\n", - "- If you use 8-bit QLoRA with the below setup it uses around 16.4 GB VRAM (beautiful, fits comfortably inside L4, Colab free tier)\n", - "- We use gradient accumulation to simulate a larger batch size.\n", - "- We also save up on memory from intermediate activations by using gradient checkpointing.\n", - "\n", - "**Disclaimer:**\n", - "The techniques here aren't free lunch. 
The latter two will add additional compute to the training, thus slow down a bit (for reference on two A100s with bsz of 16, we were able to train for 2 hrs 43 mins with the gradient accumulation steps of 4, disabling it reduced it with 2 hr 35 mins).\n", - "If you want to speed-up, you might play around, reduce to 4-bit precision and have a higher batch size. Note that 4-bit might result in model learning less." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QNE2yWAYrAhD" - }, - "outputs": [], - "source": [ - "from transformers import TrainingArguments, Trainer\n", - "\n", - "model_name = model_id.split(\"/\")[-1]\n", - "\n", - "training_args = TrainingArguments(\n", - " num_train_epochs=1,\n", - " per_device_train_batch_size=4,\n", - " gradient_accumulation_steps=4,\n", - " warmup_steps=50,\n", - " learning_rate=1e-4,\n", - " weight_decay=0.01,\n", - " logging_steps=25,\n", - " save_strategy=\"steps\",\n", - " save_steps=250,\n", - " save_total_limit=1,\n", - " optim=\"paged_adamw_8bit\", # for 8-bit, keep this, else adamw_hf\n", - " bf16=True, #Β underlying precision for 8bit\n", - " output_dir=f\"./{model_name}-vqav2\",\n", - " hub_model_id=f\"{model_name}-vqav2\",\n", - " report_to=\"wandb\",\n", - " remove_unused_columns=False,\n", - " gradient_checkpointing=True\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "oBBSDpBhreJd", - "outputId": "071ed677-1d9f-4f98-9d19-64834440c9c4" - }, - "outputs": [], - "source": [ - "trainer = Trainer(\n", - " model=model,\n", - " args=training_args,\n", - " data_collator=collate_fn,\n", - " train_dataset=train_ds,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_QOCpw_-uYYo", - "outputId": "7abb6937-c072-435a-c3f5-6dbb5b0b9eea" - }, - "outputs": [], - "source": [ - "trainer.train()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0hN0QD9_uYYo" - }, - "outputs": [], - "source": [ - "trainer.push_to_hub()" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "A100", - "include_colab_link": true, - "name": "Smol_VLM_FT.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "zk0", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/lerobot/common/policies/smolvlm/Smol_VLM_lerobot.ipynb b/lerobot/common/policies/smolvlm/Smol_VLM_lerobot.ipynb deleted file mode 100644 index 84de0d93..00000000 --- a/lerobot/common/policies/smolvlm/Smol_VLM_lerobot.ipynb +++ /dev/null @@ -1,499 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "view-in-github" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nc0g2NLpUSGr" - }, - "source": [ - "# Fine-tune SmolVLM on LeRobot Dataset using Consumer GPU with QLoRA\n", - "\n", - "In this notebook we will fine-tune SmolVLM Instruct on LeRobot PushT dataset. 
It is based on the SmolVLM VQA2 fine tuning, which is in turn based on Idefics3 model class/architecture.\n", - "\n", - "We will use some techniques in this notebook that will let you fine-tune the model on L4 with batch size of 4 only using around 16.4 GB of VRAM. We ran this notebook in that setup to test, but because we were able to afford A100 this notebook was last ran on an A100." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WIhA1lQ7j0kw" - }, - "outputs": [], - "source": [ - "!pip install -q accelerate datasets peft bitsandbytes tensorboard" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XyJaqZZ3uYYl" - }, - "outputs": [], - "source": [ - "!pip install -q flash-attn --no-build-isolation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wAeMA0heVBjT" - }, - "source": [ - "We will push out model to Hub so we need to authenticate ourselves." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "yKd5xtSGj7cm" - }, - "outputs": [], - "source": [ - "from huggingface_hub import notebook_login\n", - "\n", - "notebook_login()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WRq8ve-LVAzU" - }, - "source": [ - "In this notebook we will not do full fine-tuning but use QLoRA method, which loads an adapter to the quantized version of the model, saving space. If you want to do full fine-tuning, set `USE_LORA` and `USE_QLORA` to False. If you want to do LoRA, set `USE_QLORA`Β to False and `USE_LORA`Β to True." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "referenced_widgets": [ - "23d3d175e6e642c7abc2bce09b73cf4d", - "db6ca8f47f274464b135909c907c946a", - "d05822c6293c424fbf9df6ec0f6b532b", - "05582fca18f443d6965776a721e30e9f", - "3d8974fd1ba9415c8070c1eab8ad75cb", - "648257c1b1c24e25a26355bddf75aa41", - "afa9a31c6b7f45e082ae07dea4a2600e", - "92232af543a4446cac53e4fcf3f4b6e1", - "a5f06e59634f4edf9f3d9409846a2b31", - "7ddfa8718bc24882ba2b50a899656107", - "5983728a1c1e43edb4d16bee6ad40171", - "dff574197f1f4466abb0eb46d36b8378" - ] - }, - "id": "b9CDMq0duYYn", - "outputId": "65a4a5fa-fe4d-4243-b2d7-405a8aa81c04" - }, - "outputs": [], - "source": [ - "import torch\n", - "from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model\n", - "from transformers import AutoProcessor, BitsAndBytesConfig, Idefics3ForConditionalGeneration\n", - "\n", - "USE_LORA = False\n", - "USE_QLORA = True\n", - "\n", - "model_id = \"HuggingFaceTB/SmolVLM-Instruct\"\n", - "\n", - "processor = AutoProcessor.from_pretrained(\n", - " model_id\n", - ")\n", - "\n", - "if USE_QLORA or USE_LORA:\n", - " lora_config = LoraConfig(\n", - " r=8,\n", - " lora_alpha=8,\n", - " lora_dropout=0.1,\n", - " target_modules=['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj'],\n", - " use_dora=False if USE_QLORA else True,\n", - " init_lora_weights=\"gaussian\"\n", - " )\n", - " lora_config.inference_mode = False\n", - " if USE_QLORA:\n", - " bnb_config = BitsAndBytesConfig(\n", - " load_in_4bit=True,\n", - " bnb_4bit_use_double_quant=True,\n", - " bnb_4bit_quant_type=\"nf4\",\n", - " bnb_4bit_compute_dtype=torch.bfloat16\n", - " )\n", - "\n", - " model = Idefics3ForConditionalGeneration.from_pretrained(\n", - " model_id,\n", - " quantization_config=bnb_config if USE_QLORA else None,\n", - " _attn_implementation=\"flash_attention_2\",\n", - " device_map=\"auto\"\n", - " 
)\n", - " model.add_adapter(lora_config)\n", - " model.enable_adapters()\n", - " model = prepare_model_for_kbit_training(model)\n", - " model = get_peft_model(model, lora_config)\n", - " print(model.get_nb_trainable_parameters())\n", - "else:\n", - " model = Idefics3ForConditionalGeneration.from_pretrained(\n", - " model_id,\n", - " torch_dtype=torch.bfloat16,\n", - " _attn_implementation=\"flash_attention_2\",\n", - " device_map=\"auto\" \n", - " )\n", - "\n", - " # if you'd like to only fine-tune LLM\n", - " for param in model.model.vision_model.parameters():\n", - " param.requires_grad = False" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WIVhpp0EyZO2" - }, - "source": [ - "The model as is is holding 2.7 GB of GPU RAM πŸ’—" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LMTtg3dl3NX2" - }, - "source": [ - "##Β Loading the dataset and Preprocessing" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pWHMWTSZ3Pyr" - }, - "source": [ - "We will load a small portion of the PushT dataset. We are loading a small portion of the model for education purposes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "from pathlib import Path\n", - "\n", - "from lerobot.common.datasets.lerobot_dataset import LeRobotDataset\n", - "from lerobot.common.policies.diffusion.configuration_diffusion import DiffusionConfig\n", - "from lerobot.common.policies.diffusion.modeling_diffusion import DiffusionPolicy\n", - "\n", - "# Create a directory to store the training checkpoint.\n", - "output_directory = Path(\"outputs/train/example_pusht_diffusion\")\n", - "output_directory.mkdir(parents=True, exist_ok=True)\n", - "\n", - "# Number of offline training steps (we'll only do offline training for this example.)\n", - "# Adjust as you prefer. 5000 steps are needed to get something worth evaluating.\n", - "training_steps = 5000\n", - "device = torch.device(\"cuda\")\n", - "log_freq = 250\n", - "\n", - "# Set up the dataset.\n", - "delta_timestamps = {\n", - " # Load the previous image and state at -0.1 seconds before current frame,\n", - " # then load current image and state corresponding to 0.0 second.\n", - " \"observation.image\": [-0.1, 0.0],\n", - " \"observation.state\": [-0.1, 0.0],\n", - " # Load the previous action (-0.1), the next action to be executed (0.0),\n", - " # and 14 future actions with a 0.1 seconds spacing. 
All these actions will be\n",
-    "    # used to supervise the policy.\n",
-    "    \"action\": [-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4],\n",
-    "}\n",
-    "dataset = LeRobotDataset(\"lerobot/pusht\", delta_timestamps=delta_timestamps)\n",
-    "\n",
-    "# Set up the policy.\n",
-    "# Policies are initialized with a configuration class, in this case `DiffusionConfig`.\n",
-    "# For this example, no arguments need to be passed because the defaults are set up for PushT.\n",
-    "# If you're doing something different, you will likely need to change at least some of the defaults.\n",
-    "# cfg = DiffusionConfig()\n",
-    "# policy = DiffusionPolicy(cfg, dataset_stats=dataset.meta.stats)\n",
-    "# policy.train()\n",
-    "# policy.to(device)\n",
-    "# optimizer = torch.optim.Adam(policy.parameters(), lr=1e-4)\n",
-    "\n",
-    "# Create dataloader for offline training.\n",
-    "dataloader = torch.utils.data.DataLoader(\n",
-    "    dataset,\n",
-    "    num_workers=4,\n",
-    "    batch_size=64,\n",
-    "    shuffle=True,\n",
-    "    pin_memory=device != torch.device(\"cpu\"),\n",
-    "    drop_last=True,\n",
-    ")\n",
-    "\n",
-    "# Since pusht is evaluated via gym, we will use all pre-recorded data for training\n",
-    "# and generate the test/eval data later.\n",
-    "\n",
-    "train_ds = dataloader"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "5nwMO3n0X7Hv"
-   },
-   "source": [
-    "Let's write our data collating function. We apply the chat template so that the prompt and the target answer appear together, which lets the model learn to answer. Then we pass the formatted prompts and images to the processor, which processes both."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "e0krVLZ-wNMl"
-   },
-   "outputs": [],
-   "source": [
-    "import json\n",
-    "\n",
-    "image_token_id = processor.tokenizer.additional_special_tokens_ids[\n",
-    "    processor.tokenizer.additional_special_tokens.index(\"<image>\")]\n",
-    "\n",
-    "def collate_fn(examples):\n",
-    "    \"\"\"\n",
-    "    Transform lerobot v2.0 formatted pusht dataset samples to smolvlm input format.\n",
-    "    pusht dataset format spec:\n",
-    "      https://huggingface.co/datasets/lerobot/pusht/blob/main/meta/info.json\n",
-    "    smolvlm dataset format spec:\n",
-    "      https://huggingface.co/datasets/merve/vqav2-small based on https://huggingface.co/merve/idefics3llama-vqav2\n",
-    "    \"\"\"\n",
-    "    tasks = dataset.meta[\"tasks\"]\n",
-    "    texts = []\n",
-    "    images = []\n",
-    "    for example in examples:\n",
-    "        # Each observation key holds two frames (t-0.1s and t=0) because of the\n",
-    "        # delta_timestamps configured above.\n",
-    "        previous_image, current_image = example[\"observation.image\"]\n",
-    "        previous_state, current_state = example[\"observation.state\"]\n",
-    "        # Encode the states and the supervised action sequence as JSON text.\n",
-    "        # Note: this text encoding is an illustrative choice for this prototype, not a fixed spec.\n",
-    "        previous_action_state = json.dumps(previous_state.tolist())\n",
-    "        current_action_state = json.dumps(current_state.tolist())\n",
-    "        predicted_action_state_sequence = json.dumps(example[\"action\"].tolist())\n",
-    "        task = tasks[example[\"task_index\"]]\n",
-    "\n",
-    "        messages = [\n",
-    "            {\n",
-    "                \"role\": \"user\",\n",
-    "                \"content\": [\n",
-    "                    {\"type\": \"text\", \"text\": \"Answer with a sequence of up to 14 robot action states in json format.\"},\n",
-    "                    {\"type\": \"text\", \"text\": task},\n",
-    "                    {\"type\": \"image\"},\n",
-    "                    {\"type\": \"text\", \"text\": previous_action_state},\n",
-    "                    {\"type\": \"image\"},\n",
-    "                    {\"type\": \"text\", \"text\": current_action_state},\n",
-    "                ]\n",
-    "            },\n",
-    "            {\n",
-    "                \"role\": \"assistant\",\n",
-    "                \"content\": [\n",
-    "                    {\"type\": \"text\", \"text\": predicted_action_state_sequence}\n",
-    "                ]\n",
-    "            }\n",
-    "        ]\n",
-    "        text = processor.apply_chat_template(messages, add_generation_prompt=False)\n",
-    "        texts.append(text.strip())\n",
-    "        # One image per {\"type\": \"image\"} placeholder above: the previous and the current frame.\n",
-    "        # The frames are float tensors here; depending on the processor version they may need\n",
-    "        # conversion to PIL images or uint8 arrays first.\n",
-    "        images.append([previous_image, current_image])\n",
-    "\n",
-    "    batch = processor(text=texts, images=images, return_tensors=\"pt\", padding=True)\n",
-    "    labels = batch[\"input_ids\"].clone()\n",
-    "    labels[labels == processor.tokenizer.pad_token_id] = -100\n",
-    "    labels[labels == image_token_id] = -100\n",
-    "    batch[\"labels\"] = labels\n",
-    "\n",
-    "    return batch"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "kEYDjWpE3LD5"
-   },
-   "source": [
-    "## Training"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "QvAs896cdwg8"
-   },
-   "source": [
-    "We can now set up `TrainingArguments` and pass them to the `Trainer`.\n",
-    "\n",
-    "Some notes:\n",
-    "- If you use QLoRA with the setup below, it uses around 16.4 GB of VRAM (beautiful, it fits comfortably inside an L4, Colab free tier).\n",
-    "- We use gradient accumulation to simulate a larger batch size.\n",
-    "- We also save memory on intermediate activations by using gradient checkpointing.\n",
-    "\n",
-    "**Disclaimer:**\n",
-    "The techniques here aren't a free lunch. The latter two add extra compute to the training and thus slow it down a bit (for reference, on two A100s with a batch size of 16, training took 2 hrs 43 mins with gradient accumulation steps of 4, and 2 hrs 35 mins with it disabled).\n",
-    "If you want to speed things up, you can experiment with 4-bit precision and a higher batch size. Note that 4-bit might result in the model learning less."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QNE2yWAYrAhD" - }, - "outputs": [], - "source": [ - "from transformers import TrainingArguments, Trainer\n", - "\n", - "model_name = \"smolvlm-lerobot\"\n", - "\n", - "training_args = TrainingArguments(\n", - " num_train_epochs=1,\n", - " per_device_train_batch_size=4,\n", - " gradient_accumulation_steps=4,\n", - " warmup_steps=50,\n", - " learning_rate=1e-4,\n", - " weight_decay=0.01,\n", - " logging_steps=25,\n", - " save_strategy=\"steps\",\n", - " save_steps=250,\n", - " save_total_limit=1,\n", - " optim=\"paged_adamw_8bit\", # for 8-bit, keep this, else adamw_hf\n", - " bf16=True, #Β underlying precision for 8bit\n", - " output_dir=f\"./{model_name}-pusht\",\n", - " hub_model_id=f\"{model_name}-pusht\",\n", - " report_to=\"wandb\",\n", - " remove_unused_columns=False,\n", - " gradient_checkpointing=True\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "oBBSDpBhreJd", - "outputId": "071ed677-1d9f-4f98-9d19-64834440c9c4" - }, - "outputs": [], - "source": [ - "trainer = Trainer(\n", - " model=model,\n", - " args=training_args,\n", - " data_collator=collate_fn,\n", - " train_dataset=train_ds,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_QOCpw_-uYYo", - "outputId": "7abb6937-c072-435a-c3f5-6dbb5b0b9eea" - }, - "outputs": [], - "source": [ - "trainer.train()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0hN0QD9_uYYo" - }, - "outputs": [], - "source": [ - "trainer.push_to_hub()" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "A100", - "include_colab_link": true, - "name": "Smol_VLM_FT.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "zk0", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/lerobot/common/policies/smolvlm/requirements.txt b/lerobot/common/policies/smolvlm/requirements.txt deleted file mode 100644 index f4153b89..00000000 --- a/lerobot/common/policies/smolvlm/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -transformers -trl -peft -accelerate -datasets -wandb -bitsandbytes \ No newline at end of file From 292037247680e800ed465894febbde58aa6175ac Mon Sep 17 00:00:00 2001 From: ivelin Date: Mon, 27 Jan 2025 15:17:19 -0600 Subject: [PATCH 4/5] docs: explain how to determine which cable is disconnected Signed-off-by: ivelin --- examples/10_use_so100.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/10_use_so100.md b/examples/10_use_so100.md index 57fa832d..be331456 100644 --- a/examples/10_use_so100.md +++ b/examples/10_use_so100.md @@ -144,7 +144,7 @@ Follow step 4 of the [assembly video](https://youtu.be/FioA2oeFZ5I?t=610). The f ### c. Troubleshooting Sometimes during assembly, the cables connecting the motors or the power adapter cable may be accidentally disconnected. To be sure that the motors are properly connected and functioning after assembly, use the scan_motors tool to test each arm. -The output should look similar to the example below. If the list of motor IDs is shorter than 6, there is probably a poorly connected cable or a motor is misconfigured. 
+The output should look similar to the example below. If the list of motor IDs is shorter than 6, there is probably a poorly connected cable or a motor is misconfigured. For example if only motor IDs [1,3] show up, that indicates that the serial cable between motor 3 and 4 is disconnected. ```bash lerobot$ python lerobot/scripts/scan_motors.py --port /dev/ttyACM0 --brand feetech --model sts3215 From 1a4f0983b7d16e0b65a7185f0ed58421df0a1fa8 Mon Sep 17 00:00:00 2001 From: ivelin Date: Tue, 28 Jan 2025 08:33:13 -0600 Subject: [PATCH 5/5] docs: add calibrate troubleshooting sub-section for shaft out of position problem Signed-off-by: ivelin --- examples/10_use_so100.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/examples/10_use_so100.md b/examples/10_use_so100.md index be331456..7f33860b 100644 --- a/examples/10_use_so100.md +++ b/examples/10_use_so100.md @@ -204,7 +204,6 @@ python lerobot/scripts/control_robot.py calibrate \ --robot-overrides '~cameras' --arms main_follower ``` - #### b. Manual calibration of leader arm Follow step 6 of the [assembly video](https://youtu.be/FioA2oeFZ5I?t=724) which illustrates the manual calibration. You will need to move the leader arm to these positions sequentially: @@ -220,6 +219,19 @@ python lerobot/scripts/control_robot.py calibrate \ ``` +### c. Troubleshooting + +Another known issue during calibration is related to the positioning of the motor shafts. The error message looks like this: + +``` +Calibration is done! Saving calibration file '.cache/calibration/moss/main_leader.json' +Activating torque on main follower arm. +Wrong motor position range detected for gripper. Expected to be in nominal range of [0, 100] % (a full linear translation), with a maximum range of [-10, 110] % to account for some imprecision during calibration, but present value is 143.7950897216797 %. This might be due to a cable connection issue creating an artificial jump in motor values. You need to recalibrate ... +``` + +When all joints are mounted and the robot arm is in resting position (as shown in the calibration photos) each motor shaft should be in approximately middle state ~2048 of its full [0,4096] range. You can use the `scan_motors.py` tool again as shown above to check Present Position for each motor. Motors that are too far off the mid-value in rested arm position need to be repositioned by unscrewing the attached 3D printed part and rotating it such that when screwed back on and returned to resting position, its Present Position reading is about mid-range. + + ## F. Teleoperate **Simple teleop**