From 4fb01aef68d1de62a1ceb62eb9ee567262024959 Mon Sep 17 00:00:00 2001 From: ivelin Date: Mon, 23 Dec 2024 15:01:43 -0600 Subject: [PATCH 1/5] feat: smolvlm model with pusht train notebook Signed-off-by: ivelin --- .../common/policies/smolvlm/Smol_VLM_FT.ipynb | 413 +++++++++++++++ .../policies/smolvlm/Smol_VLM_lerobot.ipynb | 499 ++++++++++++++++++ .../common/policies/smolvlm/requirements.txt | 7 + 3 files changed, 919 insertions(+) create mode 100644 lerobot/common/policies/smolvlm/Smol_VLM_FT.ipynb create mode 100644 lerobot/common/policies/smolvlm/Smol_VLM_lerobot.ipynb create mode 100644 lerobot/common/policies/smolvlm/requirements.txt diff --git a/lerobot/common/policies/smolvlm/Smol_VLM_FT.ipynb b/lerobot/common/policies/smolvlm/Smol_VLM_FT.ipynb new file mode 100644 index 00000000..93907ef1 --- /dev/null +++ b/lerobot/common/policies/smolvlm/Smol_VLM_FT.ipynb @@ -0,0 +1,413 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nc0g2NLpUSGr" + }, + "source": [ + "# Fine-tune SmolVLM on Visual Question Answering using Consumer GPU with QLoRA\n", + "\n", + "In this notebook we will fine-tune SmolVLM VQAv2 dataset. With this notebook you can also fine-tune Idefics3, since both models have the same model class/architecture.\n", + "\n", + "We will use some techniques in this notebook that will let you fine-tune the model on L4 with batch size of 4 only using around 16.4 GB of VRAM. We ran this notebook in that setup to test, but because we were able to afford A100 this notebook was last ran on an A100." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WIhA1lQ7j0kw" + }, + "outputs": [], + "source": [ + "!pip install -q accelerate datasets peft bitsandbytes tensorboard" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XyJaqZZ3uYYl" + }, + "outputs": [], + "source": [ + "!pip install -q flash-attn --no-build-isolation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wAeMA0heVBjT" + }, + "source": [ + "We will push out model to Hub so we need to authenticate ourselves." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yKd5xtSGj7cm" + }, + "outputs": [], + "source": [ + "from huggingface_hub import notebook_login\n", + "\n", + "notebook_login()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WRq8ve-LVAzU" + }, + "source": [ + "In this notebook we will not do full fine-tuning but use QLoRA method, which loads an adapter to the quantized version of the model, saving space. If you want to do full fine-tuning, set `USE_LORA` and `USE_QLORA` to False. If you want to do LoRA, set `USE_QLORA`Β to False and `USE_LORA`Β to True." 
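Note: if you set both `USE_LORA` and `USE_QLORA` to False, the full fine-tuning branch in the model-loading cell below moves the model with `.to(DEVICE)`, but `DEVICE` is never defined in this notebook. A minimal sketch of the missing definition, assuming a single CUDA device (an editorial assumption, not part of the original cell):

```python
import torch

# Assumed definition, adjust to your setup; needed only for the full
# fine-tuning branch, which calls .to(DEVICE).
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
```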
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "referenced_widgets": [ + "23d3d175e6e642c7abc2bce09b73cf4d", + "db6ca8f47f274464b135909c907c946a", + "d05822c6293c424fbf9df6ec0f6b532b", + "05582fca18f443d6965776a721e30e9f", + "3d8974fd1ba9415c8070c1eab8ad75cb", + "648257c1b1c24e25a26355bddf75aa41", + "afa9a31c6b7f45e082ae07dea4a2600e", + "92232af543a4446cac53e4fcf3f4b6e1", + "a5f06e59634f4edf9f3d9409846a2b31", + "7ddfa8718bc24882ba2b50a899656107", + "5983728a1c1e43edb4d16bee6ad40171", + "dff574197f1f4466abb0eb46d36b8378" + ] + }, + "id": "b9CDMq0duYYn", + "outputId": "65a4a5fa-fe4d-4243-b2d7-405a8aa81c04" + }, + "outputs": [], + "source": [ + "import torch\n", + "from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model\n", + "from transformers import AutoProcessor, BitsAndBytesConfig, Idefics3ForConditionalGeneration\n", + "\n", + "USE_LORA = False\n", + "USE_QLORA = True\n", + "SMOL = True\n", + "\n", + "model_id = \"HuggingFaceTB/SmolVLM-Instruct\" if SMOL else \"HuggingFaceM4/Idefics3-8B-Llama3\"\n", + "\n", + "processor = AutoProcessor.from_pretrained(\n", + " model_id\n", + ")\n", + "\n", + "if USE_QLORA or USE_LORA:\n", + " lora_config = LoraConfig(\n", + " r=8,\n", + " lora_alpha=8,\n", + " lora_dropout=0.1,\n", + " target_modules=['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj'],\n", + " use_dora=False if USE_QLORA else True,\n", + " init_lora_weights=\"gaussian\"\n", + " )\n", + " lora_config.inference_mode = False\n", + " if USE_QLORA:\n", + " bnb_config = BitsAndBytesConfig(\n", + " load_in_4bit=True,\n", + " bnb_4bit_use_double_quant=True,\n", + " bnb_4bit_quant_type=\"nf4\",\n", + " bnb_4bit_compute_dtype=torch.bfloat16\n", + " )\n", + "\n", + " model = Idefics3ForConditionalGeneration.from_pretrained(\n", + " model_id,\n", + " quantization_config=bnb_config if USE_QLORA else None,\n", + " _attn_implementation=\"flash_attention_2\",\n", + " device_map=\"auto\"\n", + " )\n", + " model.add_adapter(lora_config)\n", + " model.enable_adapters()\n", + " model = prepare_model_for_kbit_training(model)\n", + " model = get_peft_model(model, lora_config)\n", + " print(model.get_nb_trainable_parameters())\n", + "else:\n", + " model = Idefics3ForConditionalGeneration.from_pretrained(\n", + " model_id,\n", + " torch_dtype=torch.bfloat16,\n", + " _attn_implementation=\"flash_attention_2\",\n", + " ).to(DEVICE)\n", + "\n", + " # if you'd like to only fine-tune LLM\n", + " for param in model.model.vision_model.parameters():\n", + " param.requires_grad = False" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WIVhpp0EyZO2" + }, + "source": [ + "The model as is is holding 2.7 GB of GPU RAM πŸ’—" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LMTtg3dl3NX2" + }, + "source": [ + "##Β Loading the dataset and Preprocessing" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pWHMWTSZ3Pyr" + }, + "source": [ + "We will load a small portion of the VQAv2 dataset. We are loading a small portion of the model for education purposes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "POOqKqYRka5O" + }, + "outputs": [], + "source": [ + "from datasets import load_dataset\n", + "ds = load_dataset('merve/vqav2-small', trust_remote_code=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Znf9vMo5rnSd" + }, + "outputs": [], + "source": [ + "split_ds = ds[\"validation\"].train_test_split(test_size=0.5)\n", + "train_ds = split_ds[\"train\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FIDioFlRuYYn", + "outputId": "79b697a7-d245-4fdc-b0e8-d9ffa8627953" + }, + "outputs": [], + "source": [ + "train_ds" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5nwMO3n0X7Hv" + }, + "source": [ + "Let's write our data collating function. We will apply prompt template to have questions and answers together so model can learn to answer. Then we pass the formatted prompts and images to the processor which processes both." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "e0krVLZ-wNMl" + }, + "outputs": [], + "source": [ + "image_token_id = processor.tokenizer.additional_special_tokens_ids[\n", + " processor.tokenizer.additional_special_tokens.index(\"\")]\n", + "\n", + "def collate_fn(examples):\n", + " texts = []\n", + " images = []\n", + " for example in examples:\n", + " image = example[\"image\"]\n", + " if image.mode != 'RGB':\n", + " image = image.convert('RGB')\n", + " question = example[\"question\"]\n", + " answer = example[\"multiple_choice_answer\"]\n", + " messages = [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\"type\": \"text\", \"text\": \"Answer briefly.\"},\n", + " {\"type\": \"image\"},\n", + " {\"type\": \"text\", \"text\": question}\n", + " ]\n", + " },\n", + " {\n", + " \"role\": \"assistant\",\n", + " \"content\": [\n", + " {\"type\": \"text\", \"text\": answer}\n", + " ]\n", + " }\n", + " ]\n", + " text = processor.apply_chat_template(messages, add_generation_prompt=False)\n", + " texts.append(text.strip())\n", + " images.append([image])\n", + "\n", + " batch = processor(text=texts, images=images, return_tensors=\"pt\", padding=True)\n", + " labels = batch[\"input_ids\"].clone()\n", + " labels[labels == processor.tokenizer.pad_token_id] = -100\n", + " labels[labels == image_token_id] = -100\n", + " batch[\"labels\"] = labels\n", + "\n", + " return batch" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kEYDjWpE3LD5" + }, + "source": [ + "## Training" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QvAs896cdwg8" + }, + "source": [ + "We can now initialize `Trainer`Β and initialize `TrainingArguments`Β to pass to `Trainer`.\n", + "\n", + "Some notes:\n", + "- If you use 8-bit QLoRA with the below setup it uses around 16.4 GB VRAM (beautiful, fits comfortably inside L4, Colab free tier)\n", + "- We use gradient accumulation to simulate a larger batch size.\n", + "- We also save up on memory from intermediate activations by using gradient checkpointing.\n", + "\n", + "**Disclaimer:**\n", + "The techniques here aren't free lunch. 
The latter two will add additional compute to the training, thus slow down a bit (for reference on two A100s with bsz of 16, we were able to train for 2 hrs 43 mins with the gradient accumulation steps of 4, disabling it reduced it with 2 hr 35 mins).\n", + "If you want to speed-up, you might play around, reduce to 4-bit precision and have a higher batch size. Note that 4-bit might result in model learning less." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QNE2yWAYrAhD" + }, + "outputs": [], + "source": [ + "from transformers import TrainingArguments, Trainer\n", + "\n", + "model_name = model_id.split(\"/\")[-1]\n", + "\n", + "training_args = TrainingArguments(\n", + " num_train_epochs=1,\n", + " per_device_train_batch_size=4,\n", + " gradient_accumulation_steps=4,\n", + " warmup_steps=50,\n", + " learning_rate=1e-4,\n", + " weight_decay=0.01,\n", + " logging_steps=25,\n", + " save_strategy=\"steps\",\n", + " save_steps=250,\n", + " save_total_limit=1,\n", + " optim=\"paged_adamw_8bit\", # for 8-bit, keep this, else adamw_hf\n", + " bf16=True, #Β underlying precision for 8bit\n", + " output_dir=f\"./{model_name}-vqav2\",\n", + " hub_model_id=f\"{model_name}-vqav2\",\n", + " report_to=\"wandb\",\n", + " remove_unused_columns=False,\n", + " gradient_checkpointing=True\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oBBSDpBhreJd", + "outputId": "071ed677-1d9f-4f98-9d19-64834440c9c4" + }, + "outputs": [], + "source": [ + "trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " data_collator=collate_fn,\n", + " train_dataset=train_ds,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_QOCpw_-uYYo", + "outputId": "7abb6937-c072-435a-c3f5-6dbb5b0b9eea" + }, + "outputs": [], + "source": [ + "trainer.train()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0hN0QD9_uYYo" + }, + "outputs": [], + "source": [ + "trainer.push_to_hub()" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "A100", + "include_colab_link": true, + "name": "Smol_VLM_FT.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "zk0", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/lerobot/common/policies/smolvlm/Smol_VLM_lerobot.ipynb b/lerobot/common/policies/smolvlm/Smol_VLM_lerobot.ipynb new file mode 100644 index 00000000..84de0d93 --- /dev/null +++ b/lerobot/common/policies/smolvlm/Smol_VLM_lerobot.ipynb @@ -0,0 +1,499 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "view-in-github" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nc0g2NLpUSGr" + }, + "source": [ + "# Fine-tune SmolVLM on LeRobot Dataset using Consumer GPU with QLoRA\n", + "\n", + "In this notebook we will fine-tune SmolVLM Instruct on LeRobot PushT dataset. 
It is based on the SmolVLM VQA2 fine tuning, which is in turn based on Idefics3 model class/architecture.\n", + "\n", + "We will use some techniques in this notebook that will let you fine-tune the model on L4 with batch size of 4 only using around 16.4 GB of VRAM. We ran this notebook in that setup to test, but because we were able to afford A100 this notebook was last ran on an A100." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WIhA1lQ7j0kw" + }, + "outputs": [], + "source": [ + "!pip install -q accelerate datasets peft bitsandbytes tensorboard" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XyJaqZZ3uYYl" + }, + "outputs": [], + "source": [ + "!pip install -q flash-attn --no-build-isolation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wAeMA0heVBjT" + }, + "source": [ + "We will push out model to Hub so we need to authenticate ourselves." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yKd5xtSGj7cm" + }, + "outputs": [], + "source": [ + "from huggingface_hub import notebook_login\n", + "\n", + "notebook_login()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WRq8ve-LVAzU" + }, + "source": [ + "In this notebook we will not do full fine-tuning but use QLoRA method, which loads an adapter to the quantized version of the model, saving space. If you want to do full fine-tuning, set `USE_LORA` and `USE_QLORA` to False. If you want to do LoRA, set `USE_QLORA`Β to False and `USE_LORA`Β to True." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "referenced_widgets": [ + "23d3d175e6e642c7abc2bce09b73cf4d", + "db6ca8f47f274464b135909c907c946a", + "d05822c6293c424fbf9df6ec0f6b532b", + "05582fca18f443d6965776a721e30e9f", + "3d8974fd1ba9415c8070c1eab8ad75cb", + "648257c1b1c24e25a26355bddf75aa41", + "afa9a31c6b7f45e082ae07dea4a2600e", + "92232af543a4446cac53e4fcf3f4b6e1", + "a5f06e59634f4edf9f3d9409846a2b31", + "7ddfa8718bc24882ba2b50a899656107", + "5983728a1c1e43edb4d16bee6ad40171", + "dff574197f1f4466abb0eb46d36b8378" + ] + }, + "id": "b9CDMq0duYYn", + "outputId": "65a4a5fa-fe4d-4243-b2d7-405a8aa81c04" + }, + "outputs": [], + "source": [ + "import torch\n", + "from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model\n", + "from transformers import AutoProcessor, BitsAndBytesConfig, Idefics3ForConditionalGeneration\n", + "\n", + "USE_LORA = False\n", + "USE_QLORA = True\n", + "\n", + "model_id = \"HuggingFaceTB/SmolVLM-Instruct\"\n", + "\n", + "processor = AutoProcessor.from_pretrained(\n", + " model_id\n", + ")\n", + "\n", + "if USE_QLORA or USE_LORA:\n", + " lora_config = LoraConfig(\n", + " r=8,\n", + " lora_alpha=8,\n", + " lora_dropout=0.1,\n", + " target_modules=['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj'],\n", + " use_dora=False if USE_QLORA else True,\n", + " init_lora_weights=\"gaussian\"\n", + " )\n", + " lora_config.inference_mode = False\n", + " if USE_QLORA:\n", + " bnb_config = BitsAndBytesConfig(\n", + " load_in_4bit=True,\n", + " bnb_4bit_use_double_quant=True,\n", + " bnb_4bit_quant_type=\"nf4\",\n", + " bnb_4bit_compute_dtype=torch.bfloat16\n", + " )\n", + "\n", + " model = Idefics3ForConditionalGeneration.from_pretrained(\n", + " model_id,\n", + " quantization_config=bnb_config if USE_QLORA else None,\n", + " _attn_implementation=\"flash_attention_2\",\n", + " device_map=\"auto\"\n", + " 
)\n", + " model.add_adapter(lora_config)\n", + " model.enable_adapters()\n", + " model = prepare_model_for_kbit_training(model)\n", + " model = get_peft_model(model, lora_config)\n", + " print(model.get_nb_trainable_parameters())\n", + "else:\n", + " model = Idefics3ForConditionalGeneration.from_pretrained(\n", + " model_id,\n", + " torch_dtype=torch.bfloat16,\n", + " _attn_implementation=\"flash_attention_2\",\n", + " device_map=\"auto\" \n", + " )\n", + "\n", + " # if you'd like to only fine-tune LLM\n", + " for param in model.model.vision_model.parameters():\n", + " param.requires_grad = False" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WIVhpp0EyZO2" + }, + "source": [ + "The model as is is holding 2.7 GB of GPU RAM πŸ’—" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LMTtg3dl3NX2" + }, + "source": [ + "##Β Loading the dataset and Preprocessing" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pWHMWTSZ3Pyr" + }, + "source": [ + "We will load a small portion of the PushT dataset. We are loading a small portion of the model for education purposes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from pathlib import Path\n", + "\n", + "from lerobot.common.datasets.lerobot_dataset import LeRobotDataset\n", + "from lerobot.common.policies.diffusion.configuration_diffusion import DiffusionConfig\n", + "from lerobot.common.policies.diffusion.modeling_diffusion import DiffusionPolicy\n", + "\n", + "# Create a directory to store the training checkpoint.\n", + "output_directory = Path(\"outputs/train/example_pusht_diffusion\")\n", + "output_directory.mkdir(parents=True, exist_ok=True)\n", + "\n", + "# Number of offline training steps (we'll only do offline training for this example.)\n", + "# Adjust as you prefer. 5000 steps are needed to get something worth evaluating.\n", + "training_steps = 5000\n", + "device = torch.device(\"cuda\")\n", + "log_freq = 250\n", + "\n", + "# Set up the dataset.\n", + "delta_timestamps = {\n", + " # Load the previous image and state at -0.1 seconds before current frame,\n", + " # then load current image and state corresponding to 0.0 second.\n", + " \"observation.image\": [-0.1, 0.0],\n", + " \"observation.state\": [-0.1, 0.0],\n", + " # Load the previous action (-0.1), the next action to be executed (0.0),\n", + " # and 14 future actions with a 0.1 seconds spacing. 
All these actions will be\n", + " # used to supervise the policy.\n", + " \"action\": [-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4],\n", + "}\n", + "dataset = LeRobotDataset(\"lerobot/pusht\", delta_timestamps=delta_timestamps)\n", + "\n", + "# Set up the the policy.\n", + "# Policies are initialized with a configuration class, in this case `DiffusionConfig`.\n", + "# For this example, no arguments need to be passed because the defaults are set up for PushT.\n", + "# If you're doing something different, you will likely need to change at least some of the defaults.\n", + "# cfg = DiffusionConfig()\n", + "# policy = DiffusionPolicy(cfg, dataset_stats=dataset.meta.stats)\n", + "# policy.train()\n", + "# policy.to(device)\n", + "# optimizer = torch.optim.Adam(policy.parameters(), lr=1e-4)\n", + "\n", + "# Create dataloader for offline training.\n", + "dataloader = torch.utils.data.DataLoader(\n", + " dataset,\n", + " num_workers=4,\n", + " batch_size=64,\n", + " shuffle=True,\n", + " pin_memory=device != torch.device(\"cpu\"),\n", + " drop_last=True,\n", + ")\n", + "\n", + "# Since pusht is evaluated via gym, we will use all pre-recorded data for training\n", + "# and generate the test/eval data later\n", + "\n", + "train_ds = dataloader" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5nwMO3n0X7Hv" + }, + "source": [ + "Let's write our data collating function. We will apply prompt template to have questions and answers together so model can learn to answer. Then we pass the formatted prompts and images to the processor which processes both." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "e0krVLZ-wNMl" + }, + "outputs": [], + "source": [ + "image_token_id = processor.tokenizer.additional_special_tokens_ids[\n", + " processor.tokenizer.additional_special_tokens.index(\"\")]\n", + "\n", + "def collate_fn(examples):\n", + " \"\"\"\n", + " Transform lerobot v2.0 formatted pusht dataset samples to smolvlm input format.\n", + " pusht dataset format spec: \n", + " https://huggingface.co/datasets/lerobot/pusht/blob/main/meta/info.json\n", + " smolvlm dataset format spec: \n", + " https://huggingface.co/datasets/merve/vqav2-small based on https://huggingface.co/merve/idefics3llama-vqav2\n", + " \"\"\"\n", + " tasks = task = dataset.meta[\"tasks\"]\n", + " texts = []\n", + " images = []\n", + " for example in examples:\n", + " # image = example[\"image\"]\n", + " # if image.mode != 'RGB':\n", + " # image = image.convert('RGB')\n", + " # question = example[\"question\"]\n", + " # answer = example[\"multiple_choice_answer\"]\n", + " observation_image = example[\"observation.image\"]\n", + " observation_state = example[\"observation.state\"]\n", + " action = example[\"action\"]\n", + " timestamp = example[\"timestamp\"]\n", + " task = tasks[example[\"task_index\"]]\n", + "\n", + " messages = [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " # {\"type\": \"text\", \"text\": \"Answer briefly.\"},\n", + " # {\"type\": \"text\", \"text\": question},\n", + " {\"type\": \"text\", \"text\": \"Answer with a sequence of up to 14 robot action states in json format.\"},\n", + " {\"type\": \"text\", \"text\": task},\n", + " {\"type\": \"image\"},\n", + " {\"type\": \"text\", \"text\": 
previous_action_state},\n", + " {\"type\": \"image\"},\n", + " {\"type\": \"text\", \"text\": current_action_state},\n", + " \n", + " ]\n", + " },\n", + " {\n", + " \"role\": \"assistant\",\n", + " \"content\": [\n", + " # {\"type\": \"text\", \"text\": answer}\n", + " {\"type\": \"text\", \"text\": predicted_action_state_sequence}\n", + " ]\n", + " }\n", + " ]\n", + " text = processor.apply_chat_template(messages, add_generation_prompt=False)\n", + " texts.append(text.strip())\n", + " images.append([image])\n", + "\n", + " batch = processor(text=texts, images=images, return_tensors=\"pt\", padding=True)\n", + " labels = batch[\"input_ids\"].clone()\n", + " labels[labels == processor.tokenizer.pad_token_id] = -100\n", + " labels[labels == image_token_id] = -100\n", + " batch[\"labels\"] = labels\n", + "\n", + " return batch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kEYDjWpE3LD5" + }, + "source": [ + "## Training" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "QvAs896cdwg8" + }, + "source": [ + "We can now initialize `Trainer`Β and initialize `TrainingArguments`Β to pass to `Trainer`.\n", + "\n", + "Some notes:\n", + "- If you use 8-bit QLoRA with the below setup it uses around 16.4 GB VRAM (beautiful, fits comfortably inside L4, Colab free tier)\n", + "- We use gradient accumulation to simulate a larger batch size.\n", + "- We also save up on memory from intermediate activations by using gradient checkpointing.\n", + "\n", + "**Disclaimer:**\n", + "The techniques here aren't free lunch. The latter two will add additional compute to the training, thus slow down a bit (for reference on two A100s with bsz of 16, we were able to train for 2 hrs 43 mins with the gradient accumulation steps of 4, disabling it reduced it with 2 hr 35 mins).\n", + "If you want to speed-up, you might play around, reduce to 4-bit precision and have a higher batch size. Note that 4-bit might result in model learning less." 
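The `collate_fn` defined earlier in this notebook references `previous_action_state`, `current_action_state`, `predicted_action_state_sequence` and `image` without defining them yet. A minimal sketch of one way to derive them from a `LeRobotDataset` sample, assuming a plain JSON serialization of the state/action tensors and that torchvision is available for tensor-to-PIL conversion (the exact text format is an assumption, not a confirmed design):

```python
import json

from torchvision.transforms.functional import to_pil_image


def serialize_example(example):
    """Sketch: build the text/image fields that collate_fn leaves undefined."""
    # With the delta_timestamps above, each sample carries two frames (-0.1 s, 0.0 s)
    # and 16 actions (previous, current, and 14 future ones).
    states = example["observation.state"]   # (2, state_dim)
    actions = example["action"]             # (16, action_dim)
    frames = example["observation.image"]   # (2, C, H, W), float in [0, 1]

    previous_action_state = json.dumps(
        {"state": states[0].tolist(), "action": actions[0].tolist()}
    )
    current_action_state = json.dumps(
        {"state": states[1].tolist(), "action": actions[1].tolist()}
    )
    # The 14 future actions become the JSON sequence the model is asked to generate.
    predicted_action_state_sequence = json.dumps(actions[2:].tolist())

    # Two PIL images, one per {"type": "image"} slot in the chat template.
    example_images = [to_pil_image(frames[0]), to_pil_image(frames[1])]
    return previous_action_state, current_action_state, predicted_action_state_sequence, example_images


def parse_predicted_actions(generated_text):
    """Sketch of the inverse transform for the gym evaluation mentioned above."""
    return json.loads(generated_text)
```

With helpers like these, the `images.append([image])` line in `collate_fn` would append the two-image list for the example instead, so the number of images matches the two image placeholders in the prompt.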
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QNE2yWAYrAhD" + }, + "outputs": [], + "source": [ + "from transformers import TrainingArguments, Trainer\n", + "\n", + "model_name = \"smolvlm-lerobot\"\n", + "\n", + "training_args = TrainingArguments(\n", + " num_train_epochs=1,\n", + " per_device_train_batch_size=4,\n", + " gradient_accumulation_steps=4,\n", + " warmup_steps=50,\n", + " learning_rate=1e-4,\n", + " weight_decay=0.01,\n", + " logging_steps=25,\n", + " save_strategy=\"steps\",\n", + " save_steps=250,\n", + " save_total_limit=1,\n", + " optim=\"paged_adamw_8bit\", # for 8-bit, keep this, else adamw_hf\n", + " bf16=True, #Β underlying precision for 8bit\n", + " output_dir=f\"./{model_name}-pusht\",\n", + " hub_model_id=f\"{model_name}-pusht\",\n", + " report_to=\"wandb\",\n", + " remove_unused_columns=False,\n", + " gradient_checkpointing=True\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "oBBSDpBhreJd", + "outputId": "071ed677-1d9f-4f98-9d19-64834440c9c4" + }, + "outputs": [], + "source": [ + "trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " data_collator=collate_fn,\n", + " train_dataset=train_ds,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_QOCpw_-uYYo", + "outputId": "7abb6937-c072-435a-c3f5-6dbb5b0b9eea" + }, + "outputs": [], + "source": [ + "trainer.train()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0hN0QD9_uYYo" + }, + "outputs": [], + "source": [ + "trainer.push_to_hub()" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "A100", + "include_colab_link": true, + "name": "Smol_VLM_FT.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "zk0", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/lerobot/common/policies/smolvlm/requirements.txt b/lerobot/common/policies/smolvlm/requirements.txt new file mode 100644 index 00000000..f4153b89 --- /dev/null +++ b/lerobot/common/policies/smolvlm/requirements.txt @@ -0,0 +1,7 @@ +transformers +trl +peft +accelerate +datasets +wandb +bitsandbytes \ No newline at end of file From 0825e2e14c5937f5f98ae2a60bb339317cfb5ec9 Mon Sep 17 00:00:00 2001 From: ivelin Date: Mon, 27 Jan 2025 14:54:44 -0600 Subject: [PATCH 2/5] feat: add scan_motors.py tool for troubleshooting Signed-off-by: ivelin --- .gitignore | 3 ++ examples/10_use_so100.md | 46 ++++++++++++++++ lerobot/scripts/scan_motors.py | 97 ++++++++++++++++++++++++++++++++++ 3 files changed, 146 insertions(+) create mode 100644 lerobot/scripts/scan_motors.py diff --git a/.gitignore b/.gitignore index 0e203a39..79fdc517 100644 --- a/.gitignore +++ b/.gitignore @@ -153,3 +153,6 @@ dmypy.json # Cython debug symbols cython_debug/ + +# lerobot calibration cache +.cache diff --git a/examples/10_use_so100.md b/examples/10_use_so100.md index 155bbe51..57fa832d 100644 --- a/examples/10_use_so100.md +++ b/examples/10_use_so100.md @@ -140,10 +140,54 @@ Try to avoid rotating the motor while doing so to keep position 2048 set during Follow step 4 of the [assembly video](https://youtu.be/FioA2oeFZ5I?t=610). 
The first arm should take a bit more than 1 hour to assemble, but once you get use to it, you can do it under 1 hour for the second arm. + +### c. Troubleshooting + +Sometimes during assembly, the cables connecting the motors or the power adapter cable may be accidentally disconnected. To be sure that the motors are properly connected and functioning after assembly, use the scan_motors tool to test each arm. +The output should look similar to the example below. If the list of motor IDs is shorter than 6, there is probably a poorly connected cable or a motor is misconfigured. + +```bash +lerobot$ python lerobot/scripts/scan_motors.py --port /dev/ttyACM0 --brand feetech --model sts3215 +Connected on port /dev/ttyACM0 +Scanning all baudrates and motor indices +100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 9/9 [00:00<00:00, 43.11it/s] +6 motor ID(s) detected for baud rate 1000000. Motor IDs: [1, 2, 3, 4, 5, 6]. +Present Position 1997 +Offset 0 +Present Position 2098 +Offset 0 +Present Position 1124 +Offset 0 +Present Position 253 +Offset 0 +Present Position 3769 +Offset 0 +Present Position 926 +Offset 0 +Setting bus baud rate to 128000. Previously 1000000. +100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 9/9 [00:00<00:00, 14.42it/s] +Setting bus baud rate to 500000. Previously 128000. +100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 9/9 [00:00<00:00, 14.50it/s] +Setting bus baud rate to 115200. Previously 500000. 
+100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 9/9 [00:00<00:00, 14.40it/s] +Setting bus baud rate to 57600. Previously 115200. +100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 9/9 [00:00<00:00, 14.08it/s] +Setting bus baud rate to 38400. Previously 57600. +100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 9/9 [00:00<00:00, 13.88it/s] +Setting bus baud rate to 19200. Previously 38400. +100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 9/9 [00:00<00:00, 13.24it/s] +Setting bus baud rate to 250000. Previously 19200. 
+100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 9/9 [00:00<00:00, 14.47it/s] +Scan finished. +Disconnected from motor bus. +``` + + ## E. Calibrate Next, you'll need to calibrate your SO-100 robot to ensure that the leader and follower arms have the same position values when they are in the same physical position. This calibration is essential because it allows a neural network trained on one SO-100 robot to work on another. + #### a. Manual calibration of follower arm /!\ Contrarily to step 6 of the [assembly video](https://youtu.be/FioA2oeFZ5I?t=724) which illustrates the auto calibration, we will actually do manual calibration of follower for now. @@ -160,6 +204,7 @@ python lerobot/scripts/control_robot.py calibrate \ --robot-overrides '~cameras' --arms main_follower ``` + #### b. Manual calibration of leader arm Follow step 6 of the [assembly video](https://youtu.be/FioA2oeFZ5I?t=724) which illustrates the manual calibration. You will need to move the leader arm to these positions sequentially: @@ -174,6 +219,7 @@ python lerobot/scripts/control_robot.py calibrate \ --robot-overrides '~cameras' --arms main_leader ``` + ## F. Teleoperate **Simple teleop** diff --git a/lerobot/scripts/scan_motors.py b/lerobot/scripts/scan_motors.py new file mode 100644 index 00000000..498bf8e5 --- /dev/null +++ b/lerobot/scripts/scan_motors.py @@ -0,0 +1,97 @@ +""" +This script is helpful to diagnose issues with motors during calibration or later operation. +It scans all motors on a given port and displays their settings. + +Example of usage: +```bash +python lerobot/scripts/scan_motors.py \ + --port /dev/tty.usbmodem585A0080521 \ + --brand feetech \ + --model sts3215 \ +``` +""" + +import argparse +import time + + +def scan_motors(port, brand, model): + if brand == "feetech": + from lerobot.common.robot_devices.motors.feetech import MODEL_BAUDRATE_TABLE + from lerobot.common.robot_devices.motors.feetech import ( + SCS_SERIES_BAUDRATE_TABLE as SERIES_BAUDRATE_TABLE, + ) + from lerobot.common.robot_devices.motors.feetech import FeetechMotorsBus as MotorsBusClass + elif brand == "dynamixel": + from lerobot.common.robot_devices.motors.dynamixel import MODEL_BAUDRATE_TABLE + from lerobot.common.robot_devices.motors.dynamixel import ( + X_SERIES_BAUDRATE_TABLE as SERIES_BAUDRATE_TABLE, + ) + from lerobot.common.robot_devices.motors.dynamixel import DynamixelMotorsBus as MotorsBusClass + else: + raise ValueError( + f"Currently we do not support this motor brand: {brand}. We currently support feetech and dynamixel motors." + ) + + # Check if the provided model exists in the model_baud_rate_table + if model not in MODEL_BAUDRATE_TABLE: + raise ValueError( + f"Invalid model '{model}' for brand '{brand}'. 
Supported models: {list(MODEL_BAUDRATE_TABLE.keys())}" + ) + + # Setup motor names, indices, and models + motor_name = "motor" + motor_index_arbitrary = -1 # Use an arbitrary out of range motor ID + motor_model = model # Use the motor model passed via argument + + # Initialize the MotorBus with the correct port and motor configurations + motor_bus = MotorsBusClass(port=port, motors={motor_name: (motor_index_arbitrary, motor_model)}) + + # Try to connect to the motor bus and handle any connection-specific errors + try: + motor_bus.connect() + print(f"Connected on port {motor_bus.port}") + except OSError as e: + print(f"Error occurred when connecting to the motor bus: {e}") + return + + # Motor bus is connected, proceed with the rest of the operations + try: + print("Scanning all baudrates and motor indices") + all_baudrates = set(SERIES_BAUDRATE_TABLE.values()) + motors_detected = False + + for baudrate in all_baudrates: + motor_bus.set_bus_baudrate(baudrate) + present_ids = motor_bus.find_motor_indices(list(range(1, 10))) + if len(present_ids) > 0: + print(f"{len(present_ids)} motor ID(s) detected for baud rate {baudrate}. Motor IDs: {present_ids}.") + motors_detected = True + + for motor_idx in present_ids: + present_idx = motor_bus.read_with_motor_ids(motor_bus.motor_models, motor_idx, "ID", num_retry=2) + if present_idx != motor_idx: + raise OSError(f"Failed to access motor index {motor_idx}.") + + if brand == "feetech": + print("Present Position", motor_bus.read_with_motor_ids(motor_bus.motor_models, motor_idx, "Present_Position", num_retry=2)) + print("Offset", motor_bus.read_with_motor_ids(motor_bus.motor_models, motor_idx, "Offset", num_retry=2)) + + if not motors_detected: + print("No motors detected.") + + print("Scan finished.") + + finally: + motor_bus.disconnect() + print("Disconnected from motor bus.") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--port", type=str, required=True, help="Motors bus port (e.g. dynamixel,feetech)") + parser.add_argument("--brand", type=str, required=True, help="Motor brand (e.g. dynamixel,feetech)") + parser.add_argument("--model", type=str, required=True, help="Motor model (e.g. 
xl330-m077,sts3215)") + args = parser.parse_args() + + scan_motors(args.port, args.brand, args.model) From 843c0996976d3f8f3f086bedfb38c072a73ec643 Mon Sep 17 00:00:00 2001 From: ivelin Date: Mon, 27 Jan 2025 15:09:51 -0600 Subject: [PATCH 3/5] fix: remove work in progress files from branch Signed-off-by: ivelin --- .../common/policies/smolvlm/Smol_VLM_FT.ipynb | 413 --------------- .../policies/smolvlm/Smol_VLM_lerobot.ipynb | 499 ------------------ .../common/policies/smolvlm/requirements.txt | 7 - 3 files changed, 919 deletions(-) delete mode 100644 lerobot/common/policies/smolvlm/Smol_VLM_FT.ipynb delete mode 100644 lerobot/common/policies/smolvlm/Smol_VLM_lerobot.ipynb delete mode 100644 lerobot/common/policies/smolvlm/requirements.txt diff --git a/lerobot/common/policies/smolvlm/Smol_VLM_FT.ipynb b/lerobot/common/policies/smolvlm/Smol_VLM_FT.ipynb deleted file mode 100644 index 93907ef1..00000000 --- a/lerobot/common/policies/smolvlm/Smol_VLM_FT.ipynb +++ /dev/null @@ -1,413 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "view-in-github" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nc0g2NLpUSGr" - }, - "source": [ - "# Fine-tune SmolVLM on Visual Question Answering using Consumer GPU with QLoRA\n", - "\n", - "In this notebook we will fine-tune SmolVLM VQAv2 dataset. With this notebook you can also fine-tune Idefics3, since both models have the same model class/architecture.\n", - "\n", - "We will use some techniques in this notebook that will let you fine-tune the model on L4 with batch size of 4 only using around 16.4 GB of VRAM. We ran this notebook in that setup to test, but because we were able to afford A100 this notebook was last ran on an A100." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WIhA1lQ7j0kw" - }, - "outputs": [], - "source": [ - "!pip install -q accelerate datasets peft bitsandbytes tensorboard" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XyJaqZZ3uYYl" - }, - "outputs": [], - "source": [ - "!pip install -q flash-attn --no-build-isolation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wAeMA0heVBjT" - }, - "source": [ - "We will push out model to Hub so we need to authenticate ourselves." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "yKd5xtSGj7cm" - }, - "outputs": [], - "source": [ - "from huggingface_hub import notebook_login\n", - "\n", - "notebook_login()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WRq8ve-LVAzU" - }, - "source": [ - "In this notebook we will not do full fine-tuning but use QLoRA method, which loads an adapter to the quantized version of the model, saving space. If you want to do full fine-tuning, set `USE_LORA` and `USE_QLORA` to False. If you want to do LoRA, set `USE_QLORA`Β to False and `USE_LORA`Β to True." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "referenced_widgets": [ - "23d3d175e6e642c7abc2bce09b73cf4d", - "db6ca8f47f274464b135909c907c946a", - "d05822c6293c424fbf9df6ec0f6b532b", - "05582fca18f443d6965776a721e30e9f", - "3d8974fd1ba9415c8070c1eab8ad75cb", - "648257c1b1c24e25a26355bddf75aa41", - "afa9a31c6b7f45e082ae07dea4a2600e", - "92232af543a4446cac53e4fcf3f4b6e1", - "a5f06e59634f4edf9f3d9409846a2b31", - "7ddfa8718bc24882ba2b50a899656107", - "5983728a1c1e43edb4d16bee6ad40171", - "dff574197f1f4466abb0eb46d36b8378" - ] - }, - "id": "b9CDMq0duYYn", - "outputId": "65a4a5fa-fe4d-4243-b2d7-405a8aa81c04" - }, - "outputs": [], - "source": [ - "import torch\n", - "from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model\n", - "from transformers import AutoProcessor, BitsAndBytesConfig, Idefics3ForConditionalGeneration\n", - "\n", - "USE_LORA = False\n", - "USE_QLORA = True\n", - "SMOL = True\n", - "\n", - "model_id = \"HuggingFaceTB/SmolVLM-Instruct\" if SMOL else \"HuggingFaceM4/Idefics3-8B-Llama3\"\n", - "\n", - "processor = AutoProcessor.from_pretrained(\n", - " model_id\n", - ")\n", - "\n", - "if USE_QLORA or USE_LORA:\n", - " lora_config = LoraConfig(\n", - " r=8,\n", - " lora_alpha=8,\n", - " lora_dropout=0.1,\n", - " target_modules=['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj'],\n", - " use_dora=False if USE_QLORA else True,\n", - " init_lora_weights=\"gaussian\"\n", - " )\n", - " lora_config.inference_mode = False\n", - " if USE_QLORA:\n", - " bnb_config = BitsAndBytesConfig(\n", - " load_in_4bit=True,\n", - " bnb_4bit_use_double_quant=True,\n", - " bnb_4bit_quant_type=\"nf4\",\n", - " bnb_4bit_compute_dtype=torch.bfloat16\n", - " )\n", - "\n", - " model = Idefics3ForConditionalGeneration.from_pretrained(\n", - " model_id,\n", - " quantization_config=bnb_config if USE_QLORA else None,\n", - " _attn_implementation=\"flash_attention_2\",\n", - " device_map=\"auto\"\n", - " )\n", - " model.add_adapter(lora_config)\n", - " model.enable_adapters()\n", - " model = prepare_model_for_kbit_training(model)\n", - " model = get_peft_model(model, lora_config)\n", - " print(model.get_nb_trainable_parameters())\n", - "else:\n", - " model = Idefics3ForConditionalGeneration.from_pretrained(\n", - " model_id,\n", - " torch_dtype=torch.bfloat16,\n", - " _attn_implementation=\"flash_attention_2\",\n", - " ).to(DEVICE)\n", - "\n", - " # if you'd like to only fine-tune LLM\n", - " for param in model.model.vision_model.parameters():\n", - " param.requires_grad = False" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WIVhpp0EyZO2" - }, - "source": [ - "The model as is is holding 2.7 GB of GPU RAM πŸ’—" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LMTtg3dl3NX2" - }, - "source": [ - "##Β Loading the dataset and Preprocessing" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pWHMWTSZ3Pyr" - }, - "source": [ - "We will load a small portion of the VQAv2 dataset. We are loading a small portion of the model for education purposes." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "POOqKqYRka5O" - }, - "outputs": [], - "source": [ - "from datasets import load_dataset\n", - "ds = load_dataset('merve/vqav2-small', trust_remote_code=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Znf9vMo5rnSd" - }, - "outputs": [], - "source": [ - "split_ds = ds[\"validation\"].train_test_split(test_size=0.5)\n", - "train_ds = split_ds[\"train\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "FIDioFlRuYYn", - "outputId": "79b697a7-d245-4fdc-b0e8-d9ffa8627953" - }, - "outputs": [], - "source": [ - "train_ds" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5nwMO3n0X7Hv" - }, - "source": [ - "Let's write our data collating function. We will apply prompt template to have questions and answers together so model can learn to answer. Then we pass the formatted prompts and images to the processor which processes both." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "e0krVLZ-wNMl" - }, - "outputs": [], - "source": [ - "image_token_id = processor.tokenizer.additional_special_tokens_ids[\n", - " processor.tokenizer.additional_special_tokens.index(\"\")]\n", - "\n", - "def collate_fn(examples):\n", - " texts = []\n", - " images = []\n", - " for example in examples:\n", - " image = example[\"image\"]\n", - " if image.mode != 'RGB':\n", - " image = image.convert('RGB')\n", - " question = example[\"question\"]\n", - " answer = example[\"multiple_choice_answer\"]\n", - " messages = [\n", - " {\n", - " \"role\": \"user\",\n", - " \"content\": [\n", - " {\"type\": \"text\", \"text\": \"Answer briefly.\"},\n", - " {\"type\": \"image\"},\n", - " {\"type\": \"text\", \"text\": question}\n", - " ]\n", - " },\n", - " {\n", - " \"role\": \"assistant\",\n", - " \"content\": [\n", - " {\"type\": \"text\", \"text\": answer}\n", - " ]\n", - " }\n", - " ]\n", - " text = processor.apply_chat_template(messages, add_generation_prompt=False)\n", - " texts.append(text.strip())\n", - " images.append([image])\n", - "\n", - " batch = processor(text=texts, images=images, return_tensors=\"pt\", padding=True)\n", - " labels = batch[\"input_ids\"].clone()\n", - " labels[labels == processor.tokenizer.pad_token_id] = -100\n", - " labels[labels == image_token_id] = -100\n", - " batch[\"labels\"] = labels\n", - "\n", - " return batch" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kEYDjWpE3LD5" - }, - "source": [ - "## Training" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QvAs896cdwg8" - }, - "source": [ - "We can now initialize `Trainer`Β and initialize `TrainingArguments`Β to pass to `Trainer`.\n", - "\n", - "Some notes:\n", - "- If you use 8-bit QLoRA with the below setup it uses around 16.4 GB VRAM (beautiful, fits comfortably inside L4, Colab free tier)\n", - "- We use gradient accumulation to simulate a larger batch size.\n", - "- We also save up on memory from intermediate activations by using gradient checkpointing.\n", - "\n", - "**Disclaimer:**\n", - "The techniques here aren't free lunch. 
The latter two will add additional compute to the training, thus slow down a bit (for reference on two A100s with bsz of 16, we were able to train for 2 hrs 43 mins with the gradient accumulation steps of 4, disabling it reduced it with 2 hr 35 mins).\n", - "If you want to speed-up, you might play around, reduce to 4-bit precision and have a higher batch size. Note that 4-bit might result in model learning less." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QNE2yWAYrAhD" - }, - "outputs": [], - "source": [ - "from transformers import TrainingArguments, Trainer\n", - "\n", - "model_name = model_id.split(\"/\")[-1]\n", - "\n", - "training_args = TrainingArguments(\n", - " num_train_epochs=1,\n", - " per_device_train_batch_size=4,\n", - " gradient_accumulation_steps=4,\n", - " warmup_steps=50,\n", - " learning_rate=1e-4,\n", - " weight_decay=0.01,\n", - " logging_steps=25,\n", - " save_strategy=\"steps\",\n", - " save_steps=250,\n", - " save_total_limit=1,\n", - " optim=\"paged_adamw_8bit\", # for 8-bit, keep this, else adamw_hf\n", - " bf16=True, #Β underlying precision for 8bit\n", - " output_dir=f\"./{model_name}-vqav2\",\n", - " hub_model_id=f\"{model_name}-vqav2\",\n", - " report_to=\"wandb\",\n", - " remove_unused_columns=False,\n", - " gradient_checkpointing=True\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "oBBSDpBhreJd", - "outputId": "071ed677-1d9f-4f98-9d19-64834440c9c4" - }, - "outputs": [], - "source": [ - "trainer = Trainer(\n", - " model=model,\n", - " args=training_args,\n", - " data_collator=collate_fn,\n", - " train_dataset=train_ds,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_QOCpw_-uYYo", - "outputId": "7abb6937-c072-435a-c3f5-6dbb5b0b9eea" - }, - "outputs": [], - "source": [ - "trainer.train()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0hN0QD9_uYYo" - }, - "outputs": [], - "source": [ - "trainer.push_to_hub()" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "A100", - "include_colab_link": true, - "name": "Smol_VLM_FT.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "zk0", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/lerobot/common/policies/smolvlm/Smol_VLM_lerobot.ipynb b/lerobot/common/policies/smolvlm/Smol_VLM_lerobot.ipynb deleted file mode 100644 index 84de0d93..00000000 --- a/lerobot/common/policies/smolvlm/Smol_VLM_lerobot.ipynb +++ /dev/null @@ -1,499 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "view-in-github" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nc0g2NLpUSGr" - }, - "source": [ - "# Fine-tune SmolVLM on LeRobot Dataset using Consumer GPU with QLoRA\n", - "\n", - "In this notebook we will fine-tune SmolVLM Instruct on LeRobot PushT dataset. 
It is based on the SmolVLM VQA2 fine tuning, which is in turn based on Idefics3 model class/architecture.\n", - "\n", - "We will use some techniques in this notebook that will let you fine-tune the model on L4 with batch size of 4 only using around 16.4 GB of VRAM. We ran this notebook in that setup to test, but because we were able to afford A100 this notebook was last ran on an A100." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WIhA1lQ7j0kw" - }, - "outputs": [], - "source": [ - "!pip install -q accelerate datasets peft bitsandbytes tensorboard" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XyJaqZZ3uYYl" - }, - "outputs": [], - "source": [ - "!pip install -q flash-attn --no-build-isolation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wAeMA0heVBjT" - }, - "source": [ - "We will push out model to Hub so we need to authenticate ourselves." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "yKd5xtSGj7cm" - }, - "outputs": [], - "source": [ - "from huggingface_hub import notebook_login\n", - "\n", - "notebook_login()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WRq8ve-LVAzU" - }, - "source": [ - "In this notebook we will not do full fine-tuning but use QLoRA method, which loads an adapter to the quantized version of the model, saving space. If you want to do full fine-tuning, set `USE_LORA` and `USE_QLORA` to False. If you want to do LoRA, set `USE_QLORA`Β to False and `USE_LORA`Β to True." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "referenced_widgets": [ - "23d3d175e6e642c7abc2bce09b73cf4d", - "db6ca8f47f274464b135909c907c946a", - "d05822c6293c424fbf9df6ec0f6b532b", - "05582fca18f443d6965776a721e30e9f", - "3d8974fd1ba9415c8070c1eab8ad75cb", - "648257c1b1c24e25a26355bddf75aa41", - "afa9a31c6b7f45e082ae07dea4a2600e", - "92232af543a4446cac53e4fcf3f4b6e1", - "a5f06e59634f4edf9f3d9409846a2b31", - "7ddfa8718bc24882ba2b50a899656107", - "5983728a1c1e43edb4d16bee6ad40171", - "dff574197f1f4466abb0eb46d36b8378" - ] - }, - "id": "b9CDMq0duYYn", - "outputId": "65a4a5fa-fe4d-4243-b2d7-405a8aa81c04" - }, - "outputs": [], - "source": [ - "import torch\n", - "from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model\n", - "from transformers import AutoProcessor, BitsAndBytesConfig, Idefics3ForConditionalGeneration\n", - "\n", - "USE_LORA = False\n", - "USE_QLORA = True\n", - "\n", - "model_id = \"HuggingFaceTB/SmolVLM-Instruct\"\n", - "\n", - "processor = AutoProcessor.from_pretrained(\n", - " model_id\n", - ")\n", - "\n", - "if USE_QLORA or USE_LORA:\n", - " lora_config = LoraConfig(\n", - " r=8,\n", - " lora_alpha=8,\n", - " lora_dropout=0.1,\n", - " target_modules=['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj'],\n", - " use_dora=False if USE_QLORA else True,\n", - " init_lora_weights=\"gaussian\"\n", - " )\n", - " lora_config.inference_mode = False\n", - " if USE_QLORA:\n", - " bnb_config = BitsAndBytesConfig(\n", - " load_in_4bit=True,\n", - " bnb_4bit_use_double_quant=True,\n", - " bnb_4bit_quant_type=\"nf4\",\n", - " bnb_4bit_compute_dtype=torch.bfloat16\n", - " )\n", - "\n", - " model = Idefics3ForConditionalGeneration.from_pretrained(\n", - " model_id,\n", - " quantization_config=bnb_config if USE_QLORA else None,\n", - " _attn_implementation=\"flash_attention_2\",\n", - " device_map=\"auto\"\n", - " 
)\n", - " model.add_adapter(lora_config)\n", - " model.enable_adapters()\n", - " model = prepare_model_for_kbit_training(model)\n", - " model = get_peft_model(model, lora_config)\n", - " print(model.get_nb_trainable_parameters())\n", - "else:\n", - " model = Idefics3ForConditionalGeneration.from_pretrained(\n", - " model_id,\n", - " torch_dtype=torch.bfloat16,\n", - " _attn_implementation=\"flash_attention_2\",\n", - " device_map=\"auto\" \n", - " )\n", - "\n", - " # if you'd like to only fine-tune LLM\n", - " for param in model.model.vision_model.parameters():\n", - " param.requires_grad = False" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "WIVhpp0EyZO2" - }, - "source": [ - "The model as is is holding 2.7 GB of GPU RAM πŸ’—" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "LMTtg3dl3NX2" - }, - "source": [ - "##Β Loading the dataset and Preprocessing" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pWHMWTSZ3Pyr" - }, - "source": [ - "We will load a small portion of the PushT dataset. We are loading a small portion of the model for education purposes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "from pathlib import Path\n", - "\n", - "from lerobot.common.datasets.lerobot_dataset import LeRobotDataset\n", - "from lerobot.common.policies.diffusion.configuration_diffusion import DiffusionConfig\n", - "from lerobot.common.policies.diffusion.modeling_diffusion import DiffusionPolicy\n", - "\n", - "# Create a directory to store the training checkpoint.\n", - "output_directory = Path(\"outputs/train/example_pusht_diffusion\")\n", - "output_directory.mkdir(parents=True, exist_ok=True)\n", - "\n", - "# Number of offline training steps (we'll only do offline training for this example.)\n", - "# Adjust as you prefer. 5000 steps are needed to get something worth evaluating.\n", - "training_steps = 5000\n", - "device = torch.device(\"cuda\")\n", - "log_freq = 250\n", - "\n", - "# Set up the dataset.\n", - "delta_timestamps = {\n", - " # Load the previous image and state at -0.1 seconds before current frame,\n", - " # then load current image and state corresponding to 0.0 second.\n", - " \"observation.image\": [-0.1, 0.0],\n", - " \"observation.state\": [-0.1, 0.0],\n", - " # Load the previous action (-0.1), the next action to be executed (0.0),\n", - " # and 14 future actions with a 0.1 seconds spacing. 
All these actions will be\n",
-    "    # used to supervise the policy.\n",
-    "    \"action\": [-0.1, 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4],\n",
-    "}\n",
-    "dataset = LeRobotDataset(\"lerobot/pusht\", delta_timestamps=delta_timestamps)\n",
-    "\n",
-    "# Set up the policy.\n",
-    "# Policies are initialized with a configuration class, in this case `DiffusionConfig`.\n",
-    "# For this example, no arguments need to be passed because the defaults are set up for PushT.\n",
-    "# If you're doing something different, you will likely need to change at least some of the defaults.\n",
-    "# cfg = DiffusionConfig()\n",
-    "# policy = DiffusionPolicy(cfg, dataset_stats=dataset.meta.stats)\n",
-    "# policy.train()\n",
-    "# policy.to(device)\n",
-    "# optimizer = torch.optim.Adam(policy.parameters(), lr=1e-4)\n",
-    "\n",
-    "# Create dataloader for offline training.\n",
-    "dataloader = torch.utils.data.DataLoader(\n",
-    "    dataset,\n",
-    "    num_workers=4,\n",
-    "    batch_size=64,\n",
-    "    shuffle=True,\n",
-    "    pin_memory=device != torch.device(\"cpu\"),\n",
-    "    drop_last=True,\n",
-    ")\n",
-    "\n",
-    "# Since pusht is evaluated via gym, we will use all pre-recorded data for training\n",
-    "# and generate the test/eval data later.\n",
-    "\n",
-    "train_ds = dataloader"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "5nwMO3n0X7Hv"
-   },
-   "source": [
-    "Let's write our data collating function. We apply the chat template so that the prompt and the target answer appear together, which lets the model learn to answer. Then we pass the formatted prompts and images to the processor, which processes both."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "id": "e0krVLZ-wNMl"
-   },
-   "outputs": [],
-   "source": [
-    "import json\n",
-    "\n",
-    "image_token_id = processor.tokenizer.additional_special_tokens_ids[\n",
-    "    processor.tokenizer.additional_special_tokens.index(\"<image>\")]\n",
-    "\n",
-    "def collate_fn(examples):\n",
-    "    \"\"\"\n",
-    "    Transform lerobot v2.0 formatted pusht dataset samples to smolvlm input format.\n",
-    "    pusht dataset format spec:\n",
-    "      https://huggingface.co/datasets/lerobot/pusht/blob/main/meta/info.json\n",
-    "    smolvlm dataset format spec:\n",
-    "      https://huggingface.co/datasets/merve/vqav2-small based on https://huggingface.co/merve/idefics3llama-vqav2\n",
-    "    \"\"\"\n",
-    "    tasks = dataset.meta[\"tasks\"]\n",
-    "    texts = []\n",
-    "    images = []\n",
-    "    for example in examples:\n",
-    "        # Each observation key holds two frames (t-0.1s and t=0) because of the\n",
-    "        # delta_timestamps configured above.\n",
-    "        previous_image, current_image = example[\"observation.image\"]\n",
-    "        previous_state, current_state = example[\"observation.state\"]\n",
-    "        # Encode the states and the supervised action sequence as JSON text.\n",
-    "        # Note: this text encoding is an illustrative choice for this prototype, not a fixed spec.\n",
-    "        previous_action_state = json.dumps(previous_state.tolist())\n",
-    "        current_action_state = json.dumps(current_state.tolist())\n",
-    "        predicted_action_state_sequence = json.dumps(example[\"action\"].tolist())\n",
-    "        task = tasks[example[\"task_index\"]]\n",
-    "\n",
-    "        messages = [\n",
-    "            {\n",
-    "                \"role\": \"user\",\n",
-    "                \"content\": [\n",
-    "                    {\"type\": \"text\", \"text\": \"Answer with a sequence of up to 14 robot action states in json format.\"},\n",
-    "                    {\"type\": \"text\", \"text\": task},\n",
-    "                    {\"type\": \"image\"},\n",
-    "                    {\"type\": \"text\", \"text\": previous_action_state},\n",
-    "                    {\"type\": \"image\"},\n",
-    "                    {\"type\": \"text\", \"text\": current_action_state},\n",
-    "                ]\n",
-    "            },\n",
-    "            {\n",
-    "                \"role\": \"assistant\",\n",
-    "                \"content\": [\n",
-    "                    {\"type\": \"text\", \"text\": predicted_action_state_sequence}\n",
-    "                ]\n",
-    "            }\n",
-    "        ]\n",
-    "        text = processor.apply_chat_template(messages, add_generation_prompt=False)\n",
-    "        texts.append(text.strip())\n",
-    "        # One image per {\"type\": \"image\"} placeholder above: the previous and the current frame.\n",
-    "        # The frames are float tensors here; depending on the processor version they may need\n",
-    "        # conversion to PIL images or uint8 arrays first.\n",
-    "        images.append([previous_image, current_image])\n",
-    "\n",
-    "    batch = processor(text=texts, images=images, return_tensors=\"pt\", padding=True)\n",
-    "    labels = batch[\"input_ids\"].clone()\n",
-    "    labels[labels == processor.tokenizer.pad_token_id] = -100\n",
-    "    labels[labels == image_token_id] = -100\n",
-    "    batch[\"labels\"] = labels\n",
-    "\n",
-    "    return batch"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "kEYDjWpE3LD5"
-   },
-   "source": [
-    "## Training"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "QvAs896cdwg8"
-   },
-   "source": [
-    "We can now set up `TrainingArguments` and pass them to the `Trainer`.\n",
-    "\n",
-    "Some notes:\n",
-    "- If you use QLoRA with the setup below, it uses around 16.4 GB of VRAM (beautiful, it fits comfortably inside an L4, Colab free tier).\n",
-    "- We use gradient accumulation to simulate a larger batch size.\n",
-    "- We also save memory on intermediate activations by using gradient checkpointing.\n",
-    "\n",
-    "**Disclaimer:**\n",
-    "The techniques here aren't a free lunch. The latter two add extra compute to the training and thus slow it down a bit (for reference, on two A100s with a batch size of 16, training took 2 hrs 43 mins with gradient accumulation steps of 4, and 2 hrs 35 mins with it disabled).\n",
-    "If you want to speed things up, you can experiment with 4-bit precision and a higher batch size. Note that 4-bit might result in the model learning less."
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QNE2yWAYrAhD" - }, - "outputs": [], - "source": [ - "from transformers import TrainingArguments, Trainer\n", - "\n", - "model_name = \"smolvlm-lerobot\"\n", - "\n", - "training_args = TrainingArguments(\n", - " num_train_epochs=1,\n", - " per_device_train_batch_size=4,\n", - " gradient_accumulation_steps=4,\n", - " warmup_steps=50,\n", - " learning_rate=1e-4,\n", - " weight_decay=0.01,\n", - " logging_steps=25,\n", - " save_strategy=\"steps\",\n", - " save_steps=250,\n", - " save_total_limit=1,\n", - " optim=\"paged_adamw_8bit\", # for 8-bit, keep this, else adamw_hf\n", - " bf16=True, #Β underlying precision for 8bit\n", - " output_dir=f\"./{model_name}-pusht\",\n", - " hub_model_id=f\"{model_name}-pusht\",\n", - " report_to=\"wandb\",\n", - " remove_unused_columns=False,\n", - " gradient_checkpointing=True\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "oBBSDpBhreJd", - "outputId": "071ed677-1d9f-4f98-9d19-64834440c9c4" - }, - "outputs": [], - "source": [ - "trainer = Trainer(\n", - " model=model,\n", - " args=training_args,\n", - " data_collator=collate_fn,\n", - " train_dataset=train_ds,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_QOCpw_-uYYo", - "outputId": "7abb6937-c072-435a-c3f5-6dbb5b0b9eea" - }, - "outputs": [], - "source": [ - "trainer.train()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0hN0QD9_uYYo" - }, - "outputs": [], - "source": [ - "trainer.push_to_hub()" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "A100", - "include_colab_link": true, - "name": "Smol_VLM_FT.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "zk0", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/lerobot/common/policies/smolvlm/requirements.txt b/lerobot/common/policies/smolvlm/requirements.txt deleted file mode 100644 index f4153b89..00000000 --- a/lerobot/common/policies/smolvlm/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ -transformers -trl -peft -accelerate -datasets -wandb -bitsandbytes \ No newline at end of file From 292037247680e800ed465894febbde58aa6175ac Mon Sep 17 00:00:00 2001 From: ivelin Date: Mon, 27 Jan 2025 15:17:19 -0600 Subject: [PATCH 4/5] docs: explain how to determine which cable is disconnected Signed-off-by: ivelin --- examples/10_use_so100.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/10_use_so100.md b/examples/10_use_so100.md index 57fa832d..be331456 100644 --- a/examples/10_use_so100.md +++ b/examples/10_use_so100.md @@ -144,7 +144,7 @@ Follow step 4 of the [assembly video](https://youtu.be/FioA2oeFZ5I?t=610). The f ### c. Troubleshooting Sometimes during assembly, the cables connecting the motors or the power adapter cable may be accidentally disconnected. To be sure that the motors are properly connected and functioning after assembly, use the scan_motors tool to test each arm. -The output should look similar to the example below. If the list of motor IDs is shorter than 6, there is probably a poorly connected cable or a motor is misconfigured. 
+The output should look similar to the example below. If the list of motor IDs is shorter than 6, there is probably a poorly connected cable or a motor is misconfigured. For example if only motor IDs [1,3] show up, that indicates that the serial cable between motor 3 and 4 is disconnected. ```bash lerobot$ python lerobot/scripts/scan_motors.py --port /dev/ttyACM0 --brand feetech --model sts3215 From 1a4f0983b7d16e0b65a7185f0ed58421df0a1fa8 Mon Sep 17 00:00:00 2001 From: ivelin Date: Tue, 28 Jan 2025 08:33:13 -0600 Subject: [PATCH 5/5] docs: add calibrate troubleshooting sub-section for shaft out of position problem Signed-off-by: ivelin --- examples/10_use_so100.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/examples/10_use_so100.md b/examples/10_use_so100.md index be331456..7f33860b 100644 --- a/examples/10_use_so100.md +++ b/examples/10_use_so100.md @@ -204,7 +204,6 @@ python lerobot/scripts/control_robot.py calibrate \ --robot-overrides '~cameras' --arms main_follower ``` - #### b. Manual calibration of leader arm Follow step 6 of the [assembly video](https://youtu.be/FioA2oeFZ5I?t=724) which illustrates the manual calibration. You will need to move the leader arm to these positions sequentially: @@ -220,6 +219,19 @@ python lerobot/scripts/control_robot.py calibrate \ ``` +### c. Troubleshooting + +Another known issue during calibration is related to the positioning of the motor shafts. The error message looks like this: + +``` +Calibration is done! Saving calibration file '.cache/calibration/moss/main_leader.json' +Activating torque on main follower arm. +Wrong motor position range detected for gripper. Expected to be in nominal range of [0, 100] % (a full linear translation), with a maximum range of [-10, 110] % to account for some imprecision during calibration, but present value is 143.7950897216797 %. This might be due to a cable connection issue creating an artificial jump in motor values. You need to recalibrate ... +``` + +When all joints are mounted and the robot arm is in resting position (as shown in the calibration photos) each motor shaft should be in approximately middle state ~2048 of its full [0,4096] range. You can use the `scan_motors.py` tool again as shown above to check Present Position for each motor. Motors that are too far off the mid-value in rested arm position need to be repositioned by unscrewing the attached 3D printed part and rotating it such that when screwed back on and returned to resting position, its Present Position reading is about mid-range. + + ## F. Teleoperate **Simple teleop**