Port LR Schedulers

2024-05-29 17:46:38 -07:00 · 2024-05-29 17:46:38 -07:00 · 90e6df3ecb
parent aca424a481
commit 90e6df3ecb
3 changed files with 239 additions and 1 deletions
--- a/lerobot/common/policies/lr_schedulers.py
+++ b/lerobot/common/policies/lr_schedulers.py
@ -0,0 +1,166 @@
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""PyTorch learning rate schedulers.
+
+Note: Most of this code was copied as is from the diffusers and transformers libraries with removal of
+certain features for simplication.
+"""
+
+import math
+from enum import Enum
+from functools import partial
+from typing import Optional, Union
+
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import LambdaLR
+
+
+class SchedulerType(Enum):
+    COSINE = "cosine"
+    INVERSE_SQRT = "inverse_sqrt"
+
+
+def get_cosine_schedule_with_warmup(
+    optimizer: Optimizer,
+    num_warmup_steps: int,
+    num_training_steps: int,
+    num_cycles: float = 0.5,
+    last_epoch: int = -1,
+) -> LambdaLR:
+    """
+    Create a schedule with a learning rate that decreases following the values of the cosine function between the
+    initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the
+    initial lr set in the optimizer.
+
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (`int`):
+            The number of steps for the warmup phase.
+        num_training_steps (`int`):
+            The total number of training steps.
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+
+    def lr_lambda(current_step):
+        if current_step < num_warmup_steps:
+            return float(current_step) / float(max(1, num_warmup_steps))
+        progress = float(current_step - num_warmup_steps) / float(
+            max(1, num_training_steps - num_warmup_steps)
+        )
+        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))
+
+    return LambdaLR(optimizer, lr_lambda, last_epoch)
+
+
+def _get_inverse_sqrt_schedule_lr_lambda(current_step: int, *, num_warmup_steps: int, timescale: int = None):
+    if current_step < num_warmup_steps:
+        return float(current_step) / float(max(1, num_warmup_steps))
+    shift = timescale - num_warmup_steps
+    decay = 1.0 / math.sqrt((current_step + shift) / timescale)
+    return decay
+
+
+def get_inverse_sqrt_schedule(
+    optimizer: Optimizer, num_warmup_steps: int, timescale: int = None, last_epoch: int = -1
+):
+    """
+    Create a schedule with an inverse square-root learning rate, from the initial lr set in the optimizer, after a
+    warmup period which increases lr linearly from 0 to the initial lr set in the optimizer.
+
+    Args:
+        optimizer ([`~torch.optim.Optimizer`]):
+            The optimizer for which to schedule the learning rate.
+        num_warmup_steps (`int`):
+            The number of steps for the warmup phase.
+        timescale (`int`, *optional*, defaults to `num_warmup_steps`):
+            Time scale.
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+
+    Return:
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+    """
+    # Note: this implementation is adapted from
+    # https://github.com/google-research/big_vision/blob/f071ce68852d56099437004fd70057597a95f6ef/big_vision/utils.py#L930
+
+    if timescale is None:
+        timescale = num_warmup_steps or 10_000
+
+    lr_lambda = partial(
+        _get_inverse_sqrt_schedule_lr_lambda, num_warmup_steps=num_warmup_steps, timescale=timescale
+    )
+    return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch)
+
+
+TYPE_TO_SCHEDULER_FUNCTION = {
+    SchedulerType.COSINE: get_cosine_schedule_with_warmup,
+    SchedulerType.INVERSE_SQRT: get_inverse_sqrt_schedule,
+}
+
+
+def get_scheduler(
+    name: Union[str, SchedulerType],
+    optimizer: Optimizer,
+    num_warmup_steps: Optional[int] = None,
+    num_training_steps: Optional[int] = None,
+    last_epoch: int = -1,
+) -> LambdaLR:
+    """
+    Unified API to get any scheduler from its name.
+
+    Args:
+        name (`str` or `SchedulerType`):
+            The name of the scheduler to use.
+        optimizer (`torch.optim.Optimizer`):
+            The optimizer that will be used during training.
+        num_warmup_steps (`int`, *optional*):
+            The number of warmup steps to do. This is not required by all schedulers (hence the argument being
+            optional), the function will raise an error if it's unset and the scheduler type requires it.
+        num_training_steps (`int``, *optional*):
+            The number of training steps to do. This is not required by all schedulers (hence the argument being
+            optional), the function will raise an error if it's unset and the scheduler type requires it.
+        last_epoch (`int`, *optional*, defaults to -1):
+            The index of the last epoch when resuming training.
+    """
+    name = SchedulerType(name)
+    if name not in TYPE_TO_SCHEDULER_FUNCTION:
+        raise ValueError(
+            f"Unsupported scheduler {name}, expected one of {list(TYPE_TO_SCHEDULER_FUNCTION.keys())}"
+        )
+
+    schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name]
+
+    # All other schedulers require `num_warmup_steps`
+    if num_warmup_steps is None:
+        raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.")
+
+    # All other schedulers require `num_training_steps`
+    if num_training_steps is None:
+        raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.")
+
+    if name == SchedulerType.INVERSE_SQRT:
+        return schedule_func(optimizer, num_warmup_steps=num_warmup_steps)
+
+    return schedule_func(
+        optimizer,
+        num_warmup_steps=num_warmup_steps,
+        num_training_steps=num_training_steps,
+        last_epoch=last_epoch,
+    )
--- a/lerobot/scripts/train.py
+++ b/lerobot/scripts/train.py
@ -69,7 +69,7 @@ def make_optimizer_and_scheduler(cfg, policy):
            cfg.training.adam_eps,
            cfg.training.adam_weight_decay,
        )
-        from diffusers.optimization import get_scheduler
+        from transformers.optimization import get_scheduler

        lr_scheduler = get_scheduler(
            cfg.training.lr_scheduler,
--- a/tests/test_lr_schedulers.py
+++ b/tests/test_lr_schedulers.py
@ -0,0 +1,72 @@
+import math
+
+import pytest
+import torch
+
+from lerobot.common.policies.lr_schedulers import get_scheduler
+
+
+def test_get_lr_scheduler():
+    optimizer = torch.optim.AdamW(torch.nn.Linear(10, 10).parameters(), lr=1e-4)
+
+    lr_scheduler = get_scheduler("cosine", optimizer, num_warmup_steps=500, num_training_steps=2000)
+    assert lr_scheduler is not None
+    assert lr_scheduler.__class__.__name__ == "LambdaLR"
+
+    lr_scheduler = get_scheduler("inverse_sqrt", optimizer, num_warmup_steps=500, num_training_steps=2000)
+    assert lr_scheduler is not None
+    assert lr_scheduler.__class__.__name__ == "LambdaLR"
+
+    with pytest.raises(ValueError):
+        get_scheduler("invalid", 100, 1000)
+
+
+def test_cosine_lr_scheduler():
+    intervals = 250
+    num_warmup_steps = 500
+    num_training_steps = 2000
+    recorded_lrs_at_intervals = [2.0e-7, 5.0e-5, 1.0e-4, 9.3e-5, 7.5e-5, 5.0e-5, 2.5e-5, 6.6e-6]
+    optimizer = torch.optim.AdamW(
+        torch.nn.Linear(10, 10).parameters(), lr=1e-4, betas=(0.95, 0.999), eps=1e-8, weight_decay=1e-6
+    )
+
+    lr_scheduler = get_scheduler(
+        "cosine", optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
+    )
+    assert lr_scheduler.get_last_lr()[0] == 0.0
+
+    for i in range(num_training_steps):
+        optimizer.step()
+        lr_scheduler.step()
+        if i % intervals == 0:
+            recorded = recorded_lrs_at_intervals.pop(0)
+            assert math.isclose(
+                lr_scheduler.get_last_lr()[0], recorded
+            ), f"LR value mismatch at step {i}: {lr_scheduler.get_last_lr()[0]} vs. {recorded}"
+
+    assert lr_scheduler.get_last_lr()[0] == recorded_lrs_at_intervals.pop(0)
+
+
+def test_inverse_sqrt_lr_scheduler():
+    intervals = 250
+    num_warmup_steps = 500
+    num_training_steps = 2000
+    recorded_lrs_at_intervals = [2.0e-7, 5.0e-5, 1.0e-4, 8.2e-5, 7.1e-5, 6.3e-5, 5.8e-5, 5.3e-5]
+    optimizer = torch.optim.AdamW(
+        torch.nn.Linear(10, 10).parameters(), lr=1e-4, betas=(0.95, 0.999), eps=1e-8, weight_decay=1e-6
+    )
+
+    lr_scheduler = get_scheduler(
+        "inverse_sqrt", optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
+    )
+
+    for i in range(num_training_steps):
+        optimizer.step()
+        lr_scheduler.step()
+        if i % intervals == 0:
+            recorded = recorded_lrs_at_intervals.pop(0)
+            assert math.isclose(
+                lr_scheduler.get_last_lr()[0], recorded
+            ), f"LR value mismatch at step {i}: {lr_scheduler.get_last_lr()[0]} vs. {recorded}"
+
+    assert lr_scheduler.get_last_lr()[0] == recorded_lrs_at_intervals.pop(0)