Tags: Machine learning, Neural network
Cosine learning-rate schedule
class CosineLRScheduler(keras.callbacks.Callback):
    """Keras callback that cosine-anneals the optimizer's learning rate per batch.

    Delegates the actual schedule to the module-level ``cosine_lr`` helper,
    stepping an internal counter once per training batch.
    """

    def __init__(self, total_steps: int, lr_max: float, lr_min: float):
        """Record schedule bounds and reset the batch counter to zero."""
        super().__init__()
        self.total_steps = total_steps
        self.lr_max = lr_max
        self.lr_min = lr_min
        self.n = 0  # number of batches seen so far

    def on_batch_begin(self, batch, logs=None):
        """Apply the scheduled learning rate, then advance the counter."""
        new_lr = cosine_lr(self.n, self.total_steps, self.lr_max, self.lr_min)
        self.model.optimizer.learning_rate = new_lr
        self.n += 1
def cosine_lr(step: int, total_steps: int, lr_max: float, lr_min: float):
    """Return the cosine-annealed learning rate for ``step``.

    Decays from ``lr_max`` at step 0 toward ``lr_min`` at ``total_steps``;
    any step at or beyond ``total_steps`` is clamped to ``lr_min``.
    """
    if step < total_steps:
        cosine_term = 1 + np.cos(np.pi * step / total_steps)
        return lr_min + 0.5 * (lr_max - lr_min) * cosine_term
    return lr_min
Cosine schedule with linear warmup
def get_lr(it: int, warmup_iters: int, max_iters: int,
           lr_max=None, lr_min_value=None) -> float:
    """Cosine learning-rate schedule with linear warmup.

    Args:
        it: Current iteration (0-based).
        warmup_iters: Number of iterations of linear warmup from 0 to peak.
        max_iters: Iteration at which the decay bottoms out at the floor.
        lr_max: Peak learning rate. Defaults to the module-level
            ``learning_rate`` global for backward compatibility.
        lr_min_value: Floor learning rate. Defaults to the module-level
            ``min_lr`` global for backward compatibility.

    Returns:
        The learning rate to use at iteration ``it``.
    """
    # Fall back to the module globals the original implementation relied on,
    # so existing callers are unaffected while new callers can be explicit.
    if lr_max is None:
        lr_max = learning_rate
    if lr_min_value is None:
        lr_min_value = min_lr
    # 1) linear warmup for warmup_iters steps (returns 0.0 at it == 0)
    if it < warmup_iters:
        return lr_max * it / warmup_iters
    # 2) past max_iters, hold at the minimum learning rate
    if it > max_iters:
        return lr_min_value
    # 3) in between, cosine decay from lr_max down to lr_min_value
    decay_ratio = (it - warmup_iters) / (max_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1  # internal invariant given the guards above
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # goes 1 -> 0
    return lr_min_value + coeff * (lr_max - lr_min_value)