# tl;dr: torch.optim.lr_scheduler.OneCycleLR works well: it covers both
# warmup and a cosine learning-rate schedule, with no extra packages needed.
import torch
from torch.optim.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmRestarts, StepLR
import torch.nn as nn
import matplotlib.pyplot as plt
from timm import scheduler as timm_scheduler
from torch.optim import Optimizer
from torch.optim import lr_scheduler
from torch.optim.lr_scheduler import _LRScheduler
import timm.scheduler
import math
from transformers import get_cosine_schedule_with_warmup
# Demo configuration: a toy one-layer model whose optimizer the schedulers
# below are attached to.
lr = 1  # peak learning rate (a realistic run might use 32 * 4.5e-6)
max_epoch = 10
steps_per_epoch = 10
mode = 'oneCycle'  # which scheduler variant to demonstrate
current_epoch = 0  # epoch index, used by epoch-indexed schedulers (timm)
model = nn.Linear(10, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
match mode:
case 'cosineAnn':
steps_per_epoch = 1
scheduler = CosineAnnealingLR(optimizer, T_max=10, eta_min=4.5e-6)
case 'cosineAnnWarm':
'''
以T_0=5, T_mult=1为例:
T_0:学习率第一次回到初始值的epoch位置.
T_mult:这个控制了学习率回升的速度
- 如果T_mult=1,则学习率在T_0,2*T_0,3*T_0,....,i*T_0,....处回到最大值(初始学习率)
- 5,10,15,20,25,.......处回到最大值
- 如果T_mult>1,则学习率在T_0,(1+T_mult)*T_0,(1+T_mult+T_mult**2)*T_0,.....,(1+T_mult+T_mult**2+...+T_0**i)*T0,处回到最大值
- 5,15,35,75,155,.......处回到最大值
example:
T_0=5, T_mult=1
'''
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=8, T_mult=1)
case 'cosineTimm':
steps_per_epoch = 1
scheduler = timm_scheduler.CosineLRScheduler(optimizer=optimizer, t_initial=max_epoch, lr_min=4.5e-6, warmup_t=1, warmup_lr_init=4.5e-6)
scheduler.step = lambda: scheduler.step(current_epoch)
case 'cosineTorchLambda':
warmup_epoch = 2
warmup_factor = 1e-3
steps_per_epoch = 1
def f(current_epoch):
"""
:current_epoch epoch或者iteration
:return 根据step数返回一个学习率倍率因子
注意在训练开始之前,pytorch似乎会提前调用一次lr_scheduler.step()方法
"""
if current_epoch <= warmup_epoch:
alpha = float(current_epoch) / (warmup_epoch)
# warmup过程中lr倍率因子大小从warmup_factor -> 1
return warmup_factor * (1 - alpha) + alpha # 对于alpha的一个线性变换,alpha是关于x的一个反比例函数变化
else:
# warmup后lr的倍率因子从1 -> 0
# 参考deeplab_v2: Learning rate policy
return (1 - (current_epoch - warmup_epoch) / (max_epoch - warmup_epoch)) ** 0.9
# (1-a/b)^0.9 b是当前这个epoch结束训练总共了多少次了(除去warmup),这个关系是指一个epcoch中
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=f)
case 'step':
steps_per_epoch = 1
scheduler = lr_scheduler.StepLR(optimizer, step_size=6, gamma=0.5)
case 'oneCycle':
steps_per_epoch = 10
scheduler = lr_scheduler.OneCycleLR(optimizer, max_lr=lr, epochs=max_epoch, steps_per_epoch=steps_per_epoch, pct_start=0.1, final_div_factor=10)
case 'cosineTransformers':
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=steps_per_epoch, num_training_steps=max_epoch*steps_per_epoch)
# Drive the scheduler through a fake training run and plot the LR curve.
plt.figure()
lr_history = []
for epoch in range(max_epoch):
    # Keep the epoch index visible to epoch-indexed schedulers (the timm
    # adapter reads the module-level current_epoch); the original loop never
    # updated it, leaving such schedulers stuck at epoch 0.
    current_epoch = epoch
    for step in range(steps_per_epoch):
        # Since PyTorch 1.1 optimizer.step() must precede scheduler.step(),
        # otherwise the first value of the schedule is skipped.
        optimizer.step()
        scheduler.step()
        # Record the LR actually in effect after this step.
        lr_history.append(optimizer.param_groups[0]['lr'])
print(lr_history)
plt.plot(range(len(lr_history)), lr_history)
# One sample per scheduler.step() call, so the x-axis is steps, not epochs.
plt.xlabel('Step')
plt.ylabel('Learning Rate')
plt.title(f'Learning-rate schedule: {mode}')
plt.show()