import unittest
import functools
from copy import deepcopy
+import math
+from bisect import bisect_right
import torch
from torch._six import inf
import torch.optim as optim
from torch.optim import SGD
from torch.autograd import Variable
from torch import sparse
-from torch.optim.lr_scheduler import LambdaLR, StepLR, MultiStepLR, ExponentialLR, CosineAnnealingLR, ReduceLROnPlateau
+from torch.optim.lr_scheduler import (LambdaLR, StepLR, MultiStepLR,
+                                      ExponentialLR, CosineAnnealingLR,
+                                      ReduceLROnPlateau, _LRScheduler)
from common_utils import TestCase, run_tests, TEST_WITH_UBSAN, skipIfRocm, load_tests
# load_tests from common_utils is used to automatically filter tests for
# sharding on sandcastle. This line silences flake8 warnings.
class TestOptim(TestCase):
- def _test_rosenbrock_sparse(self, constructor, sparse_only=False):
+ def _test_rosenbrock_sparse(self, constructor, scheduler_constructors=None,
+ sparse_only=False):
+ if scheduler_constructors is None:
+ scheduler_constructors = []
params_t = torch.Tensor([1.5, 1.5])
params = Variable(params_t, requires_grad=True)
optimizer = constructor([params])
+ schedulers = []
+ for scheduler_constructor in scheduler_constructors:
+ schedulers.append(scheduler_constructor(optimizer))
if not sparse_only:
params_c = Variable(params_t.clone(), requires_grad=True)
# Do cyclic coordinate descent
w = i % 2
optimizer.step(functools.partial(eval, params, True, w))
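+ # ReduceLROnPlateau steps on the current loss value; the other
+ # schedulers step on their internal epoch counter.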
+ for scheduler in schedulers:
+ if isinstance(scheduler, ReduceLROnPlateau):
+ scheduler.step(rosenbrock(params))
+ else:
+ scheduler.step()
if not sparse_only:
optimizer_c.step(functools.partial(eval, params_c, False, w))
self.assertEqual(params.data, params_c.data)
self.assertLessEqual(params.data.dist(solution), initial_dist)
- def _test_basic_cases_template(self, weight, bias, input, constructor):
+ def _test_basic_cases_template(self, weight, bias, input, constructor, scheduler_constructors):
weight = Variable(weight, requires_grad=True)
bias = Variable(bias, requires_grad=True)
input = Variable(input)
optimizer = constructor(weight, bias)
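+ # Instantiate the schedulers against this optimizer so they can adjust
+ # its learning rate during the run.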
+ schedulers = []
+ for scheduler_constructor in scheduler_constructors:
+ schedulers.append(scheduler_constructor(optimizer))
# to check if the optimizer can be printed as a string
optimizer.__repr__()
return loss
initial_value = fn().item()
- for i in range(200):
+ for _i in range(200):
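+ # Step schedulers before the optimizer so each optimizer.step() uses
+ # the lr for the current epoch; ReduceLROnPlateau needs a fresh loss.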
+ for scheduler in schedulers:
+ if isinstance(scheduler, ReduceLROnPlateau):
+ val_loss = fn()
+ scheduler.step(val_loss)
+ else:
+ scheduler.step()
optimizer.step(fn)
self.assertLess(fn().item(), initial_value)
fn = functools.partial(fn_base, optimizer, weight, bias)
# Prime the optimizer
- for i in range(20):
+ for _i in range(20):
optimizer.step(fn)
# Clone the weights and construct new optimizer for them
weight_c = Variable(weight.data.clone(), requires_grad=True)
state_dict_c = deepcopy(optimizer.state_dict())
optimizer_c.load_state_dict(state_dict_c)
# Run both optimizations in parallel
- for i in range(20):
+ for _i in range(20):
optimizer.step(fn)
optimizer_c.step(fn_c)
self.assertEqual(weight, weight_c)
# Make sure state dict wasn't modified
self.assertEqual(state_dict, state_dict_c)
- for i in range(20):
+ for _i in range(20):
optimizer.step(fn)
optimizer_cuda.step(fn_cuda)
self.assertEqual(weight, weight_cuda)
self.assertEqual(bias, bias_cuda)
- def _test_basic_cases(self, constructor, ignore_multidevice=False):
+ def _test_basic_cases(self, constructor, scheduler_constructors=None,
+ ignore_multidevice=False):
+ if scheduler_constructors is None:
+ scheduler_constructors = []
self._test_state_dict(
torch.randn(10, 5),
torch.randn(10),
torch.randn(10, 5),
torch.randn(10),
torch.randn(5),
- constructor
+ constructor,
+ scheduler_constructors
)
# non-contiguous parameters
self._test_basic_cases_template(
torch.randn(10, 5, 2)[..., 0],
torch.randn(10, 2)[..., 0],
torch.randn(5),
- constructor
+ constructor,
+ scheduler_constructors
)
# CUDA
if not torch.cuda.is_available():
torch.randn(10, 5).cuda(),
torch.randn(10).cuda(),
torch.randn(5).cuda(),
- constructor
+ constructor,
+ scheduler_constructors
)
# Multi-GPU
if not torch.cuda.device_count() > 1 or ignore_multidevice:
torch.randn(10, 5).cuda(0),
torch.randn(10).cuda(1),
torch.randn(5).cuda(0),
- constructor
+ constructor,
+ scheduler_constructors
)
def _build_params_dict(self, weight, bias, **kwargs):
- return [dict(params=[weight]), dict(params=[bias], **kwargs)]
+ return [{'params': [weight]}, dict(params=[bias], **kwargs)]
def _build_params_dict_single(self, weight, bias, **kwargs):
return [dict(params=bias, **kwargs)]
lambda weight, bias: optim.SGD(
self._build_params_dict_single(weight, bias, lr=1e-2))
)
+ self._test_basic_cases(
+ lambda weight, bias: optim.SGD([weight, bias], lr=1e-3),
+ [lambda opt: StepLR(opt, gamma=0.9, step_size=10)]
+ )
+ self._test_basic_cases(
+ lambda weight, bias: optim.SGD([weight, bias], lr=1e-3),
+ [lambda opt: StepLR(opt, gamma=0.9, step_size=10),
+ lambda opt: ReduceLROnPlateau(opt)]
+ )
+ self._test_basic_cases(
+ lambda weight, bias: optim.SGD([weight, bias], lr=1e-3),
+ [lambda opt: StepLR(opt, gamma=0.99, step_size=10),
+ lambda opt: ExponentialLR(opt, gamma=0.99),
+ lambda opt: ReduceLROnPlateau(opt)]
+ )
with self.assertRaisesRegex(ValueError, "Invalid momentum value: -0.5"):
optim.SGD(None, lr=1e-2, momentum=-0.5)
self._test_rosenbrock_sparse(
lambda params: optim.SGD(params, lr=5e-3)
)
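+ # Same sparse problem with a near-identity schedule attached, so the
+ # optimizer should still converge.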
+ self._test_rosenbrock_sparse(
+ lambda params: optim.SGD(params, lr=0.005),
+ [lambda opt: StepLR(opt, gamma=0.99999, step_size=300)]
+ )
def test_adam(self):
self._test_basic_cases(
self._build_params_dict(weight, bias, lr=1e-2),
lr=1e-3, amsgrad=True)
)
+ self._test_basic_cases(
+ lambda weight, bias: optim.Adam(
+ self._build_params_dict(weight, bias, lr=1e-2),
+ lr=1e-3),
+ [lambda opt: ExponentialLR(opt, gamma=0.9)]
+ )
+ self._test_basic_cases(
+ lambda weight, bias: optim.Adam([weight, bias], lr=1e-3,
+ amsgrad=True),
+ [lambda opt: ExponentialLR(opt, gamma=0.9),
+ lambda opt: ReduceLROnPlateau(opt)]
+ )
+ self._test_basic_cases(
+ lambda weight, bias: optim.Adam(
+ self._build_params_dict(weight, bias, lr=1e-2),
+ lr=1e-3, amsgrad=True),
+ [lambda opt: StepLR(opt, gamma=0.9, step_size=10),
+ lambda opt: ReduceLROnPlateau(opt)]
+ )
with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 0: 1.0"):
optim.Adam(None, lr=1e-2, betas=(1.0, 0.0))
def test_sparse_adam(self):
self._test_rosenbrock_sparse(
lambda params: optim.SparseAdam(params, lr=4e-2),
- True
+ sparse_only=True
)
with self.assertRaisesRegex(ValueError, "Invalid beta parameter at index 0: 1.0"):
lambda weight, bias: optim.Adadelta(
self._build_params_dict(weight, bias, rho=0.95))
)
+ self._test_basic_cases(
+ lambda weight, bias: optim.Adadelta(
+ self._build_params_dict(weight, bias, rho=0.95)),
+ [lambda opt: StepLR(opt, gamma=0.9, step_size=10),
+ lambda opt: ReduceLROnPlateau(opt)]
+ )
with self.assertRaisesRegex(ValueError, "Invalid rho value: 1.1"):
optim.Adadelta(None, lr=1e-2, rho=1.1)
self._build_params_dict(weight, bias, lr=1e-2),
lr=1e-1)
)
+ self._test_basic_cases(
+ lambda weight, bias: optim.Adagrad(
+ self._build_params_dict(weight, bias, lr=1e-2),
+ lr=1e-1),
+ [lambda opt: ReduceLROnPlateau(opt)]
+ )
+ self._test_basic_cases(
+ lambda weight, bias: optim.Adagrad(
+ self._build_params_dict(weight, bias, lr=1e-2),
+ lr=1e-1),
+ [lambda opt: ReduceLROnPlateau(opt),
+ lambda opt: ExponentialLR(opt, gamma=0.99)]
+ )
with self.assertRaisesRegex(ValueError, "Invalid lr_decay value: -0.5"):
optim.Adagrad(None, lr=1e-2, lr_decay=-0.5)
self._test_rosenbrock_sparse(
lambda params: optim.Adagrad(params, lr=1e-1)
)
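+ # Sparse Adagrad with near-identity schedulers chained on top.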
+ self._test_rosenbrock_sparse(
+ lambda params: optim.Adagrad(params, lr=0.1),
+ [lambda opt: StepLR(opt, gamma=1 - 1e-5, step_size=500),
+ lambda opt: ReduceLROnPlateau(opt, threshold=1e-4)]
+ )
@skipIfRocm
def test_adamax(self):
return False
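+
+
+# The Legacy* classes below reimplement the closed-form get_lr() formulas;
+# the test_legacy_* cases check that the current schedulers still produce
+# the same learning-rate sequences.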
+class LegacyStepLR(StepLR):
+ def get_lr(self):
+ return [base_lr * self.gamma ** (self.last_epoch // self.step_size)
+ for base_lr in self.base_lrs]
+
+
+class LegacyMultiStepLR(MultiStepLR):
+ def __init__(self, optimizer, milestones, gamma=0.1, last_epoch=-1):
+ self.milestones = sorted(milestones)
+ self.gamma = gamma
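+ # Intentionally skip MultiStepLR.__init__ (milestones and gamma are
+ # set by hand above) and call _LRScheduler.__init__ directly.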
+ super(MultiStepLR, self).__init__(optimizer, last_epoch)
+
+ def get_lr(self):
+ return [base_lr * self.gamma ** bisect_right(self.milestones, self.last_epoch)
+ for base_lr in self.base_lrs]
+
+
+class LegacyExponentialLR(ExponentialLR):
+ def get_lr(self):
+ return [base_lr * self.gamma ** self.last_epoch
+ for base_lr in self.base_lrs]
+
+
+class LegacyCosineAnnealingLR(CosineAnnealingLR):
+ def get_lr(self):
+ return [self.eta_min + (base_lr - self.eta_min) *
+ (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / 2
+ for base_lr in self.base_lrs]
+
+
class TestLRScheduler(TestCase):
def setUp(self):
self.net = SchedulerTestNet()
scheduler = CosineAnnealingLR(self.opt, T_max=epochs, eta_min=eta_min)
self._test(scheduler, targets, epochs)
+ def test_legacy_step_lr(self):
+ scheduler = StepLR(self.opt, gamma=0.1, step_size=3)
+ legacy_scheduler = LegacyStepLR(self.opt, gamma=0.1, step_size=3)
+ self._test_against_legacy(scheduler, legacy_scheduler, 20)
+
+ def test_legacy_multi_step_lr(self):
+ scheduler = MultiStepLR(self.opt, gamma=0.1, milestones=[2, 5, 9])
+ legacy_scheduler = LegacyMultiStepLR(self.opt, gamma=0.1, milestones=[2, 5, 9])
+ self._test_against_legacy(scheduler, legacy_scheduler, 20)
+
+ def test_legacy_exp_lr(self):
+ scheduler = ExponentialLR(self.opt, gamma=0.9)
+ legacy_scheduler = LegacyExponentialLR(self.opt, gamma=0.9)
+ self._test_against_legacy(scheduler, legacy_scheduler, 20)
+
+ def test_legacy_cos_anneal_lr(self):
+ eta_min = 1e-10
+ epochs = 20
+ scheduler = CosineAnnealingLR(self.opt, T_max=epochs, eta_min=eta_min)
+ legacy_scheduler = LegacyCosineAnnealingLR(self.opt, T_max=epochs, eta_min=eta_min)
+ self._test_against_legacy(scheduler, legacy_scheduler, epochs)
+
def test_reduce_lr_on_plateau1(self):
epochs = 10
for param_group in self.opt.param_groups:
threshold=0.1, patience=5, cooldown=5)
self._test_reduce_lr_on_plateau(scheduler, targets, metrics, epochs)
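+
+ # The compound tests chain two or more schedulers on one optimizer and
+ # check that the lr at each epoch reflects every scheduler's decay.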
+ def test_compound_step_and_multistep_lr(self):
+ epochs = 10
+ schedulers = [None] * 2
+ schedulers[0] = StepLR(self.opt, gamma=0.1, step_size=3)
+ schedulers[1] = MultiStepLR(self.opt, gamma=0.1, milestones=[2, 5, 9])
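+ # StepLR decays 10x at epochs 3, 6, 9; MultiStepLR at epochs 2, 5, 9;
+ # the targets combine both decays.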
+ targets = [[0.05] * 2 + [0.005] * 1 + [5e-4] * 2 + [5e-5] + [5e-6] * 3 + [5e-8]]
+ self._test(schedulers, targets, epochs)
+
+ def test_compound_step_and_exp_lr(self):
+ epochs = 10
+ schedulers = [None] * 2
+ single_targets = [0.05 * (0.9 ** x) for x in range(3)]
+ single_targets += [0.005 * (0.9 ** x) for x in range(3, 6)]
+ single_targets += [0.0005 * (0.9 ** x) for x in range(6, 9)]
+ single_targets += [0.00005 * (0.9 ** x) for x in range(9, 12)]
+ # The second param group's lr starts at 10x the first, so its targets scale by 10.
+ targets = [single_targets, [x * 10 for x in single_targets]]
+ schedulers[0] = StepLR(self.opt, gamma=0.1, step_size=3)
+ schedulers[1] = ExponentialLR(self.opt, gamma=0.9)
+ self._test(schedulers, targets, epochs)
+
+ def test_compound_exp_and_multistep_lr(self):
+ epochs = 10
+ schedulers = [None] * 2
+ single_targets = [0.05 * (0.9 ** x) for x in range(2)]
+ single_targets += [0.005 * (0.9 ** x) for x in range(2, 5)]
+ single_targets += [0.0005 * (0.9 ** x) for x in range(5, 9)]
+ single_targets += [0.00005 * (0.9 ** x) for x in range(9, 11)]
+ targets = [single_targets, [x * 10 for x in single_targets]]  # second group: 10x lr
+ schedulers[0] = MultiStepLR(self.opt, gamma=0.1, milestones=[2, 5, 9])
+ schedulers[1] = ExponentialLR(self.opt, gamma=0.9)
+ self._test(schedulers, targets, epochs)
+
+ def test_compound_cosanneal_and_step_lr(self):
+ epochs = 10
+ eta_min = 1e-10
+ single_targets = [eta_min + (0.05 - eta_min) *
+ (1 + math.cos(math.pi * x / epochs)) / 2
+ for x in range(epochs)]
+ single_targets = [x * 0.1 ** (i // 3) for i, x in enumerate(single_targets)]
+ targets = [single_targets, [x * 10 for x in single_targets]]  # second group: 10x lr
+ schedulers = [None] * 2
+ schedulers[0] = CosineAnnealingLR(self.opt, T_max=epochs, eta_min=eta_min)
+ schedulers[1] = StepLR(self.opt, gamma=0.1, step_size=3)
+ self._test(schedulers, targets, epochs)
+
+ def test_compound_cosanneal_and_multistep_lr(self):
+ epochs = 10
+ eta_min = 1e-10
+ single_targets = [eta_min + (0.05 - eta_min) *
+ (1 + math.cos(math.pi * x / epochs)) / 2
+ for x in range(epochs)]
+ multipliers = [1] * 2 + [0.1] * 3 + [0.01] * 4 + [0.001]
+ single_targets = [x * y for x, y in zip(single_targets, multipliers)]
+ targets = [single_targets, [x * 10 for x in single_targets]]  # second group: 10x lr
+ schedulers = [None] * 2
+ schedulers[0] = CosineAnnealingLR(self.opt, T_max=epochs, eta_min=eta_min)
+ schedulers[1] = MultiStepLR(self.opt, gamma=0.1, milestones=[2, 5, 9])
+ self._test(schedulers, targets, epochs)
+
+ def test_compound_cosanneal_and_exp_lr(self):
+ epochs = 10
+ eta_min = 1e-10
+ single_targets = [eta_min + (0.05 - eta_min) *
+ (1 + math.cos(math.pi * x / epochs)) / 2
+ for x in range(epochs)]
+ multipliers = [0.1 ** i for i in range(epochs)]
+ single_targets = [x * y for x, y in zip(single_targets, multipliers)]
+ targets = [single_targets, [x * 10 for x in single_targets]]  # second group: 10x lr
+ schedulers = [None] * 2
+ schedulers[0] = CosineAnnealingLR(self.opt, T_max=epochs, eta_min=eta_min)
+ schedulers[1] = ExponentialLR(self.opt, gamma=0.1)
+ self._test(schedulers, targets, epochs)
+
+ def test_compound_reduce_lr_on_plateau1(self):
+ epochs = 10
+ for param_group in self.opt.param_groups:
+ param_group['lr'] = 0.5
+ single_targets = [0.5] * 20
+ multipliers = [0.1 ** (i // 3) for i in range(20)]
+ single_targets = [x * y for x, y in zip(multipliers, single_targets)]
+ targets = [single_targets]
+ metrics = [10 - i * 0.0167 for i in range(20)]
+ schedulers = [None, None]
+ schedulers[0] = ReduceLROnPlateau(self.opt, threshold_mode='abs', mode='min',
+ threshold=0.01, patience=5, cooldown=5)
+ schedulers[1] = StepLR(self.opt, gamma=0.1, step_size=3)
+ self._test_reduce_lr_on_plateau(schedulers, targets, metrics, epochs)
+
+ def test_compound_reduce_lr_on_plateau2(self):
+ epochs = 22
+ for param_group in self.opt.param_groups:
+ param_group['lr'] = 0.5
+ single_targets = [0.5] * 6 + [0.05] * 7 + [0.005] * 7 + [0.0005] * 2
+ multipliers = [1] * 3 + [0.1] * 5 + [0.01] * 4 + [0.001] * 10
+ single_targets = [x * y for x, y in zip(single_targets, multipliers)]
+ targets = [single_targets]
+ metrics = [10 - i * 0.0165 for i in range(22)]
+ schedulers = [None] * 2
+ schedulers[0] = ReduceLROnPlateau(self.opt, patience=5, cooldown=0, threshold_mode='abs',
+ mode='min', threshold=0.1)
+ schedulers[1] = MultiStepLR(self.opt, gamma=0.1, milestones=[3, 8, 12])
+ self._test_reduce_lr_on_plateau(schedulers, targets, metrics, epochs)
+
+ def test_compound_reduce_lr_on_plateau3(self):
+ epochs = 22
+ for param_group in self.opt.param_groups:
+ param_group['lr'] = 0.5
+ single_targets = [0.5] * (2 + 6) + [0.05] * (5 + 6) + [0.005] * 4
+ multipliers = [0.1 ** i for i in range(epochs)]
+ single_targets = [x * y for x, y in zip(multipliers, single_targets)]
+ targets = [single_targets]
+ metrics = [-0.8] * 2 + [-0.234] * 20
+ schedulers = [None, None]
+ schedulers[0] = ReduceLROnPlateau(self.opt, mode='max', patience=5, cooldown=5,
+ threshold_mode='abs')
+ schedulers[1] = ExponentialLR(self.opt, gamma=0.1)
+ self._test_reduce_lr_on_plateau(schedulers, targets, metrics, epochs)
+
+ def test_compound_reduce_lr_on_plateau4(self):
+ epochs = 10
+ for param_group in self.opt.param_groups:
+ param_group['lr'] = 0.05
+ eta_min = 1e-10
+ single_targets = [eta_min + (0.05 - eta_min) *
+ (1 + math.cos(math.pi * x / epochs)) / 2
+ for x in range(epochs)]
+ targets = [single_targets]
+ metrics = [1.5 * (1.025 ** i) for i in range(epochs)]  # 1.025 > 1.1**0.25
+ schedulers = [None, None]
+ schedulers[0] = ReduceLROnPlateau(self.opt, mode='max', patience=3,
+ threshold_mode='rel', threshold=0.1)
+ schedulers[1] = CosineAnnealingLR(self.opt, epochs, eta_min)
+ self._test_reduce_lr_on_plateau(schedulers, targets, metrics, epochs)
+
def test_lambda_lr(self):
epochs = 10
self.opt.param_groups[0]['lr'] = 0.05
self.assertAlmostEqual(scheduler.__dict__[key], scheduler_copy.__dict__[key])
self.assertAlmostEqual(scheduler.get_lr(), scheduler_copy.get_lr())
- def _test(self, scheduler, targets, epochs=10):
+ def _test(self, schedulers, targets, epochs=10):
+ if isinstance(schedulers, _LRScheduler):
+ schedulers = [schedulers]
for epoch in range(epochs):
- scheduler.step(epoch)
+ for scheduler in schedulers:
+ scheduler.step(epoch)
for param_group, target in zip(self.opt.param_groups, targets):
self.assertAlmostEqual(target[epoch], param_group['lr'],
msg='LR is wrong in epoch {}: expected {}, got {}'.format(
epoch, target[epoch], param_group['lr']), delta=1e-5)
- def _test_reduce_lr_on_plateau(self, scheduler, targets, metrics, epochs=10, verbose=False):
+ def _test_against_legacy(self, scheduler, legacy_scheduler, epochs=10):
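+ # Record the schedule produced by the legacy implementation, then reset
+ # the fixture and replay it with the scheduler under test.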
+ self.setUp()
+ targets = []
+ for epoch in range(epochs):
+ legacy_scheduler.step(epoch)
+ targets.append([group['lr'] for group in self.opt.param_groups])
+ self.setUp()
for epoch in range(epochs):
- scheduler.step(metrics[epoch])
+ scheduler.step(epoch)
+ for i, param_group in enumerate(self.opt.param_groups):
+ self.assertAlmostEqual(targets[epoch][i], param_group['lr'],
+ msg='LR is wrong in epoch {}: expected {}, got {}'.format(
+ epoch, targets[epoch][i], param_group['lr']), delta=1e-5)
+
+ def _test_reduce_lr_on_plateau(self, schedulers, targets, metrics, epochs=10, verbose=False):
+ if isinstance(schedulers, (_LRScheduler, ReduceLROnPlateau)):
+ schedulers = [schedulers]
+ for epoch in range(epochs):
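+ # ReduceLROnPlateau consumes the validation metric; the epoch-based
+ # schedulers take the epoch index.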
+ for scheduler in schedulers:
+ if isinstance(scheduler, ReduceLROnPlateau):
+ scheduler.step(metrics[epoch])
+ else:
+ scheduler.step(epoch)
if verbose:
print('epoch{}:\tlr={}'.format(epoch, self.opt.param_groups[0]['lr']))
for param_group, target in zip(self.opt.param_groups, targets):
msg='LR is wrong in epoch {}: expected {}, got {}'.format(
epoch, target[epoch], param_group['lr']), delta=1e-5)
-
if __name__ == '__main__':
run_tests()