import math
from collections import Counter
from collections import defaultdict

import torch
from torch.optim.lr_scheduler import _LRScheduler

import torch_intermediary as ml
from utils.util import opt_get


def get_scheduler_for_name(name, optimizers, scheduler_opt):
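    """Construct one learning rate scheduler per optimizer.

    `name` selects the scheduler class ('MultiStepLR', 'ProgressiveMultiStepLR' or
    'CosineAnnealingLR_Restart') and `scheduler_opt` supplies its options
    (milestones, restarts, warmup, etc.). Returns the schedulers in the same
    order as `optimizers`.
    """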
    schedulers = []
    for o in optimizers:
        # Hack to support LARC, which wraps an underlying optimizer.
        if hasattr(o, 'optim'):
            o = o.optim

        if name == 'MultiStepLR':
            sched = MultiStepLR_Restart(o, scheduler_opt['gen_lr_steps'],
                                        restarts=scheduler_opt['restarts'],
                                        weights=scheduler_opt['restart_weights'],
                                        gamma=scheduler_opt['lr_gamma'],
                                        clear_state=scheduler_opt['clear_state'],
                                        force_lr=scheduler_opt['force_lr'],
                                        warmup_steps=opt_get(scheduler_opt, ['warmup_steps'], 0))
        elif name == 'ProgressiveMultiStepLR':
            sched = ProgressiveMultiStepLR(o, scheduler_opt['gen_lr_steps'],
                                           scheduler_opt['progressive_starts'],
                                           scheduler_opt['lr_gamma'])
        elif name == 'CosineAnnealingLR_Restart':
            sched = CosineAnnealingLR_Restart(
                o, scheduler_opt['T_period'], scheduler_opt['warmup'], eta_min=scheduler_opt['eta_min'],
                restarts=scheduler_opt['restarts'], weights=scheduler_opt['restart_weights'])
        else:
            raise NotImplementedError('Scheduler not available')
        schedulers.append(sched)
    return schedulers
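
# A minimal sketch of how this factory might be invoked; `my_optimizer` and the option
# values below are illustrative placeholders, not taken from any real config in this repo:
#
#     sched = get_scheduler_for_name('MultiStepLR', [my_optimizer], {
#         'gen_lr_steps': [50000, 100000], 'restarts': [], 'restart_weights': [],
#         'lr_gamma': 0.5, 'clear_state': False, 'force_lr': None, 'warmup_steps': 500,
#     })[0]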


# This scheduler is specifically designed to modulate the learning rate of several different param groups configured
# by a generator or discriminator that slowly adds new stages one at a time, e.g. progressive growing of GANs.
class ProgressiveMultiStepLR(_LRScheduler):
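    """MultiStep decay where each param group has its own start offset.

    `group_starts` must contain one start step per param group; a group's
    milestones are interpreted relative to its own start, so stages added
    later (as in progressive growing) decay on their own delayed timetable.
    For example, with group_starts=[0, 10000] the second group reaches the
    same milestones 10000 steps after the first.
    """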
    def __init__(self, optimizer, milestones, group_starts, gamma=0.1):
        self.milestones = Counter(milestones)
        self.gamma = gamma
        self.group_starts = group_starts
        super(ProgressiveMultiStepLR, self).__init__(optimizer)

    def get_lr(self):
        group_lrs = []
        assert len(self.optimizer.param_groups) == len(self.group_starts)
        for group, group_start in zip(self.optimizer.param_groups, self.group_starts):
            if self.last_epoch - group_start not in self.milestones:
                group_lrs.append(group['lr'])
            else:
                group_lrs.append(group['lr'] * self.gamma)
        return group_lrs


class MultiStepLR_Restart(_LRScheduler):
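    """Step-based MultiStep decay with optional warmup, restarts and a forced LR.

    The LR is multiplied by `gamma` at every milestone step. When a restart is
    reached, every group is reset to `initial_lr` scaled by the matching entry
    of `weights` (optionally clearing optimizer state). For the first
    `warmup_steps` steps the LR ramps linearly from 0 up to `initial_lr`. If
    `force_lr` is not None, every group is pinned to that value.
    """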
    def __init__(self, optimizer, milestones, restarts=None, weights=None, gamma=0.1,
                 clear_state=False, force_lr=None, last_epoch=-1, warmup_steps=0):
        self.milestones = Counter(milestones)
        self.gamma = gamma
        self.clear_state = clear_state
        self.restarts = restarts if restarts else [0]
        self.restarts = [v + 1 for v in self.restarts]
        self.restart_weights = weights if weights else [1]
        self.force_lr = force_lr
        if force_lr:
            print(f"!!Forcing the learning rate to: {force_lr}")
        self.warmup_steps = warmup_steps
        assert len(self.restarts) == len(
            self.restart_weights), 'restarts and their weights do not match.'
        super(MultiStepLR_Restart, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        # Note to self: for the purposes of this trainer, "last_epoch" should read "last_step"
        if self.force_lr is not None:
            return [self.force_lr for _ in self.optimizer.param_groups]
        if self.last_epoch in self.restarts:
            if self.clear_state:
                self.optimizer.state = defaultdict(dict)
            weight = self.restart_weights[self.restarts.index(self.last_epoch)]
            return [group['initial_lr'] * weight for group in self.optimizer.param_groups]
        if self.last_epoch < self.warmup_steps:
            factor = 1 - (self.warmup_steps - self.last_epoch) / self.warmup_steps
            return [group['initial_lr'] * factor for group in self.optimizer.param_groups]
        if self.last_epoch not in self.milestones:
            return [group['lr'] for group in self.optimizer.param_groups]
        return [
            group['lr'] * self.gamma**self.milestones[self.last_epoch]
            for group in self.optimizer.param_groups
        ]

    # Allow this scheduler to use newly appointed milestones partway through a training run.
    def load_state_dict(self, s):
        milestones_cache = self.milestones
        force_lr_cache = self.force_lr
        super(MultiStepLR_Restart, self).load_state_dict(s)
        self.milestones = milestones_cache
        self.force_lr = force_lr_cache


class CosineAnnealingLR_Restart(_LRScheduler):
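    """SGDR-style cosine annealing with warmup and (optionally weighted) restarts.

    For the first `warmup` steps the base LR is returned unchanged. Afterwards the
    LR decays from the (restart-weighted) base LR down to `eta_min` following,
    roughly,
        lr = eta_min + (base_lr - eta_min) * (1 + cos(pi * t / T_max)) / 2,
    where t counts steps since the last restart and T_max is the current cycle
    length. `T_period` supplies the length of each successive cycle (one more
    entry than `restarts`). The update below is the chained form of this
    schedule, analogous to torch.optim.lr_scheduler.CosineAnnealingLR.
    """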
    def __init__(self, optimizer, T_period, warmup=0, restarts=None, weights=None, eta_min=0, last_epoch=-1):
        self.warmup = warmup
        self.T_period = T_period
        self.T_max = self.T_period[0]  # current T period
        self.eta_min = eta_min
        self.restarts = restarts if restarts else [0]
        self.restarts = [v + 1 for v in self.restarts]
        self.restart_weights = weights if weights else [1]
        self.last_restart = 0
        assert len(self.restarts) == len(
            self.restart_weights), 'restarts and their weights do not match.'
        super(CosineAnnealingLR_Restart, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        step = self.last_epoch - self.warmup
        if step <= 0:
            return self.base_lrs
        elif step in self.restarts:
            self.last_restart = step
            self.T_max = self.T_period[self.restarts.index(step) + 1]
            weight = self.restart_weights[self.restarts.index(step)]
            return [group['initial_lr'] * weight for group in self.optimizer.param_groups]
        elif (step - self.last_restart - 1 - self.T_max) % (2 * self.T_max) == 0:
            return [
                group['lr'] + (base_lr - self.eta_min) * (1 - math.cos(math.pi / self.T_max)) / 2
                for base_lr, group in zip(self.base_lrs, self.optimizer.param_groups)
            ]
        return [(1 + math.cos(math.pi * (step - self.last_restart) / self.T_max)) /
                (1 + math.cos(math.pi * ((step - self.last_restart) - 1) / self.T_max)) *
                (group['lr'] - self.eta_min) + self.eta_min
                for group in self.optimizer.param_groups]


if __name__ == "__main__":
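    # Visual sanity check: build an optimizer over a dummy tensor, step the chosen
    # scheduler for N_iter iterations, and plot the resulting learning rate curve.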
    # torch.optim.Adam
    optimizer = ml.Adam([torch.zeros(3, 64, 3, 3)], lr=1e-4, weight_decay=0,
                        betas=(0.9, 0.99))
    ##############################
    # MultiStepLR_Restart
    ##############################
    ## Original
    lr_steps = [200000, 400000, 600000, 800000]
    restarts = None
    restart_weights = None

    ## two
    lr_steps = [100000, 200000, 300000, 400000, 490000, 600000, 700000, 800000, 900000, 990000]
    restarts = [500000]
    restart_weights = [1]

    ## four
    lr_steps = [
        50000, 100000, 150000, 200000, 240000, 300000, 350000, 400000, 450000, 490000, 550000,
        600000, 650000, 700000, 740000, 800000, 850000, 900000, 950000, 990000
    ]
    restarts = [250000, 500000, 750000]
    restart_weights = [1, 1, 1]

    scheduler = MultiStepLR_Restart(optimizer, lr_steps, restarts, restart_weights, gamma=0.5,
                                    clear_state=False, warmup_steps=20000)

    '''
    ##############################
    # Cosine Annealing Restart
    ##############################
    ## two
    T_period = [500000, 500000]
    restarts = [500000]
    restart_weights = [1]

    ## four
    T_period = [200000, 100000, 200000]
    restarts = [200000, 300000]
    restart_weights = [.5, .25]

    scheduler = CosineAnnealingLR_Restart(optimizer, T_period, warmup=10000, eta_min=1e-8, restarts=restarts,
                                          weights=restart_weights)
    '''

    ##############################
    # Draw figure
    ##############################
    N_iter = 100000
    lr_l = list(range(N_iter))
    for i in range(N_iter):
        scheduler.step()
        current_lr = optimizer.param_groups[0]['lr']
        lr_l[i] = current_lr

    import matplotlib as mpl
    from matplotlib import pyplot as plt
    import matplotlib.ticker as mtick
    mpl.style.use('default')
    import seaborn
    seaborn.set(style='whitegrid')
    seaborn.set_context('paper')

    plt.figure(1)
    plt.subplot(111)
    plt.ticklabel_format(style='sci', axis='x', scilimits=(0, 0))
    plt.title('Title', fontsize=16, color='k')
    plt.plot(list(range(N_iter)), lr_l, linewidth=1.5, label='learning rate scheme')
    legend = plt.legend(loc='upper right', shadow=False)
    ax = plt.gca()
    labels = ax.get_xticks().tolist()
    for k, v in enumerate(labels):
        labels[k] = str(int(v / 1000)) + 'K'
    ax.set_xticklabels(labels)
    ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.1e'))

    ax.set_ylabel('Learning rate')
    ax.set_xlabel('Iteration')
    fig = plt.gcf()
    plt.show()