DL-Art-School/codes/train.py

import os
import math
import argparse
import random
import logging
import shutil
from tqdm import tqdm

import torch
from data.data_sampler import DistIterSampler

import options.options as option
from utils import util
from data import create_dataloader, create_dataset
from models import create_model
from time import time


def init_dist(backend='nccl', **kwargs):
    """Initialize the process group for distributed training.

    Forces the multiprocessing start method to 'spawn' (unless it is already
    'spawn'), binds this process to the CUDA device ``RANK % device_count``
    and then calls ``torch.distributed.init_process_group``.

    Args:
        backend: Name of the ``torch.distributed`` backend to use.
        **kwargs: Forwarded unchanged to ``init_process_group``.

    Raises:
        KeyError: If the ``RANK`` environment variable is not set.
        ZeroDivisionError: If ``torch.cuda.device_count()`` returns 0.
    """
    # These packages have globals that screw with Windows, so only import them if needed.
    import torch.distributed as dist
    import torch.multiprocessing as mp

    if mp.get_start_method(allow_none=True) != 'spawn':
        mp.set_start_method('spawn')
    rank = int(os.environ['RANK'])
    num_gpus = torch.cuda.device_count()
    # Ranks are mapped onto the visible devices round-robin.
    torch.cuda.set_device(rank % num_gpus)
    dist.init_process_group(backend=backend, **kwargs)

def _fetch_colab_files(opt):
    """Download the files a colab-mode run needs from the remote SSH server.

    First fetches a ``TEST`` file from each of the ``training_state``,
    ``models`` and ``val_images`` directories under ``opt['remote_path']``,
    then fetches the resume state and pretrained G/D models when they are
    configured in ``opt['path']``.
    """
    ssh_args = (opt['ssh_server'], opt['ssh_username'], opt['ssh_password'])
    remote_root = opt['remote_path']

    # Check the configuration of the remote server. Expect models, resume_state, and val_images
    # directories to be there. Each one should have a TEST file in it.
    for subdir in ('training_state', 'models', 'val_images'):
        util.get_files_from_server(*ssh_args, os.path.join(remote_root, subdir, "TEST"))

    # Load the state and models needed from the remote server. `.get` is used because `opt` has
    # not been converted to a NoneDict yet, so a missing optional key would raise KeyError.
    paths = opt['path']
    if paths.get('resume_state'):
        util.get_files_from_server(*ssh_args,
                                   os.path.join(remote_root, 'training_state', paths['resume_state']))
    for key in ('pretrain_model_G', 'pretrain_model_D'):
        if paths.get(key):
            util.get_files_from_server(*ssh_args, os.path.join(remote_root, 'models', paths[key]))


def _run_validation(model, val_loader, opt, current_step, colab_mode, logger, tb_logger):
    """Run one validation pass, save the output images and log the mean feature loss.

    Args:
        model: Trainer object created by ``create_model``.
        val_loader: Loader built for the 'val' dataset.
        opt: Parsed options (NoneDict).
        current_step: Current training iteration; used in file names and as the log step.
        colab_mode: When truthy, saved images are also copied to the remote server.
        logger: The 'base' logger.
        tb_logger: SummaryWriter, or None when tensorboard logging is disabled.
    """
    model.force_restore_swapout()
    val_opt = opt['datasets']['val']
    val_batch_sz = val_opt['batch_size'] if 'batch_size' in val_opt else 1
    # does not support multi-GPU validation
    # The progress bar is still constructed (its update call is disabled); the object is unused.
    util.ProgressBar(len(val_loader) * val_batch_sz)

    avg_psnr = 0.  # PSNR is deliberately not computed any more; logged as 0 to keep the log format.
    total_fea_loss = 0.
    num_images = 0
    last_img_base_name = None
    colab_imgs_to_copy = []

    for val_data in val_loader:
        # One forward pass per batch; every image in the batch reads from the same visuals.
        model.feed_data(val_data)
        model.test()
        visuals = model.get_current_visuals()

        for b, lq_path in enumerate(val_data['LQ_path']):
            img_name = os.path.splitext(os.path.basename(lq_path))[0]
            img_dir = os.path.join(opt['path']['val_images'], img_name)
            util.mkdir(img_dir)

            if visuals is None:
                continue

            sr_img = util.tensor2img(visuals['rlt'][b])  # uint8

            # Save SR images for reference
            last_img_base_name = '{:s}_{:d}.png'.format(img_name, current_step)
            save_img_path = os.path.join(img_dir, last_img_base_name)
            util.save_img(sr_img, save_img_path)
            if colab_mode:
                colab_imgs_to_copy.append(save_img_path)

            # calculate fea loss
            total_fea_loss += model.compute_fea_loss(visuals['rlt'][b], visuals['GT'][b])
            num_images += 1

    # Only upload when something was saved, which also guarantees last_img_base_name is set.
    if colab_mode and colab_imgs_to_copy:
        # NOTE(review): the destination path ends in the *last* image's file name, as in the
        # original code - confirm this is what copy_files_to_server expects.
        util.copy_files_to_server(opt['ssh_server'], opt['ssh_username'], opt['ssh_password'],
                                  colab_imgs_to_copy,
                                  os.path.join(opt['remote_path'], 'val_images', last_img_base_name))

    # The loss is accumulated per image, so average over images (not batches) and
    # avoid dividing by zero when nothing was evaluated.
    avg_fea_loss = total_fea_loss / num_images if num_images else 0.

    # log
    logger.info('# Validation # PSNR: {:.4e} Fea: {:.4e}'.format(avg_psnr, avg_fea_loss))
    # tensorboard logger
    if tb_logger is not None:
        tb_logger.add_scalar('val_fea', avg_fea_loss, current_step)


def main():
    """Entry point of the training script.

    Parses the command line and the option YAML file, optionally pulls files
    from a remote server (colab mode), sets up distributed training, loggers,
    random seed, datasets and the model, then runs the training loop with
    periodic logging, checkpointing and validation. On rank <= 0 the final
    model is saved as 'latest' when the loop ends.

    Raises:
        NotImplementedError: If ``opt['datasets']`` has a phase other than 'train' or 'val'.
        ValueError: If no 'train' dataset is configured.
    """
    #### options
    parser = argparse.ArgumentParser()
    parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_exd_imgset_ssgr.yml')
    parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    opt = option.parse(args.opt, is_train=True)

    colab_mode = opt.get('colab_mode', False)
    if colab_mode:
        _fetch_colab_files(opt)

    #### distributed training settings
    if args.launcher == 'none':  # disabled distributed training
        opt['dist'] = False
        rank = -1
        print('Disabled distributed training.')
        if torch.cuda.device_count() > 1:
            # gpu_ids is shown with %s: the value is replaced by a list below, and %i cannot
            # format a list.
            gpu = input('I noticed you have multiple GPUs. Starting two jobs on the same GPU sucks. Please confirm which GPU '
                        'you want to use. Press enter to use the specified one [%s]' % (opt['gpu_ids'],))
            gpu = gpu.strip()
            if gpu:
                opt['gpu_ids'] = [int(gpu)]
    else:
        opt['dist'] = True
        init_dist()
        world_size = torch.distributed.get_world_size()
        rank = torch.distributed.get_rank()

    #### loading resume state if exists
    if opt['path'].get('resume_state', None):
        # distributed resuming: all load into default GPU
        device_id = torch.cuda.current_device()
        resume_state = torch.load(opt['path']['resume_state'],
                                  map_location=lambda storage, loc: storage.cuda(device_id))
        option.check_resume(opt, resume_state['iter'])  # check resume options
    else:
        resume_state = None

    #### mkdir and loggers
    # Both stay None unless this is rank <= 0 AND tensorboard logging is enabled; every later
    # use is guarded on that so other configurations do not hit unbound names.
    tb_logger = None
    tb_logger_path = None
    if rank <= 0:  # normal training (rank -1) OR distributed training (rank 0)
        if resume_state is None:
            util.mkdir_and_rename(
                opt['path']['experiments_root'])  # rename experiment folder if exists
            util.mkdirs((path for key, path in opt['path'].items() if not key == 'experiments_root' and path is not None
                         and 'pretrain_model' not in key and 'resume' not in key))

        # config loggers. Before it, the log will not work
        util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'], level=logging.INFO,
                          screen=True, tofile=True)
        logger = logging.getLogger('base')
        logger.info(option.dict2str(opt))
        # tensorboard logger
        if opt.get('use_tb_logger') and 'debug' not in opt['name']:
            tb_logger_path = os.path.join(opt['path']['experiments_root'], 'tb_logger')
            version = float(torch.__version__[0:3])
            if version >= 1.1:  # PyTorch 1.1
                from torch.utils.tensorboard import SummaryWriter
            else:
                logger.info(
                    'You are using PyTorch {}. Tensorboard will use [tensorboardX]'.format(version))
                from tensorboardX import SummaryWriter
            tb_logger = SummaryWriter(log_dir=tb_logger_path)
    else:
        util.setup_logger('base', opt['path']['log'], 'train', level=logging.INFO, screen=True)
        logger = logging.getLogger('base')

    # convert to NoneDict, which returns None for missing keys
    opt = option.dict_to_nonedict(opt)

    #### random seed
    seed = opt['train']['manual_seed']
    if seed is None:
        seed = random.randint(1, 10000)
    if rank <= 0:
        logger.info('Random seed: {}'.format(seed))
    util.set_random_seed(seed)

    torch.backends.cudnn.benchmark = True
    # torch.backends.cudnn.deterministic = True
    # torch.autograd.set_detect_anomaly(True)

    #### create train and val dataloader
    dataset_ratio = 1  # enlarge the size of each epoch
    train_loader = None
    train_sampler = None
    val_loader = None
    for phase, dataset_opt in opt['datasets'].items():
        if phase == 'train':
            train_set = create_dataset(dataset_opt)
            train_size = int(math.ceil(len(train_set) / dataset_opt['batch_size']))
            total_iters = int(opt['train']['niter'])
            total_epochs = int(math.ceil(total_iters / train_size))
            if opt['dist']:
                train_sampler = DistIterSampler(train_set, world_size, rank, dataset_ratio)
                total_epochs = int(math.ceil(total_iters / (train_size * dataset_ratio)))
            else:
                train_sampler = None
            train_loader = create_dataloader(train_set, dataset_opt, opt, train_sampler)
            if rank <= 0:
                logger.info('Number of train images: {:,d}, iters: {:,d}'.format(
                    len(train_set), train_size))
                logger.info('Total epochs needed: {:d} for iters {:,d}'.format(
                    total_epochs, total_iters))
        elif phase == 'val':
            val_set = create_dataset(dataset_opt)
            val_loader = create_dataloader(val_set, dataset_opt, opt, None)
            if rank <= 0:
                logger.info('Number of val images in [{:s}]: {:d}'.format(
                    dataset_opt['name'], len(val_set)))
        else:
            raise NotImplementedError('Phase [{:s}] is not recognized.'.format(phase))
    if train_loader is None:
        raise ValueError("No 'train' dataset is configured; cannot start training.")

    #### create model
    model = create_model(opt)

    #### resume training
    if resume_state:
        logger.info('Resuming training from epoch: {}, iter: {}.'.format(
            resume_state['epoch'], resume_state['iter']))

        start_epoch = resume_state['epoch']
        current_step = resume_state['iter']
        model.resume_training(resume_state, 'amp_opt_level' in opt)  # handle optimizers and schedulers
    else:
        current_step = opt['start_step'] if 'start_step' in opt else -1
        start_epoch = 0

    #### training
    logger.info('Start training from epoch: {:d}, iter: {:d}'.format(start_epoch, current_step))
    for epoch in range(start_epoch, total_epochs + 1):
        if opt['dist']:
            train_sampler.set_epoch(epoch)
        tq_ldr = tqdm(train_loader)

        _t = time()
        _profile = False  # flip to True to print coarse per-stage timings
        for train_data in tq_ldr:
            if _profile:
                print("Data fetch: %f" % (time() - _t))
                _t = time()

            current_step += 1
            if current_step > total_iters:
                break
            #### update learning rate
            model.update_learning_rate(current_step, warmup_iter=opt['train']['warmup_iter'])

            #### training
            if _profile:
                print("Update LR: %f" % (time() - _t))
                _t = time()
            model.feed_data(train_data)
            model.optimize_parameters(current_step)
            if _profile:
                print("Model feed + step: %f" % (time() - _t))
                _t = time()

            #### log
            if current_step % opt['logger']['print_freq'] == 0:
                logs = model.get_current_log(current_step)
                message = '[epoch:{:3d}, iter:{:8,d}, lr:('.format(epoch, current_step)
                for v in model.get_current_learning_rate():
                    message += '{:.3e},'.format(v)
                message += ')] '
                for k, v in logs.items():
                    if 'histogram' in k:
                        if tb_logger is not None:
                            tb_logger.add_histogram(k, v, current_step)
                    else:
                        message += '{:s}: {:.4e} '.format(k, v)
                        # tensorboard logger
                        if tb_logger is not None:
                            tb_logger.add_scalar(k, v, current_step)
                if rank <= 0:
                    logger.info(message)

            #### save models and training states
            # Done before validation so a validation failure does not lose the checkpoint.
            if current_step % opt['logger']['save_checkpoint_freq'] == 0:
                if rank <= 0:
                    logger.info('Saving models and training states.')
                    model.save(current_step)
                    model.save_training_state(epoch, current_step)
                # Only the process that owns the tensorboard directory may mirror it.
                if tb_logger is not None and 'alt_path' in opt['path']:
                    print("Synchronizing tb_logger to alt_path..")
                    alt_tblogger = os.path.join(opt['path']['alt_path'], "tb_logger")
                    shutil.rmtree(alt_tblogger, ignore_errors=True)
                    shutil.copytree(tb_logger_path, alt_tblogger)

            #### validation
            if opt['datasets'].get('val', None) and current_step % opt['train']['val_freq'] == 0:
                if opt['model'] in ['sr', 'srgan', 'corruptgan', 'spsrgan', 'extensibletrainer'] and rank <= 0:  # image restoration validation
                    _run_validation(model, val_loader, opt, current_step, colab_mode, logger, tb_logger)

        # The inner loop only exits with current_step > total_iters via the break above; stop
        # here instead of restarting the data loader for every remaining epoch.
        if current_step > total_iters:
            break

    if rank <= 0:
        logger.info('Saving the final model.')
        model.save('latest')
        logger.info('End of training.')
        if tb_logger is not None:
            tb_logger.close()


# Run the training entry point only when this file is executed as a script.
if "__main__" == __name__:
    main()
mmsr 2019-08-23 13:42:47 +00:00			`import os`
			`import math`
			`import argparse`
			`import random`
			`import logging`
Clear out tensorboard on job restart. 2020-04-30 17:30:55 +00:00			`import shutil`
Some random fixes/adjustments 2020-04-22 06:38:53 +00:00			`from tqdm import tqdm`
mmsr 2019-08-23 13:42:47 +00:00
			`import torch`
			`from data.data_sampler import DistIterSampler`

			`import options.options as option`
			`from utils import util`
			`from data import create_dataloader, create_dataset`
			`from models import create_model`
Misc 2020-06-18 17:28:55 +00:00			`from time import time`
mmsr 2019-08-23 13:42:47 +00:00

			`def init_dist(backend='nccl', **kwargs):`
Move train imports into init_dist 2020-07-02 21:11:21 +00:00			`# These packages have globals that screw with Windows, so only import them if needed.`
			`import torch.distributed as dist`
			`import torch.multiprocessing as mp`

mmsr 2019-08-23 13:42:47 +00:00			`"""initialization for distributed training"""`
			`if mp.get_start_method(allow_none=True) != 'spawn':`
			`mp.set_start_method('spawn')`
			`rank = int(os.environ['RANK'])`
			`num_gpus = torch.cuda.device_count()`
			`torch.cuda.set_device(rank % num_gpus)`
			`dist.init_process_group(backend=backend, **kwargs)`

			`def main():`
			`#### options`
			`parser = argparse.ArgumentParser()`
Prompt user for gpu_id if multiple gpus are detected 2020-10-01 23:24:50 +00:00			`parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_exd_imgset_ssgr.yml')`
More dataset integration work 2020-09-26 04:19:38 +00:00			`parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher')`
mmsr 2019-08-23 13:42:47 +00:00			`parser.add_argument('--local_rank', type=int, default=0)`
			`args = parser.parse_args()`
			`opt = option.parse(args.opt, is_train=True)`

Introduce (untested) colab mode 2020-06-01 21:09:52 +00:00			`colab_mode = False if 'colab_mode' not in opt.keys() else opt['colab_mode']`
			`if colab_mode:`
			`# Check the configuration of the remote server. Expect models, resume_state, and val_images directories to be there.`
			`# Each one should have a TEST file in it.`
			`util.get_files_from_server(opt['ssh_server'], opt['ssh_username'], opt['ssh_password'],`
			`os.path.join(opt['remote_path'], 'training_state', "TEST"))`
			`util.get_files_from_server(opt['ssh_server'], opt['ssh_username'], opt['ssh_password'],`
			`os.path.join(opt['remote_path'], 'models', "TEST"))`
			`util.get_files_from_server(opt['ssh_server'], opt['ssh_username'], opt['ssh_password'],`
			`os.path.join(opt['remote_path'], 'val_images', "TEST"))`
			`# Load the state and models needed from the remote server.`
			`if opt['path']['resume_state']:`
			`util.get_files_from_server(opt['ssh_server'], opt['ssh_username'], opt['ssh_password'], os.path.join(opt['remote_path'], 'training_state', opt['path']['resume_state']))`
			`if opt['path']['pretrain_model_G']:`
			`util.get_files_from_server(opt['ssh_server'], opt['ssh_username'], opt['ssh_password'], os.path.join(opt['remote_path'], 'models', opt['path']['pretrain_model_G']))`
			`if opt['path']['pretrain_model_D']:`
			`util.get_files_from_server(opt['ssh_server'], opt['ssh_username'], opt['ssh_password'], os.path.join(opt['remote_path'], 'models', opt['path']['pretrain_model_D']))`

mmsr 2019-08-23 13:42:47 +00:00			`#### distributed training settings`
			`if args.launcher == 'none': # disabled distributed training`
			`opt['dist'] = False`
			`rank = -1`
			`print('Disabled distributed training.')`
Prompt user for gpu_id if multiple gpus are detected 2020-10-01 23:24:50 +00:00			`if torch.cuda.device_count() > 1:`
			`gpu = input('I noticed you have multiple GPUs. Starting two jobs on the same GPU sucks. Please confirm which GPU'`
			`'you want to use. Press enter to use the specified one [%i]' % (opt['gpu_ids']))`
			`if gpu:`
			`opt['gpu_ids'] = [int(gpu)]`

mmsr 2019-08-23 13:42:47 +00:00			`else:`
			`opt['dist'] = True`
			`init_dist()`
			`world_size = torch.distributed.get_world_size()`
			`rank = torch.distributed.get_rank()`

			`#### loading resume state if exists`
			`if opt['path'].get('resume_state', None):`
			`# distributed resuming: all load into default GPU`
			`device_id = torch.cuda.current_device()`
			`resume_state = torch.load(opt['path']['resume_state'],`
			`map_location=lambda storage, loc: storage.cuda(device_id))`
			`option.check_resume(opt, resume_state['iter']) # check resume options`
			`else:`
			`resume_state = None`

			`#### mkdir and loggers`
			`if rank <= 0: # normal training (rank -1) OR distributed training (rank 0)`
			`if resume_state is None:`
			`util.mkdir_and_rename(`
			`opt['path']['experiments_root']) # rename experiment folder if exists`
More dataset integration work 2020-09-26 04:19:38 +00:00			`util.mkdirs((path for key, path in opt['path'].items() if not key == 'experiments_root' and path is not None`
mmsr 2019-08-23 13:42:47 +00:00			`and 'pretrain_model' not in key and 'resume' not in key))`

			`# config loggers. Before it, the log will not work`
			`util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'], level=logging.INFO,`
			`screen=True, tofile=True)`
			`logger = logging.getLogger('base')`
			`logger.info(option.dict2str(opt))`
			`# tensorboard logger`
			`if opt['use_tb_logger'] and 'debug' not in opt['name']:`
Log tensorboard directly into experiments directory 2020-06-18 17:33:02 +00:00			`tb_logger_path = os.path.join(opt['path']['experiments_root'], 'tb_logger')`
mmsr 2019-08-23 13:42:47 +00:00			`version = float(torch.__version__[0:3])`
			`if version >= 1.1: # PyTorch 1.1`
			`from torch.utils.tensorboard import SummaryWriter`
			`else:`
			`logger.info(`
			`'You are using PyTorch {}. Tensorboard will use [tensorboardX]'.format(version))`
			`from tensorboardX import SummaryWriter`
Clear out tensorboard on job restart. 2020-04-30 17:30:55 +00:00			`tb_logger = SummaryWriter(log_dir=tb_logger_path)`
mmsr 2019-08-23 13:42:47 +00:00			`else:`
			`util.setup_logger('base', opt['path']['log'], 'train', level=logging.INFO, screen=True)`
			`logger = logging.getLogger('base')`

			`# convert to NoneDict, which returns None for missing keys`
			`opt = option.dict_to_nonedict(opt)`

			`#### random seed`
			`seed = opt['train']['manual_seed']`
			`if seed is None:`
			`seed = random.randint(1, 10000)`
			`if rank <= 0:`
			`logger.info('Random seed: {}'.format(seed))`
			`util.set_random_seed(seed)`

			`torch.backends.cudnn.benchmark = True`
			`# torch.backends.cudnn.deterministic = True`
Fixes to the spsr3 Some lessons learned: - Biases are fairly important as a relief valve. They dont need to be everywhere, but most computationally heavy branches should have a bias. - GroupNorm in SPSR is not a great idea. Since image gradients are represented in this model, normal means and standard deviations are not applicable. (imggrad has a high representation of 0). - Don't fuck with the mainline of any generative model. As much as possible, all additions should be done through residual connections. Never pollute the mainline with reference data, do that in branches. It basically leaves the mode untrainable. 2020-09-09 21:28:14 +00:00			`# torch.autograd.set_detect_anomaly(True)`
mmsr 2019-08-23 13:42:47 +00:00
			`#### create train and val dataloader`
Fix distributed launch for large distributed runs 2020-08-25 21:42:59 +00:00			`dataset_ratio = 1 # enlarge the size of each epoch`
mmsr 2019-08-23 13:42:47 +00:00			`for phase, dataset_opt in opt['datasets'].items():`
			`if phase == 'train':`
			`train_set = create_dataset(dataset_opt)`
			`train_size = int(math.ceil(len(train_set) / dataset_opt['batch_size']))`
			`total_iters = int(opt['train']['niter'])`
			`total_epochs = int(math.ceil(total_iters / train_size))`
			`if opt['dist']:`
			`train_sampler = DistIterSampler(train_set, world_size, rank, dataset_ratio)`
			`total_epochs = int(math.ceil(total_iters / (train_size * dataset_ratio)))`
			`else:`
			`train_sampler = None`
			`train_loader = create_dataloader(train_set, dataset_opt, opt, train_sampler)`
			`if rank <= 0:`
			`logger.info('Number of train images: {:,d}, iters: {:,d}'.format(`
			`len(train_set), train_size))`
			`logger.info('Total epochs needed: {:d} for iters {:,d}'.format(`
			`total_epochs, total_iters))`
			`elif phase == 'val':`
			`val_set = create_dataset(dataset_opt)`
			`val_loader = create_dataloader(val_set, dataset_opt, opt, None)`
			`if rank <= 0:`
			`logger.info('Number of val images in [{:s}]: {:d}'.format(`
			`dataset_opt['name'], len(val_set)))`
			`else:`
			`raise NotImplementedError('Phase [{:s}] is not recognized.'.format(phase))`
			`assert train_loader is not None`

			`#### create model`
			`model = create_model(opt)`

			`#### resume training`
			`if resume_state:`
			`logger.info('Resuming training from epoch: {}, iter: {}.'.format(`
			`resume_state['epoch'], resume_state['iter']))`

			`start_epoch = resume_state['epoch']`
			`current_step = resume_state['iter']`
Misc changes 2020-09-16 02:57:59 +00:00			`model.resume_training(resume_state, 'amp_opt_level' in opt.keys()) # handle optimizers and schedulers`
mmsr 2019-08-23 13:42:47 +00:00			`else:`
Enable start_step to be specified 2020-08-16 00:34:59 +00:00			`current_step = -1 if 'start_step' not in opt.keys() else opt['start_step']`
mmsr 2019-08-23 13:42:47 +00:00			`start_epoch = 0`

			`#### training`
			`logger.info('Start training from epoch: {:d}, iter: {:d}'.format(start_epoch, current_step))`
			`for epoch in range(start_epoch, total_epochs + 1):`
			`if opt['dist']:`
			`train_sampler.set_epoch(epoch)`
Some random fixes/adjustments 2020-04-22 06:38:53 +00:00			`tq_ldr = tqdm(train_loader)`
Misc 2020-06-18 17:28:55 +00:00
			`_t = time()`
			`_profile = False`
Fixes to the spsr3 Some lessons learned: - Biases are fairly important as a relief valve. They dont need to be everywhere, but most computationally heavy branches should have a bias. - GroupNorm in SPSR is not a great idea. Since image gradients are represented in this model, normal means and standard deviations are not applicable. (imggrad has a high representation of 0). - Don't fuck with the mainline of any generative model. As much as possible, all additions should be done through residual connections. Never pollute the mainline with reference data, do that in branches. It basically leaves the mode untrainable. 2020-09-09 21:28:14 +00:00			`for train_data in tq_ldr:`
Misc 2020-06-18 17:28:55 +00:00			`if _profile:`
			`print("Data fetch: %f" % (time() - _t))`
			`_t = time()`

Save models before validation Validation often fails with OOM, wasting hours of training time. Save models first. 2020-09-16 14:17:17 +00:00			`#tb_logger.add_graph(model.netsG['generator'].module, [train_data['LQ'].to('cuda'),`
			`# train_data['lq_fullsize_ref'].float().to('cuda'),`
			`# train_data['lq_center'].to('cuda')])`

mmsr 2019-08-23 13:42:47 +00:00			`current_step += 1`
			`if current_step > total_iters:`
			`break`
			`#### update learning rate`
			`model.update_learning_rate(current_step, warmup_iter=opt['train']['warmup_iter'])`

			`#### training`
Misc 2020-06-18 17:28:55 +00:00			`if _profile:`
			`print("Update LR: %f" % (time() - _t))`
			`_t = time()`
mmsr 2019-08-23 13:42:47 +00:00			`model.feed_data(train_data)`
			`model.optimize_parameters(current_step)`
Misc 2020-06-18 17:28:55 +00:00			`if _profile:`
			`print("Model feed + step: %f" % (time() - _t))`
			`_t = time()`
mmsr 2019-08-23 13:42:47 +00:00
			`#### log`
			`if current_step % opt['logger']['print_freq'] == 0:`
Add doResizeLoss to dataset doResizeLoss has a 50% chance to resize the LQ image to 50% size, then back to original size. This is useful to training a generator to recover these lost pixel values while also being able to do repairs on higher resolution images during training. 2020-06-08 17:27:06 +00:00			`logs = model.get_current_log(current_step)`
mmsr 2019-08-23 13:42:47 +00:00			`message = '[epoch:{:3d}, iter:{:8,d}, lr:('.format(epoch, current_step)`
			`for v in model.get_current_learning_rate():`
			`message += '{:.3e},'.format(v)`
			`message += ')] '`
			`for k, v in logs.items():`
Output histograms with SwitchedResidualGenerator This also fixes the initialization weight for the configurable generator. 2020-06-16 21:54:37 +00:00			`if 'histogram' in k:`
Fix illegal tb_logger use in distributed training 2020-07-23 15:14:01 +00:00			`if rank <= 0:`
			`tb_logger.add_histogram(k, v, current_step)`
Output histograms with SwitchedResidualGenerator This also fixes the initialization weight for the configurable generator. 2020-06-16 21:54:37 +00:00			`else:`
			`message += '{:s}: {:.4e} '.format(k, v)`
			`# tensorboard logger`
			`if opt['use_tb_logger'] and 'debug' not in opt['name']:`
			`if rank <= 0:`
			`tb_logger.add_scalar(k, v, current_step)`
mmsr 2019-08-23 13:42:47 +00:00			`if rank <= 0:`
			`logger.info(message)`
Save models before validation Validation often fails with OOM, wasting hours of training time. Save models first. 2020-09-16 14:17:17 +00:00
			`#### save models and training states`
			`if current_step % opt['logger']['save_checkpoint_freq'] == 0:`
			`if rank <= 0:`
			`logger.info('Saving models and training states.')`
			`model.save(current_step)`
			`model.save_training_state(epoch, current_step)`
			`if 'alt_path' in opt['path'].keys():`
			`import shutil`
			`print("Synchronizing tb_logger to alt_path..")`
			`alt_tblogger = os.path.join(opt['path']['alt_path'], "tb_logger")`
			`shutil.rmtree(alt_tblogger, ignore_errors=True)`
			`shutil.copytree(tb_logger_path, alt_tblogger)`

mmsr 2019-08-23 13:42:47 +00:00			`#### validation`
			`if opt['datasets'].get('val', None) and current_step % opt['train']['val_freq'] == 0:`
Fix val behavior for ExtensibleTrainer 2020-08-26 14:44:22 +00:00			`if opt['model'] in ['sr', 'srgan', 'corruptgan', 'spsrgan', 'extensibletrainer'] and rank <= 0: # image restoration validation`
Restore swapout models just before a checkpoint 2020-05-16 13:45:19 +00:00			`model.force_restore_swapout()`
Allow validating in batches, remove val size limit 2020-06-02 14:41:22 +00:00			`val_batch_sz = 1 if 'batch_size' not in opt['datasets']['val'].keys() else opt['datasets']['val']['batch_size']`
mmsr 2019-08-23 13:42:47 +00:00			`# does not support multi-GPU validation`
Allow validating in batches, remove val size limit 2020-06-02 14:41:22 +00:00			`pbar = util.ProgressBar(len(val_loader) * val_batch_sz)`
mmsr 2019-08-23 13:42:47 +00:00			`avg_psnr = 0.`
Add a feature-based validation test 2020-07-03 21:18:57 +00:00			`avg_fea_loss = 0.`
mmsr 2019-08-23 13:42:47 +00:00			`idx = 0`
Introduce (untested) colab mode 2020-06-01 21:09:52 +00:00			`colab_imgs_to_copy = []`
mmsr 2019-08-23 13:42:47 +00:00			`for val_data in val_loader:`
			`idx += 1`
Allow validating in batches, remove val size limit 2020-06-02 14:41:22 +00:00			`for b in range(len(val_data['LQ_path'])):`
			`img_name = os.path.splitext(os.path.basename(val_data['LQ_path'][b]))[0]`
			`img_dir = os.path.join(opt['path']['val_images'], img_name)`
			`util.mkdir(img_dir)`
mmsr 2019-08-23 13:42:47 +00:00
Allow validating in batches, remove val size limit 2020-06-02 14:41:22 +00:00			`model.feed_data(val_data)`
			`model.test()`
mmsr 2019-08-23 13:42:47 +00:00
Allow validating in batches, remove val size limit 2020-06-02 14:41:22 +00:00			`visuals = model.get_current_visuals()`
Add feature_model for training custom feature nets 2020-07-31 17:20:39 +00:00			`if visuals is None:`
			`continue`
Fix skips & images samples - Makes skip connections between the generator and discriminator more extensible by adding additional configuration options for them and supporting 1 and 0 skips. - Places the temp/ directory with sample images from the training process appear in the training directory instead of the codes/ directory. 2020-05-15 19:50:49 +00:00
Allow validating in batches, remove val size limit 2020-06-02 14:41:22 +00:00			`sr_img = util.tensor2img(visuals['rlt'][b]) # uint8`
Fix new feature loss calc 2020-07-04 04:20:13 +00:00			`#gt_img = util.tensor2img(visuals['GT'][b]) # uint8`
mmsr 2019-08-23 13:42:47 +00:00
Allow validating in batches, remove val size limit 2020-06-02 14:41:22 +00:00			`# Save SR images for reference`
			`img_base_name = '{:s}_{:d}.png'.format(img_name, current_step)`
			`save_img_path = os.path.join(img_dir, img_base_name)`
			`util.save_img(sr_img, save_img_path)`
			`if colab_mode:`
			`colab_imgs_to_copy.append(save_img_path)`
mmsr 2019-08-23 13:42:47 +00:00
Add a feature-based validation test 2020-07-03 21:18:57 +00:00			`# calculate PSNR (Naw - don't do that. PSNR sucks)`
			`#sr_img, gt_img = util.crop_border([sr_img, gt_img], opt['scale'])`
			`#avg_psnr += util.calculate_psnr(sr_img, gt_img)`
			`#pbar.update('Test {}'.format(img_name))`

			`# calculate fea loss`
			`avg_fea_loss += model.compute_fea_loss(visuals['rlt'][b], visuals['GT'][b])`
mmsr 2019-08-23 13:42:47 +00:00
Introduce (untested) colab mode 2020-06-01 21:09:52 +00:00			`if colab_mode:`
			`util.copy_files_to_server(opt['ssh_server'], opt['ssh_username'], opt['ssh_password'],`
			`colab_imgs_to_copy,`
			`os.path.join(opt['remote_path'], 'val_images', img_base_name))`

mmsr 2019-08-23 13:42:47 +00:00			`avg_psnr = avg_psnr / idx`
Add a feature-based validation test 2020-07-03 21:18:57 +00:00			`avg_fea_loss = avg_fea_loss / idx`
mmsr 2019-08-23 13:42:47 +00:00
			`# log`
Add a feature-based validation test 2020-07-03 21:18:57 +00:00			`logger.info('# Validation # PSNR: {:.4e} Fea: {:.4e}'.format(avg_psnr, avg_fea_loss))`
mmsr 2019-08-23 13:42:47 +00:00			`# tensorboard logger`
			`if opt['use_tb_logger'] and 'debug' not in opt['name']:`
Fix new feature loss calc 2020-07-04 04:20:13 +00:00			`#tb_logger.add_scalar('val_psnr', avg_psnr, current_step)`
Add a feature-based validation test 2020-07-03 21:18:57 +00:00			`tb_logger.add_scalar('val_fea', avg_fea_loss, current_step)`
mmsr 2019-08-23 13:42:47 +00:00
			`if rank <= 0:`
			`logger.info('Saving the final model.')`
			`model.save('latest')`
			`logger.info('End of training.')`
			`tb_logger.close()`


			`if __name__ == '__main__':`
			`main()`