forked from mrq/DL-Art-School
Guided diffusion documentation
This commit is contained in:
parent
ae8de0cb9d
commit
730c0135fd
|
@ -1,109 +0,0 @@
|
|||
#### general settings
|
||||
name: train_imgset_rrdb_diffusion
|
||||
model: extensibletrainer
|
||||
scale: 1
|
||||
gpu_ids: [0]
|
||||
start_step: -1
|
||||
checkpointing_enabled: true
|
||||
fp16: false
|
||||
use_tb_logger: true
|
||||
wandb: false
|
||||
|
||||
datasets:
|
||||
train:
|
||||
n_workers: 4
|
||||
batch_size: 32
|
||||
name: div2k
|
||||
mode: single_image_extensible
|
||||
paths: /content/div2k # <-- Put your path here.
|
||||
target_size: 128
|
||||
force_multiple: 1
|
||||
scale: 4
|
||||
num_corrupts_per_image: 0
|
||||
|
||||
networks:
|
||||
generator:
|
||||
type: generator
|
||||
which_model_G: rrdb_diffusion
|
||||
args:
|
||||
in_channels: 6
|
||||
out_channels: 6
|
||||
num_blocks: 10
|
||||
|
||||
#### path
|
||||
path:
|
||||
#pretrain_model_generator: <insert pretrained model path if desired>
|
||||
strict_load: true
|
||||
#resume_state: ../experiments/train_imgset_rrdb_diffusion/training_state/0.state # <-- Set this to resume from a previous training state.
|
||||
|
||||
steps:
|
||||
generator:
|
||||
training: generator
|
||||
|
||||
optimizer_params:
|
||||
lr: !!float 3e-4
|
||||
weight_decay: !!float 1e-2
|
||||
beta1: 0.9
|
||||
beta2: 0.9999
|
||||
|
||||
injectors:
|
||||
# "Do it all injector": produces a reverse prediction and calculates losses on it.
|
||||
diffusion:
|
||||
type: gaussian_diffusion
|
||||
in: hq
|
||||
generator: generator
|
||||
beta_schedule:
|
||||
schedule_name: linear
|
||||
num_diffusion_timesteps: 4000
|
||||
diffusion_args:
|
||||
model_mean_type: epsilon
|
||||
model_var_type: learned_range
|
||||
loss_type: mse
|
||||
sampler_type: uniform
|
||||
model_input_keys:
|
||||
low_res: lq
|
||||
out: loss
|
||||
|
||||
# Injector for visualizing what your network is doing (every 500 steps)
|
||||
visual_debug:
|
||||
every: 500
|
||||
type: gaussian_diffusion_inference
|
||||
generator: generator
|
||||
output_shape: [8,3,128,128] # Change "8" to your desired output batch size.
|
||||
beta_schedule:
|
||||
schedule_name: linear
|
||||
num_diffusion_timesteps: 500 # Change higher (up to training steps) for improved quality. Lower for faster speed.
|
||||
diffusion_args:
|
||||
model_mean_type: epsilon
|
||||
model_var_type: learned_range
|
||||
loss_type: mse
|
||||
model_input_keys:
|
||||
low_res: lq
|
||||
out: sample
|
||||
|
||||
losses:
|
||||
diffusion_loss:
|
||||
type: direct
|
||||
weight: 1
|
||||
key: loss
|
||||
|
||||
train:
|
||||
niter: 500000
|
||||
warmup_iter: -1
|
||||
mega_batch_factor: 1 # <-- Gradient accumulation factor. If you are running OOM, increase this to [2,4,8].
|
||||
val_freq: 4000
|
||||
|
||||
# Default LR scheduler options
|
||||
default_lr_scheme: CosineAnnealingLR_Restart
|
||||
T_period: [ 200000, 200000 ]
|
||||
warmup: 0
|
||||
eta_min: !!float 1e-7
|
||||
restarts: [ 200000, 400000 ]
|
||||
restart_weights: [ .5, .5 ]
|
||||
|
||||
logger:
|
||||
print_freq: 30
|
||||
save_checkpoint_freq: 2000
|
||||
visuals: [sample, hq, lq]
|
||||
visual_debug_rate: 500
|
||||
reverse_n1_to_1: true
|
36
recipes/diffusion/README.md
Normal file
36
recipes/diffusion/README.md
Normal file
|
@ -0,0 +1,36 @@
|
|||
# Working with Gaussian Diffusion models in DLAS
|
||||
|
||||
Diffusion Models are a method of generating structural data using a gradual de-noising process. This process allows a
|
||||
simple network training regime.
|
||||
|
||||
This implementation of Gaussian Diffusion is largely based on the work done by OpenAI in their paper ["Diffusion Models
|
||||
Beat GANs on Image Synthesis"](https://arxiv.org/pdf/2105.05233.pdf) and ["Improved Denoising Diffusion Probabilistic
|
||||
Models"](https://arxiv.org/pdf/2102.09672).
|
||||
|
||||
OpenAI opened sourced their reference implementations [here](https://github.com/openai/guided-diffusion). The diffusion
|
||||
model that DLAS trains uses the [gaussian_diffusion.py](https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/gaussian_diffusion.py)
|
||||
script from that repo for training and inference with these models. We also include the UNet from that repo as a model
|
||||
that can be used to train a diffusion network.
|
||||
|
||||
Diffusion networks can be re-purposed to pretty much any image generation task, including super-resolution. Even though
|
||||
they are trained with MSE losses, they produce incredibly crisp images with FID scores competitive with the best GANs.
|
||||
More importantly, it is easy to track training progress since diffusion networks use a "normal" loss.
|
||||
|
||||
Diffusion networks are unique in that during inference, they perform multiple forward passes to generate a single image.
|
||||
During training, these networks are trained to denoise images over 4000 steps. In inference, this sample rate can be
|
||||
adjusted. For the purposes of super-resolution, I have found that images sampled in 50 steps to be of very good quality.
|
||||
This still means that a diffusion generator is 50x slower than generators trained in different ways.
|
||||
|
||||
What's more is that I have found that diffusion networks can be trained in the tiled methodology used by ESRGAN: instead
|
||||
of training on whole images, you can train on tiles of larger images. At inference time, the network can be applied to
|
||||
larger images than the network was initially trained on. I have found this works well on inference images within ~3x
|
||||
the training size. I have not tried larger, because the size of the UNet model means that inference at ultra-high
|
||||
resolutions is impossible (I run out of GPU memory).
|
||||
|
||||
I have provided a reference configuration for training a diffusion model in this manner. The config performs a 2x
|
||||
upsampling to 256px, de-blurs it and removes JPEG artifacts. The deblurring and image repairs are done on a configurable
|
||||
scale. The scale is [0,1] passed to the model as `corruption_entropy`. `1` represents a maximum correction factor.
|
||||
You can try reducing this to 128px for faster training. It should work fine.
|
||||
|
||||
Diffusion models also have a fairly arcane inference method. To help you along, I've provided an inference configuration
|
||||
that can be used with models trained in DLAS.
|
78
recipes/diffusion/test_diffusion_unet.yml
Normal file
78
recipes/diffusion/test_diffusion_unet.yml
Normal file
|
@ -0,0 +1,78 @@
|
|||
#### general settings
|
||||
name: test_diffusion_unet
|
||||
use_tb_logger: true
|
||||
model: extensibletrainer
|
||||
scale: 1
|
||||
gpu_ids: [0]
|
||||
start_step: -1
|
||||
checkpointing_enabled: true
|
||||
fp16: false
|
||||
wandb: false
|
||||
|
||||
datasets:
|
||||
train:
|
||||
name: my_inference_images
|
||||
n_workers: 0
|
||||
batch_size: 1
|
||||
mode: imagefolder
|
||||
rgb_n1_to_1: true
|
||||
disable_flip: true
|
||||
force_square: false
|
||||
paths: <low resolution images you want to upsample>
|
||||
scale: 1
|
||||
skip_lq: true
|
||||
fixed_parameters:
|
||||
# Specify correction factors here. For networks trained with the paired training configuration, the first number
|
||||
# is a JPEG correction factor, and the second number is a deblurring factor. Testing shows that if you attempt to
|
||||
# deblur too far, you get extremely distorted images. It's actually pretty cool - the network clearly knows how
|
||||
# much deblurring is appropriate.
|
||||
corruption_entropy: [.2, .5]
|
||||
|
||||
networks:
|
||||
generator:
|
||||
type: generator
|
||||
which_model_G: unet_diffusion
|
||||
args:
|
||||
image_size: 256
|
||||
in_channels: 3
|
||||
num_corruptions: 2
|
||||
model_channels: 192
|
||||
out_channels: 6
|
||||
num_res_blocks: 2
|
||||
attention_resolutions: [8,16]
|
||||
dropout: 0
|
||||
channel_mult: [1,1,2,2,4,4]
|
||||
num_heads: 4
|
||||
num_heads_upsample: -1
|
||||
use_scale_shift_norm: true
|
||||
|
||||
#### path
|
||||
path:
|
||||
pretrain_model_generator: <Your model (or EMA) path>
|
||||
strict_load: true
|
||||
|
||||
steps:
|
||||
generator:
|
||||
training: generator
|
||||
injectors:
|
||||
visual_debug:
|
||||
type: gaussian_diffusion_inference
|
||||
generator: generator
|
||||
output_batch_size: 1
|
||||
output_scale_factor: 2
|
||||
respaced_timestep_spacing: 50 # This can be tweaked to perform inference faster or slower. 50-200 seems to be the sweet spot. At 4000 steps, the quality is actually worse often.
|
||||
undo_n1_to_1: true
|
||||
beta_schedule:
|
||||
schedule_name: linear
|
||||
num_diffusion_timesteps: 4000
|
||||
diffusion_args:
|
||||
model_mean_type: epsilon
|
||||
model_var_type: learned_range
|
||||
loss_type: mse
|
||||
model_input_keys:
|
||||
low_res: hq
|
||||
corruption_factor: corruption_entropy
|
||||
out: sample
|
||||
|
||||
eval:
|
||||
output_state: sample
|
150
recipes/diffusion/train_ddpm_unet.yml
Normal file
150
recipes/diffusion/train_ddpm_unet.yml
Normal file
|
@ -0,0 +1,150 @@
|
|||
name: train_unet_diffusion
|
||||
use_tb_logger: true
|
||||
model: extensibletrainer
|
||||
scale: 1
|
||||
gpu_ids: [0]
|
||||
start_step: -1
|
||||
checkpointing_enabled: true # If using the UNet architecture, this is pretty much required.
|
||||
fp16: false
|
||||
wandb: false # Set to true to enable wandb logging.
|
||||
force_start_step: -1
|
||||
|
||||
datasets:
|
||||
train:
|
||||
name: imgset5
|
||||
n_workers: 4
|
||||
batch_size: 256 # The OpenAI paper uses this batch size for 256px generation. The UNet model uses attention, which benefits from large batch sizes.
|
||||
mode: imagefolder
|
||||
rgb_n1_to_1: true
|
||||
paths: <insert path to a folder full of 256x256 tiled images here>
|
||||
target_size: 256
|
||||
scale: 2
|
||||
fixed_corruptions: [ jpeg-broad, gaussian_blur ] # This model is trained to correct JPEG artifacts and blurring.
|
||||
random_corruptions: [ none ]
|
||||
num_corrupts_per_image: 1
|
||||
corruption_blur_scale: 1
|
||||
corrupt_before_downsize: false
|
||||
|
||||
networks:
|
||||
generator:
|
||||
type: generator
|
||||
which_model_G: unet_diffusion
|
||||
args:
|
||||
image_size: 256
|
||||
in_channels: 3
|
||||
num_corruptions: 2
|
||||
model_channels: 192
|
||||
out_channels: 6
|
||||
num_res_blocks: 2
|
||||
attention_resolutions: [8,16]
|
||||
dropout: 0
|
||||
channel_mult: [1,1,2,2,4,4] # These will need to be reduced if you lower the operating resolution.
|
||||
num_heads: 4
|
||||
num_heads_upsample: -1
|
||||
use_scale_shift_norm: true
|
||||
|
||||
#### path
|
||||
path:
|
||||
#pretrain_model_generator: <Insert pretrained generator here>
|
||||
strict_load: true
|
||||
#resume_state: <Insert resume training_state here to resume existing training>
|
||||
|
||||
steps:
|
||||
generator:
|
||||
training: generator
|
||||
|
||||
optimizer: adamw
|
||||
optimizer_params:
|
||||
lr: !!float 3e-4 # Hyperparameters from OpenAI paper.
|
||||
weight_decay: 0
|
||||
beta1: 0.9
|
||||
beta2: 0.9999
|
||||
|
||||
injectors:
|
||||
diffusion:
|
||||
type: gaussian_diffusion
|
||||
in: hq
|
||||
generator: generator
|
||||
beta_schedule:
|
||||
schedule_name: linear
|
||||
num_diffusion_timesteps: 4000
|
||||
diffusion_args:
|
||||
model_mean_type: epsilon
|
||||
model_var_type: learned_range
|
||||
loss_type: mse
|
||||
sampler_type: uniform
|
||||
model_input_keys:
|
||||
low_res: lq
|
||||
corruption_factor: corruption_entropy
|
||||
out: loss
|
||||
out_key_vb_loss: vb_loss
|
||||
out_key_x_start: x_start_pred
|
||||
losses:
|
||||
diffusion_loss:
|
||||
type: direct
|
||||
weight: 1
|
||||
key: loss
|
||||
var_loss:
|
||||
type: direct
|
||||
weight: 1
|
||||
key: vb_loss
|
||||
|
||||
train:
|
||||
niter: 500000
|
||||
warmup_iter: -1
|
||||
mega_batch_factor: 32 # This is massive. Expect ~60sec/step on a RTX3090 at 90%+ memory utilization. I recommend using multiple GPUs to train this network.
|
||||
ema_rate: .999
|
||||
val_freq: 500
|
||||
|
||||
default_lr_scheme: MultiStepLR
|
||||
gen_lr_steps: [ 50000, 100000, 150000 ]
|
||||
lr_gamma: 0.5
|
||||
|
||||
eval:
|
||||
evaluators:
|
||||
# Validation for this network is a special FID computation that compares the full resolution images from the specified
|
||||
# dataset to the same images, downsampled and corrupted then fed through the network.
|
||||
fid:
|
||||
type: sr_diffusion_fid
|
||||
for: generator # Unused for this evaluator.
|
||||
batch_size: 8
|
||||
dataset:
|
||||
name: sr_fid_set
|
||||
mode: imagefolder
|
||||
rgb_n1_to_1: true
|
||||
paths: <insert path to a folder of 128-512 validation images here, drawn from your dataset>
|
||||
target_size: 256
|
||||
scale: 2
|
||||
fixed_corruptions: [ jpeg-broad, gaussian_blur ]
|
||||
random_corruptions: [ none ]
|
||||
num_corrupts_per_image: 1
|
||||
corruption_blur_scale: 1
|
||||
corrupt_before_downsize: false
|
||||
random_seed: 1234
|
||||
diffusion_params:
|
||||
type: gaussian_diffusion_inference
|
||||
generator: generator
|
||||
use_ema_model: true
|
||||
output_batch_size: 8
|
||||
output_scale_factor: 2
|
||||
respaced_timestep_spacing: 50
|
||||
undo_n1_to_1: true
|
||||
beta_schedule:
|
||||
schedule_name: linear
|
||||
num_diffusion_timesteps: 4000
|
||||
diffusion_args:
|
||||
model_mean_type: epsilon
|
||||
model_var_type: learned_range
|
||||
loss_type: mse
|
||||
model_input_keys:
|
||||
low_res: lq
|
||||
corruption_factor: corruption_entropy
|
||||
out: sample # Unused
|
||||
|
||||
logger:
|
||||
print_freq: 30
|
||||
save_checkpoint_freq: 500
|
||||
visuals: [x_start_pred, hq, lq]
|
||||
visual_debug_rate: 500
|
||||
reverse_n1_to_1: true
|
||||
reverse_imagenet_norm: false
|
Loading…
Reference in New Issue
Block a user