forked from mrq/DL-Art-School
Guided diffusion documentation
parent ae8de0cb9d
commit 730c0135fd
(deleted file, 109 lines)
@@ -1,109 +0,0 @@

#### general settings
name: train_imgset_rrdb_diffusion
model: extensibletrainer
scale: 1
gpu_ids: [0]
start_step: -1
checkpointing_enabled: true
fp16: false
use_tb_logger: true
wandb: false

datasets:
  train:
    n_workers: 4
    batch_size: 32
    name: div2k
    mode: single_image_extensible
    paths: /content/div2k   # <-- Put your path here.
    target_size: 128
    force_multiple: 1
    scale: 4
    num_corrupts_per_image: 0

networks:
  generator:
    type: generator
    which_model_G: rrdb_diffusion
    args:
      in_channels: 6
      out_channels: 6
      num_blocks: 10

#### path
path:
  #pretrain_model_generator: <insert pretrained model path if desired>
  strict_load: true
  #resume_state: ../experiments/train_imgset_rrdb_diffusion/training_state/0.state   # <-- Set this to resume from a previous training state.

steps:
  generator:
    training: generator

    optimizer_params:
      lr: !!float 3e-4
      weight_decay: !!float 1e-2
      beta1: 0.9
      beta2: 0.9999

    injectors:
      # "Do it all" injector: produces a reverse prediction and calculates losses on it.
      diffusion:
        type: gaussian_diffusion
        in: hq
        generator: generator
        beta_schedule:
          schedule_name: linear
          num_diffusion_timesteps: 4000
        diffusion_args:
          model_mean_type: epsilon
          model_var_type: learned_range
          loss_type: mse
        sampler_type: uniform
        model_input_keys:
          low_res: lq
        out: loss

      # Injector for visualizing what your network is doing (every 500 steps)
      visual_debug:
        every: 500
        type: gaussian_diffusion_inference
        generator: generator
        output_shape: [8,3,128,128]   # Change "8" to your desired output batch size.
        beta_schedule:
          schedule_name: linear
          num_diffusion_timesteps: 500   # Raise (up to the training step count) for improved quality; lower for faster sampling.
        diffusion_args:
          model_mean_type: epsilon
          model_var_type: learned_range
          loss_type: mse
        model_input_keys:
          low_res: lq
        out: sample

    losses:
      diffusion_loss:
        type: direct
        weight: 1
        key: loss

train:
  niter: 500000
  warmup_iter: -1
  mega_batch_factor: 1   # <-- Gradient accumulation factor. If you are running OOM, increase this to 2, 4, or 8.
  val_freq: 4000

  # Default LR scheduler options
  default_lr_scheme: CosineAnnealingLR_Restart
  T_period: [ 200000, 200000 ]
  warmup: 0
  eta_min: !!float 1e-7
  restarts: [ 200000, 400000 ]
  restart_weights: [ .5, .5 ]

logger:
  print_freq: 30
  save_checkpoint_freq: 2000
  visuals: [sample, hq, lq]
  visual_debug_rate: 500
  reverse_n1_to_1: true

recipes/diffusion/README.md (new file, 36 lines)
@@ -0,0 +1,36 @@

# Working with Gaussian Diffusion models in DLAS

Diffusion models are a method of generating structured data using a gradual de-noising process, which allows for a simple network training regime.

This implementation of Gaussian Diffusion is largely based on the work done by OpenAI in their papers ["Diffusion Models Beat GANs on Image Synthesis"](https://arxiv.org/pdf/2105.05233.pdf) and ["Improved Denoising Diffusion Probabilistic Models"](https://arxiv.org/pdf/2102.09672).

OpenAI open sourced their reference implementations [here](https://github.com/openai/guided-diffusion). DLAS uses the [gaussian_diffusion.py](https://github.com/openai/guided-diffusion/blob/main/guided_diffusion/gaussian_diffusion.py) script from that repo for training and inference with these models. We also include the UNet from that repo as a model that can be used to train a diffusion network.

Diffusion networks can be re-purposed for pretty much any image generation task, including super-resolution. Even though they are trained with MSE losses, they produce incredibly crisp images, with FID scores competitive with the best GANs. More importantly, it is easy to track training progress, since diffusion networks use a "normal" loss.

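To make that concrete, here is the loss plumbing condensed from the training recipes in this commit: the `gaussian_diffusion` injector emits a scalar under the key `loss`, and a `direct` loss simply forwards it to the optimizer. (This is an abbreviated sketch; see `train_ddpm_unet.yml` below for the full injector options.)

```yaml
# Condensed sketch of the loss wiring used by the training recipes below.
steps:
  generator:
    injectors:
      diffusion:
        type: gaussian_diffusion   # runs the diffusion process and computes the loss
        in: hq
        generator: generator
        out: loss                  # scalar loss placed into the training state
    losses:
      diffusion_loss:
        type: direct               # use the injector's output directly as the loss
        weight: 1
        key: loss
```
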
Diffusion networks are unique in that, during inference, they perform multiple forward passes to generate a single image. During training, these networks learn to denoise images over 4000 steps; at inference time, this sample rate can be adjusted. For the purposes of super-resolution, I have found images sampled in 50 steps to be of very good quality. Even so, this means a diffusion generator is 50x slower than generators trained in other ways.

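For example, abridged from the inference recipe later in this commit, the `gaussian_diffusion_inference` injector keeps the 4000-step beta schedule the model was trained with but re-spaces sampling down to 50 steps:

```yaml
# Abridged from recipes/diffusion/test_diffusion_unet.yml below.
visual_debug:
  type: gaussian_diffusion_inference
  generator: generator
  respaced_timestep_spacing: 50    # fewer steps = faster sampling; 50-200 works well
  beta_schedule:
    schedule_name: linear
    num_diffusion_timesteps: 4000  # the schedule the model was trained with
```
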
What's more, I have found that diffusion networks can be trained with the tiled methodology used by ESRGAN: instead of training on whole images, you can train on tiles of larger images. At inference time, the network can then be applied to images larger than it was initially trained on. I have found this works well for inference images up to ~3x the training size. I have not tried larger, because the size of the UNet model means that inference at ultra-high resolutions is impossible (I run out of GPU memory).

I have provided a reference configuration for training a diffusion model in this manner. The config performs 2x upsampling to 256px while de-blurring the image and removing JPEG artifacts. The deblurring and artifact repair are applied on a configurable scale in [0,1], passed to the model as `corruption_entropy`, where `1` represents the maximum correction factor. You can try reducing the output size to 128px for faster training; it should work fine.

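At inference time, this scale is supplied through the dataset's `fixed_parameters` block, as in the inference recipe below:

```yaml
# From recipes/diffusion/test_diffusion_unet.yml: the first entry is the JPEG
# correction factor and the second is the deblurring factor, each in [0,1].
fixed_parameters:
  corruption_entropy: [.2, .5]
```
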
Diffusion models also have a fairly arcane inference method. To help you along, I've provided an inference configuration that can be used with models trained in DLAS; the fields you typically need to change are summarized right below, followed by the full recipe.

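Concretely, these are the fields you will normally need to edit in `test_diffusion_unet.yml` before running it (all of them appear in the full recipe that follows):

```yaml
# Minimal edits to recipes/diffusion/test_diffusion_unet.yml before running.
datasets:
  train:
    paths: <low resolution images you want to upsample>
    fixed_parameters:
      corruption_entropy: [.2, .5]   # JPEG / deblur correction strengths
path:
  pretrain_model_generator: <Your model (or EMA) path>
```
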
recipes/diffusion/test_diffusion_unet.yml (new file, 78 lines)
@@ -0,0 +1,78 @@

#### general settings
name: test_diffusion_unet
use_tb_logger: true
model: extensibletrainer
scale: 1
gpu_ids: [0]
start_step: -1
checkpointing_enabled: true
fp16: false
wandb: false

datasets:
  train:
    name: my_inference_images
    n_workers: 0
    batch_size: 1
    mode: imagefolder
    rgb_n1_to_1: true
    disable_flip: true
    force_square: false
    paths: <low resolution images you want to upsample>
    scale: 1
    skip_lq: true
    fixed_parameters:
      # Specify correction factors here. For networks trained with the paired training configuration, the first number
      # is a JPEG correction factor and the second is a deblurring factor. Testing shows that if you attempt to
      # deblur too far, you get extremely distorted images. It's actually pretty cool - the network clearly knows how
      # much deblurring is appropriate.
      corruption_entropy: [.2, .5]

networks:
  generator:
    type: generator
    which_model_G: unet_diffusion
    args:
      image_size: 256
      in_channels: 3
      num_corruptions: 2
      model_channels: 192
      out_channels: 6
      num_res_blocks: 2
      attention_resolutions: [8,16]
      dropout: 0
      channel_mult: [1,1,2,2,4,4]
      num_heads: 4
      num_heads_upsample: -1
      use_scale_shift_norm: true

#### path
path:
  pretrain_model_generator: <Your model (or EMA) path>
  strict_load: true

steps:
  generator:
    training: generator
    injectors:
      visual_debug:
        type: gaussian_diffusion_inference
        generator: generator
        output_batch_size: 1
        output_scale_factor: 2
        respaced_timestep_spacing: 50   # Tweak this to trade speed for quality. 50-200 seems to be the sweet spot; at the full 4000 steps, quality is often actually worse.
        undo_n1_to_1: true
        beta_schedule:
          schedule_name: linear
          num_diffusion_timesteps: 4000
        diffusion_args:
          model_mean_type: epsilon
          model_var_type: learned_range
          loss_type: mse
        model_input_keys:
          low_res: hq
          corruption_factor: corruption_entropy
        out: sample

eval:
  output_state: sample

recipes/diffusion/train_ddpm_unet.yml (new file, 150 lines)
@@ -0,0 +1,150 @@

name: train_unet_diffusion
use_tb_logger: true
model: extensibletrainer
scale: 1
gpu_ids: [0]
start_step: -1
checkpointing_enabled: true   # If using the UNet architecture, this is pretty much required.
fp16: false
wandb: false   # Set to true to enable wandb logging.
force_start_step: -1

datasets:
  train:
    name: imgset5
    n_workers: 4
    batch_size: 256   # The OpenAI paper uses this batch size for 256px generation. The UNet model uses attention, which benefits from large batch sizes.
    mode: imagefolder
    rgb_n1_to_1: true
    paths: <insert path to a folder full of 256x256 tiled images here>
    target_size: 256
    scale: 2
    fixed_corruptions: [ jpeg-broad, gaussian_blur ]   # This model is trained to correct JPEG artifacts and blurring.
    random_corruptions: [ none ]
    num_corrupts_per_image: 1
    corruption_blur_scale: 1
    corrupt_before_downsize: false

networks:
  generator:
    type: generator
    which_model_G: unet_diffusion
    args:
      image_size: 256
      in_channels: 3
      num_corruptions: 2
      model_channels: 192
      out_channels: 6
      num_res_blocks: 2
      attention_resolutions: [8,16]
      dropout: 0
      channel_mult: [1,1,2,2,4,4]   # These will need to be reduced if you lower the operating resolution.
      num_heads: 4
      num_heads_upsample: -1
      use_scale_shift_norm: true

#### path
path:
  #pretrain_model_generator: <Insert pretrained generator here>
  strict_load: true
  #resume_state: <Insert resume training_state here to resume existing training>

steps:
  generator:
    training: generator

    optimizer: adamw
    optimizer_params:
      lr: !!float 3e-4   # Hyperparameters from the OpenAI paper.
      weight_decay: 0
      beta1: 0.9
      beta2: 0.9999

    injectors:
      diffusion:
        type: gaussian_diffusion
        in: hq
        generator: generator
        beta_schedule:
          schedule_name: linear
          num_diffusion_timesteps: 4000
        diffusion_args:
          model_mean_type: epsilon
          model_var_type: learned_range
          loss_type: mse
        sampler_type: uniform
        model_input_keys:
          low_res: lq
          corruption_factor: corruption_entropy
        out: loss
        out_key_vb_loss: vb_loss
        out_key_x_start: x_start_pred
    losses:
      diffusion_loss:
        type: direct
        weight: 1
        key: loss
      var_loss:
        type: direct
        weight: 1
        key: vb_loss

train:
  niter: 500000
  warmup_iter: -1
  mega_batch_factor: 32   # This is massive. Expect ~60 sec/step on an RTX 3090 at 90%+ memory utilization. I recommend using multiple GPUs to train this network.
  ema_rate: .999
  val_freq: 500

  default_lr_scheme: MultiStepLR
  gen_lr_steps: [ 50000, 100000, 150000 ]
  lr_gamma: 0.5

eval:
  evaluators:
    # Validation for this network is a special FID computation that compares the full-resolution images from the specified
    # dataset to the same images downsampled and corrupted, then fed through the network.
    fid:
      type: sr_diffusion_fid
      for: generator   # Unused for this evaluator.
      batch_size: 8
      dataset:
        name: sr_fid_set
        mode: imagefolder
        rgb_n1_to_1: true
        paths: <insert path to a folder of 128-512 validation images here, drawn from your dataset>
        target_size: 256
        scale: 2
        fixed_corruptions: [ jpeg-broad, gaussian_blur ]
        random_corruptions: [ none ]
        num_corrupts_per_image: 1
        corruption_blur_scale: 1
        corrupt_before_downsize: false
        random_seed: 1234
      diffusion_params:
        type: gaussian_diffusion_inference
        generator: generator
        use_ema_model: true
        output_batch_size: 8
        output_scale_factor: 2
        respaced_timestep_spacing: 50
        undo_n1_to_1: true
        beta_schedule:
          schedule_name: linear
          num_diffusion_timesteps: 4000
        diffusion_args:
          model_mean_type: epsilon
          model_var_type: learned_range
          loss_type: mse
        model_input_keys:
          low_res: lq
          corruption_factor: corruption_entropy
        out: sample   # Unused

logger:
  print_freq: 30
  save_checkpoint_freq: 500
  visuals: [x_start_pred, hq, lq]
  visual_debug_rate: 500
  reverse_n1_to_1: true
  reverse_imagenet_norm: false