diff --git a/codes/models/archs/RRDBNet_arch.py b/codes/models/archs/RRDBNet_arch.py
index 0133f68b..26ddee36 100644
--- a/codes/models/archs/RRDBNet_arch.py
+++ b/codes/models/archs/RRDBNet_arch.py
@@ -1,293 +1,145 @@
-import functools
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-import models.archs.arch_util as arch_util
-from models.archs.arch_util import PixelUnshuffle
-import torchvision
-from utils.util import checkpoint
+from torch.utils.checkpoint import checkpoint_sequential
+
+from models.archs.arch_util import make_layer, default_init_weights
 
 
-class ResidualDenseBlock_5C(nn.Module):
-    def __init__(self, nf=64, gc=32, bias=True, late_stage_kernel_size=3, late_stage_padding=1):
-        super(ResidualDenseBlock_5C, self).__init__()
-        # gc: growth channel, i.e. intermediate channels
-        self.conv1 = nn.Conv2d(nf, gc, 3, 1, 1, bias=bias)
-        self.conv2 = nn.Conv2d(nf + gc, gc, 3, 1, 1, bias=bias)
-        self.conv3 = nn.Conv2d(nf + 2 * gc, gc, late_stage_kernel_size, 1, late_stage_padding, bias=bias)
-        self.conv4 = nn.Conv2d(nf + 3 * gc, gc, late_stage_kernel_size, 1, late_stage_padding, bias=bias)
-        self.conv5 = nn.Conv2d(nf + 4 * gc, nf, late_stage_kernel_size, 1, late_stage_padding, bias=bias)
+class ResidualDenseBlock(nn.Module):
+    """Residual Dense Block.
+
+    Used in RRDB block in ESRGAN.
+
+    Args:
+        mid_channels (int): Channel number of intermediate features.
+        growth_channels (int): Channels for each growth.
+    """
+
+    def __init__(self, mid_channels=64, growth_channels=32):
+        super(ResidualDenseBlock, self).__init__()
+        for i in range(5):
+            out_channels = mid_channels if i == 4 else growth_channels
+            self.add_module(
+                f'conv{i+1}',
+                nn.Conv2d(mid_channels + i * growth_channels, out_channels, 3,
+                          1, 1))
         self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
+        for i in range(5):
+            default_init_weights(getattr(self, f'conv{i+1}'), 0.1)
 
-        # initialization
-        arch_util.initialize_weights([self.conv1, self.conv2, self.conv3, self.conv4, self.conv5],
-                                     0.1)
 
     def forward(self, x):
+        """Forward function.
+
+        Args:
+            x (Tensor): Input tensor with shape (n, c, h, w).
+
+        Returns:
+            Tensor: Forward results.
+        """
         x1 = self.lrelu(self.conv1(x))
         x2 = self.lrelu(self.conv2(torch.cat((x, x1), 1)))
         x3 = self.lrelu(self.conv3(torch.cat((x, x1, x2), 1)))
         x4 = self.lrelu(self.conv4(torch.cat((x, x1, x2, x3), 1)))
         x5 = self.conv5(torch.cat((x, x1, x2, x3, x4), 1))
+        # Empirically, we use 0.2 to scale the residual for better performance.
         return x5 * 0.2 + x
 
 
 class RRDB(nn.Module):
-    '''Residual in Residual Dense Block'''
+    """Residual in Residual Dense Block.
 
-    def __init__(self, nf, gc=32):
+    Used in RRDB-Net in ESRGAN.
+
+    Args:
+        mid_channels (int): Channel number of intermediate features.
+        growth_channels (int): Channels for each growth.
+    """
+
+    def __init__(self, mid_channels, growth_channels=32):
         super(RRDB, self).__init__()
-        self.RDB1 = ResidualDenseBlock_5C(nf, gc)
-        self.RDB2 = ResidualDenseBlock_5C(nf, gc)
-        self.RDB3 = ResidualDenseBlock_5C(nf, gc)
+        self.rdb1 = ResidualDenseBlock(mid_channels, growth_channels)
+        self.rdb2 = ResidualDenseBlock(mid_channels, growth_channels)
+        self.rdb3 = ResidualDenseBlock(mid_channels, growth_channels)
 
     def forward(self, x):
-        out = checkpoint(self.RDB1, x)
-        out = checkpoint(self.RDB2, out)
-        out = checkpoint(self.RDB3, out)
+        """Forward function.
+
+        Args:
+            x (Tensor): Input tensor with shape (n, c, h, w).
+
+        Returns:
+            Tensor: Forward results.
+        """
+        out = self.rdb1(x)
+        out = self.rdb2(out)
+        out = self.rdb3(out)
+        # Empirically, we use 0.2 to scale the residual for better performance.
         return out * 0.2 + x
 
 
-class LowDimRRDB(RRDB):
-    def __init__(self, nf, gc=32, dimensional_adjustment=4):
-        super(LowDimRRDB, self).__init__(nf * (dimensional_adjustment ** 2), gc * (dimensional_adjustment ** 2))
-        self.unshuffle = PixelUnshuffle(dimensional_adjustment)
-        self.shuffle = nn.PixelShuffle(dimensional_adjustment)
+class RRDBNet(nn.Module):
+    """Networks consisting of Residual in Residual Dense Block, which is used
+    in ESRGAN.
 
-    def forward(self, x):
-        x = self.unshuffle(x)
-        x = super(LowDimRRDB, self).forward(x)
-        return self.shuffle(x)
+    ESRGAN: Enhanced Super-Resolution Generative Adversarial Networks.
+    Currently, it supports x4 upsampling scale factor.
 
+    Args:
+        in_channels (int): Channel number of inputs.
+        out_channels (int): Channel number of outputs.
+        mid_channels (int): Channel number of intermediate features.
+            Default: 64
+        num_blocks (int): Block number in the trunk network. Default: 23.
+        growth_channels (int): Channels for each growth. Default: 32.
+    """
 
-# Identical to LowDimRRDB but wraps an RRDB rather than inheriting from it. TODO: remove LowDimRRDB when backwards
-# compatibility is no longer desired.
-class LowDimRRDBWrapper(nn.Module):
-    # Do not specify nf or gc on the partial_rrdb passed in. That will be done by the wrapper.
-    def __init__(self, nf, partial_rrdb, gc=32, dimensional_adjustment=4):
-        super(LowDimRRDBWrapper, self).__init__()
-        self.rrdb = partial_rrdb(nf=nf * (dimensional_adjustment ** 2), gc=gc * (dimensional_adjustment ** 2))
-        self.unshuffle = PixelUnshuffle(dimensional_adjustment)
-        self.shuffle = nn.PixelShuffle(dimensional_adjustment)
-
-    def forward(self, x):
-        x = self.unshuffle(x)
-        x = self.rrdb(x)
-        return self.shuffle(x)
-
-
-# This module performs the majority of the processing done by RRDBNet. It just doesn't have the upsampling at the end.
-class RRDBTrunk(nn.Module):
-    def __init__(self, nf_in, nf_out, nb, gc=32, initial_stride=1, rrdb_block_f=None, conv_first_block=None):
-        super(RRDBTrunk, self).__init__()
-        if rrdb_block_f is None:
-            rrdb_block_f = functools.partial(RRDB, nf=nf_out, gc=gc)
-
-        if conv_first_block is None:
-            self.conv_first = nn.Conv2d(nf_in, nf_out, 7, initial_stride, padding=3, bias=True)
-        else:
-            self.conv_first = conv_first_block
-
-        self.RRDB_trunk, self.rrdb_layers = arch_util.make_layer(rrdb_block_f, nb, True)
-        self.trunk_conv = nn.Conv2d(nf_out, nf_out, 3, 1, 1, bias=True)
-
-    # Sets the softmax temperature of each RRDB layer. Only works if you are using attentive
-    # convolutions.
-    def set_temperature(self, temp):
-        for layer in self.rrdb_layers:
-            layer.set_temperature(temp)
-
-    def forward(self, x):
-        fea = self.conv_first(x)
-        trunk = self.trunk_conv(self.RRDB_trunk(fea))
-        fea = fea + trunk
-        return fea
-
-
-# Adds some base methods that all RRDB* classes will use.
-class RRDBBase(nn.Module):
-    def __init__(self):
-        super(RRDBBase, self).__init__()
-
-    # Sets the softmax temperature of each RRDB layer. Only works if you are using attentive
-    # convolutions.
-    def set_temperature(self, temp):
-        for trunk in self.trunks:
-            for layer in trunk.rrdb_layers:
-                layer.set_temperature(temp)
-
-
-# This class uses a RRDBTrunk to perform processing on an image, then upsamples it.
-class RRDBNet(RRDBBase):
-    def __init__(self, in_nc, out_nc, nf, nb, gc=32, scale=2, initial_stride=1,
-                 rrdb_block_f=None):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 mid_channels=64,
+                 num_blocks=23,
+                 growth_channels=32):
         super(RRDBNet, self).__init__()
-
-        # Trunk - does actual processing.
-        self.trunk = RRDBTrunk(in_nc, nf, nb, gc, initial_stride, rrdb_block_f)
-        self.trunks = [self.trunk]
-
-        # Upsampling
-        self.scale = scale
-        self.upconv1 = nn.Conv2d(nf, nf, 5, 1, padding=2, bias=True)
-        self.upconv2 = nn.Conv2d(nf, nf, 5, 1, padding=2, bias=True)
-        self.HRconv = nn.Conv2d(nf, nf, 5, 1, padding=2, bias=True)
-        self.conv_last = nn.Conv2d(nf, out_nc, 3, 1, 1, bias=True)
+        self.conv_first = nn.Conv2d(in_channels, mid_channels, 3, 1, 1)
+        self.body = make_layer(
+            RRDB,
+            num_blocks,
+            mid_channels=mid_channels,
+            growth_channels=growth_channels)
+        self.conv_body = nn.Conv2d(mid_channels, mid_channels, 3, 1, 1)
+        # upsample
+        self.conv_up1 = nn.Conv2d(mid_channels, mid_channels, 3, 1, 1)
+        self.conv_up2 = nn.Conv2d(mid_channels, mid_channels, 3, 1, 1)
+        self.conv_hr = nn.Conv2d(mid_channels, mid_channels, 3, 1, 1)
+        self.conv_last = nn.Conv2d(mid_channels, out_channels, 3, 1, 1)
 
         self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
 
-    def forward(self, x):
-        fea = self.trunk(x)
-
-        if self.scale >= 2:
-            fea = F.interpolate(fea, scale_factor=2, mode='nearest')
-        fea = self.lrelu(self.upconv1(fea))
-        if self.scale >= 4:
-            fea = F.interpolate(fea, scale_factor=2, mode='nearest')
-        fea = self.lrelu(self.upconv2(fea))
-        out = self.conv_last(self.lrelu(self.HRconv(fea)))
-
-        return out
-
-    def load_state_dict(self, state_dict, strict=True):
-        # The parameters in self.trunk used to be in this class. To support loading legacy saves, restore them.
-        t_state = self.trunk.state_dict()
-        for k in t_state.keys():
-            if k in state_dict.keys():
-                state_dict["trunk.%s" % (k,)] = state_dict.pop(k)
-        super(RRDBNet, self).load_state_dict(state_dict, strict)
-
-
-# Variant of RRDBNet that is "assisted" by an external pretrained image classifier whose
-# intermediate layers have been splayed out, pixel-shuffled, and fed back in.
-# TODO: Convert to use new RRDBBase hierarchy.
-class AssistedRRDBNet(nn.Module):
-    # in_nc=number of input channels.
-    # out_nc=number of output channels.
-    # nf=internal filter count
-    # nb=number of additional blocks after the assistance layers.
-    # gc=growth channel inside of residual blocks
-    # scale=the number of times the output is doubled in size.
-    # initial_stride=the stride on the first conv. can be used to downsample the image for processing.
-    def __init__(self, in_nc, out_nc, nf, nb, gc=32, scale=2, initial_stride=1):
-        super(AssistedRRDBNet, self).__init__()
-        self.scale = scale
-        self.conv_first = nn.Conv2d(in_nc, nf, 7, initial_stride, padding=3, bias=True)
-
-        # Set-up the assist-net, which should do feature extraction for us.
-        self.assistnet = torchvision.models.wide_resnet50_2(pretrained=True)
-        self.set_enable_assistnet_training(False)
-        assist_nf = [4, 8, 16]  # Fixed for resnet. Re-evaluate if using other networks.
-        self.assist2 = RRDB(nf + assist_nf[0], gc)
-        self.assist3 = RRDB(nf + sum(assist_nf[:2]), gc)
-        self.assist4 = RRDB(nf + sum(assist_nf), gc)
-        nf = nf + sum(assist_nf)
-
-        # After this, it's just a "standard" RRDB net.
-        RRDB_block_f = functools.partial(RRDB, nf=nf, gc=gc)
-        self.RRDB_trunk = arch_util.make_layer(RRDB_block_f, nb)
-        self.trunk_conv = nn.Conv2d(nf, nf, 3, 1, 1, bias=True)
-        #### upsampling
-        self.upconv1 = nn.Conv2d(nf, nf, 5, 1, padding=2, bias=True)
-        self.upconv2 = nn.Conv2d(nf, nf, 5, 1, padding=2, bias=True)
-        self.HRconv = nn.Conv2d(nf, nf, 5, 1, padding=2, bias=True)
-        self.conv_last = nn.Conv2d(nf, out_nc, 3, 1, 1, bias=True)
-
-        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
-
-    def set_enable_assistnet_training(self, en):
-        for p in self.assistnet.parameters():
-            p.requires_grad = en
-
-    def res_extract(self, x):
-        # Width and height must be factors of 16 to use this architecture. Check that here.
-        (b, f, w, h) = x.shape
-        assert w % 16 == 0
-        assert h % 16 == 0
-
-        x = self.assistnet.conv1(x)
-        x = self.assistnet.bn1(x)
-        x = self.assistnet.relu(x)
-        x = self.assistnet.maxpool(x)
-
-        x = self.assistnet.layer1(x)
-        l1 = F.pixel_shuffle(x, 4)
-        x = self.assistnet.layer2(x)
-        l2 = F.pixel_shuffle(x, 8)
-        x = self.assistnet.layer3(x)
-        l3 = F.pixel_shuffle(x, 16)
-        return l1, l2, l3
+        for m in [
+            self.conv_first, self.conv_body, self.conv_up1,
+            self.conv_up2, self.conv_hr, self.conv_last
+        ]:
+            default_init_weights(m, 0.1)
 
     def forward(self, x):
-        # Invoke the assistant net first.
-        l1, l2, l3 = self.res_extract(x)
+        """Forward function.
 
-        fea = self.conv_first(x)
-        fea = self.assist2(torch.cat([fea, l3], dim=1))
-        fea = self.assist3(torch.cat([fea, l2], dim=1))
-        fea = self.assist4(torch.cat([fea, l1], dim=1))
+        Args:
+            x (Tensor): Input tensor with shape (n, c, h, w).
 
-        trunk = self.trunk_conv(self.RRDB_trunk(fea))
-        fea = fea + trunk
+        Returns:
+            Tensor: Forward results.
+        """
 
-        if self.scale >= 2:
-            fea = F.interpolate(fea, scale_factor=2, mode='nearest')
-        fea = self.lrelu(self.upconv1(fea))
-        if self.scale >= 4:
-            fea = F.interpolate(fea, scale_factor=2, mode='nearest')
-        fea = self.lrelu(self.upconv2(fea))
-        out = self.conv_last(self.lrelu(self.HRconv(fea)))
-
-        return (out,)
-
-
-class PixShuffleInitialConv(nn.Module):
-    def __init__(self, reduction_factor, nf_out):
-        super(PixShuffleInitialConv, self).__init__()
-        self.conv = nn.Conv2d(3 * (reduction_factor ** 2), nf_out, 1)
-        self.unshuffle = PixelUnshuffle(reduction_factor)
-
-    def forward(self, x):
-        (b, f, w, h) = x.shape
-        # This module can only be applied to input images (with 3 channels)
-        assert f == 3
-
-        x = self.unshuffle(x)
-        return self.conv(x)
-
-
-# This class uses a RRDBTrunk to perform processing on an image, then upsamples it.
-class PixShuffleRRDB(RRDBBase):
-    def __init__(self, nf, nb, gc=32, scale=2, rrdb_block_f=None):
-        super(PixShuffleRRDB, self).__init__()
-
-        # This class does a 4x pixel shuffle on the filter count inside the trunk, so nf must be divisible by 16.
-        assert nf % 16 == 0
-
-        # Trunk - does actual processing.
-        self.trunk = RRDBTrunk(3, nf, nb, gc, 1, rrdb_block_f, PixShuffleInitialConv(4, nf))
-        self.trunks = [self.trunk]
-
-        # Upsampling
-        pix_nf = int(nf/16)
-        self.scale = scale
-        self.upconv1 = nn.Conv2d(pix_nf, pix_nf, 5, 1, padding=2, bias=True)
-        self.upconv2 = nn.Conv2d(pix_nf, pix_nf, 5, 1, padding=2, bias=True)
-        self.HRconv = nn.Conv2d(pix_nf, pix_nf, 5, 1, padding=2, bias=True)
-        self.conv_last = nn.Conv2d(pix_nf, 3, 3, 1, 1, bias=True)
-        self.pixel_shuffle = nn.PixelShuffle(4)
-        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
-
-    def forward(self, x):
-        fea = self.trunk(x)
-        fea = self.pixel_shuffle(fea)
-
-        if self.scale >= 2:
-            fea = F.interpolate(fea, scale_factor=2, mode='nearest')
-        fea = self.lrelu(self.upconv1(fea))
-        if self.scale >= 4:
-            fea = F.interpolate(fea, scale_factor=2, mode='nearest')
-        fea = self.lrelu(self.upconv2(fea))
-        out = self.conv_last(self.lrelu(self.HRconv(fea)))
-
-        return (out,)
\ No newline at end of file
+        feat = self.conv_first(x)
+        body_feat = self.conv_body(checkpoint_sequential(self.body, min(5, len(self.body)), feat))  # cap segments: <5 blocks would give a zero segment size
+        feat = feat + body_feat
+        # upsample
+        feat = self.lrelu(
+            self.conv_up1(F.interpolate(feat, scale_factor=2, mode='nearest')))
+        feat = self.lrelu(
+            self.conv_up2(F.interpolate(feat, scale_factor=2, mode='nearest')))
+        out = self.conv_last(self.lrelu(self.conv_hr(feat)))
+        return out
\ No newline at end of file
diff --git a/codes/models/archs/arch_util.py b/codes/models/archs/arch_util.py
index 04d49ee0..ba2e2abd 100644
--- a/codes/models/archs/arch_util.py
+++ b/codes/models/archs/arch_util.py
@@ -5,6 +5,22 @@ import torch.nn.functional as F
 import torch.nn.utils.spectral_norm as SpectralNorm
 from math import sqrt
 
+def kaiming_init(module,
+                 a=0,
+                 mode='fan_out',
+                 nonlinearity='relu',
+                 bias=0,
+                 distribution='normal'):
+    assert distribution in ['uniform', 'normal']
+    if distribution == 'uniform':
+        nn.init.kaiming_uniform_(
+            module.weight, a=a, mode=mode, nonlinearity=nonlinearity)
+    else:
+        nn.init.kaiming_normal_(
+            module.weight, a=a, mode=mode, nonlinearity=nonlinearity)
+    if hasattr(module, 'bias') and module.bias is not None:
+        nn.init.constant_(module.bias, bias)
+
 def pixel_norm(x, epsilon=1e-8):
     return x * torch.rsqrt(torch.mean(torch.pow(x, 2), dim=1, keepdims=True) + epsilon)
 
@@ -28,14 +44,34 @@ def initialize_weights(net_l, scale=1):
                 init.constant_(m.bias.data, 0.0)
 
 
-def make_layer(block, n_layers, return_layers=False):
+def make_layer(block, num_blocks, **kwarg):
+    """Make layers by stacking the same blocks.
+    Args:
+        block (nn.Module): nn.Module class for basic block.
+        num_blocks (int): number of blocks.
+    Returns:
+        nn.Sequential: Stacked blocks in nn.Sequential.
+    """
     layers = []
-    for _ in range(n_layers):
-        layers.append(block())
-    if return_layers:
-        return nn.Sequential(*layers), layers
-    else:
-        return nn.Sequential(*layers)
+    for _ in range(num_blocks):
+        layers.append(block(**kwarg))
+    return nn.Sequential(*layers)
+
+
+def default_init_weights(module, scale=1):
+    """Initialize network weights.
+    Args:
+        module (nn.Module): Module whose Conv2d/Linear layers are initialized.
+        scale (float): Scale initialized weights, especially for residual
+            blocks.
+    """
+    for m in module.modules():
+        if isinstance(m, nn.Conv2d):
+            kaiming_init(m, a=0, mode='fan_in', bias=0)
+            m.weight.data *= scale
+        elif isinstance(m, nn.Linear):
+            kaiming_init(m, a=0, mode='fan_in', bias=0)
+            m.weight.data *= scale
 
 
 class ResidualBlock(nn.Module):
diff --git a/codes/models/base_model.py b/codes/models/base_model.py
index ea08aecc..be942956 100644
--- a/codes/models/base_model.py
+++ b/codes/models/base_model.py
@@ -110,6 +110,8 @@ class BaseModel():
         for k, v in load_net.items():
             if k.startswith('module.'):
                 load_net_clean[k[7:]] = v
+            elif k.startswith('generator.'):   # Hack for ESRGAN pretrained models: strip the 'generator.' prefix (elif so 'module.' keys are not re-added by the else).
+                load_net_clean[k[len('generator.'):]] = v
             else:
                 load_net_clean[k] = v
         network.load_state_dict(load_net_clean, strict=strict)
diff --git a/codes/models/networks.py b/codes/models/networks.py
index 8423fb7f..fbb67440 100644
--- a/codes/models/networks.py
+++ b/codes/models/networks.py
@@ -36,14 +36,8 @@ def define_G(opt, net_key='network_G', scale=None):
         netG = SRResNet_arch.MSRResNet(in_nc=opt_net['in_nc'], out_nc=opt_net['out_nc'],
                                        nf=opt_net['nf'], nb=opt_net['nb'], upscale=opt_net['scale'])
     elif which_model == 'RRDBNet':
-        # RRDB does scaling in two steps, so take the sqrt of the scale we actually want to achieve and feed it to RRDB.
-        initial_stride = 1 if 'initial_stride' not in opt_net else opt_net['initial_stride']
-        assert initial_stride == 1 or initial_stride == 2
-        # Need to adjust the scale the generator sees by the stride since the stride causes a down-sample.
-        gen_scale = scale * initial_stride
-        netG = RRDBNet_arch.RRDBNet(in_nc=opt_net['in_nc'], out_nc=opt_net['out_nc'],
-                                    nf=opt_net['nf'], nb=opt_net['nb'], scale=opt_net['scale'] if 'scale' in opt_net.keys() else gen_scale,
-                                    initial_stride=initial_stride)
+        netG = RRDBNet_arch.RRDBNet(in_channels=opt_net['in_nc'], out_channels=opt_net['out_nc'],
+                                    mid_channels=opt_net['nf'], num_blocks=opt_net['nb'])
     elif which_model == 'rcan':
         #args: n_resgroups, n_resblocks, res_scale, reduction, scale, n_feats
         opt_net['rgb_range'] = 255