diff --git a/codes/models/archs/RRDBNet_arch.py b/codes/models/archs/RRDBNet_arch.py
index 558c24c9..7be467d5 100644
--- a/codes/models/archs/RRDBNet_arch.py
+++ b/codes/models/archs/RRDBNet_arch.py
@@ -3,19 +3,20 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import models.archs.arch_util as arch_util
+from models.archs.arch_util import PixelUnshuffle
 import torchvision
 import switched_conv as switched_conv
 
 
 class ResidualDenseBlock_5C(nn.Module):
-    def __init__(self, nf=64, gc=32, bias=True):
+    def __init__(self, nf=64, gc=32, bias=True, late_stage_kernel_size=3, late_stage_padding=1):
         super(ResidualDenseBlock_5C, self).__init__()
         # gc: growth channel, i.e. intermediate channels
         self.conv1 = nn.Conv2d(nf, gc, 3, 1, 1, bias=bias)
         self.conv2 = nn.Conv2d(nf + gc, gc, 3, 1, 1, bias=bias)
-        self.conv3 = nn.Conv2d(nf + 2 * gc, gc, 3, 1, 1, bias=bias)
-        self.conv4 = nn.Conv2d(nf + 3 * gc, gc, 3, 1, 1, bias=bias)
-        self.conv5 = nn.Conv2d(nf + 4 * gc, nf, 3, 1, 1, bias=bias)
+        self.conv3 = nn.Conv2d(nf + 2 * gc, gc, late_stage_kernel_size, 1, late_stage_padding, bias=bias)
+        self.conv4 = nn.Conv2d(nf + 3 * gc, gc, late_stage_kernel_size, 1, late_stage_padding, bias=bias)
+        self.conv5 = nn.Conv2d(nf + 4 * gc, nf, late_stage_kernel_size, 1, late_stage_padding, bias=bias)
         self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
 
         # initialization
@@ -32,9 +33,15 @@ class ResidualDenseBlock_5C(nn.Module):
 
 
 # Multiple 5-channel residual block that uses learned switching to diversify its outputs.
+# If multi_head_input=False: takes standard (b,f,w,h) input tensor; else takes (b,heads,f,w,h) input tensor. Note that the default RDB block does not support this format, so use SwitchedRDB_5C_MultiHead for this case.
+# If collapse_heads=True, outputs (b,f,w,h) tensor.
+# If collapse_heads=False, outputs (b,heads,f,w,h) tensor.
 class SwitchedRDB_5C(switched_conv.MultiHeadSwitchedAbstractBlock):
-    def __init__(self, nf=64, gc=32, num_convs=8, num_heads=2, init_temperature=1):
-        rdb5c = functools.partial(ResidualDenseBlock_5C, nf, gc)
+    def __init__(self, nf=64, gc=32, num_convs=8, num_heads=2, init_temperature=1, multi_head_input=False, collapse_heads=True, force_block=None):
+        if force_block is None:
+            rdb5c = functools.partial(ResidualDenseBlock_5C, nf, gc)
+        else:
+            rdb5c = force_block
         super(SwitchedRDB_5C, self).__init__(
             rdb5c,
             nf,
@@ -43,22 +50,70 @@ class SwitchedRDB_5C(switched_conv.MultiHeadSwitchedAbstractBlock):
             att_kernel_size=3,
             att_pads=1,
             initial_temperature=init_temperature,
+            multi_head_input=multi_head_input,
+            concat_heads_into_filters=collapse_heads,
         )
-        self.mhead_collapse = nn.Conv2d(num_heads * nf, nf, 1)
+        self.collapse_heads = collapse_heads
+        if self.collapse_heads:
+            self.mhead_collapse = nn.Conv2d(num_heads * nf, nf, 1)
+            arch_util.initialize_weights([self.mhead_collapse], 1)
 
         arch_util.initialize_weights([sw.attention_conv1 for sw in self.switches] +
-                                     [sw.attention_conv2 for sw in self.switches] +
-                                     [self.mhead_collapse], 1)
+                                     [sw.attention_conv2 for sw in self.switches], 1)
 
     def forward(self, x, output_attention_weights=False):
         outs = super(SwitchedRDB_5C, self).forward(x, output_attention_weights)
         if output_attention_weights:
             outs, atts = outs
-        # outs need to be collapsed back down to a single heads worth of data.
-        out = self.mhead_collapse(outs)
+        else:
+            # No attention weights were requested; bind atts so the unconditional return below cannot hit an unbound local.
+            atts = None
+
+        # With collapse_heads set, the 1x1 conv collapses outs back down to a single head's worth of data; otherwise pass through.
+        out = self.mhead_collapse(outs) if self.collapse_heads else outs
+
         return out, atts
 
 
+# Implementation of ResidualDenseBlock_5C which compresses multiple switching heads via a Conv3d before doing RDB
+# computation.
+class ResidualDenseBlock_5C_WithMheadConverter(ResidualDenseBlock_5C):
+    def __init__(self, nf=64, gc=32, bias=True, heads=2):
+        # Switched blocks generally operate at low resolution, where kernel size matters much less, so conv3-conv5 use kernel size 1 (padding 0).
+        super(ResidualDenseBlock_5C_WithMheadConverter, self).__init__(nf=nf, gc=gc, bias=bias, late_stage_kernel_size=1,
+                                                                       late_stage_padding=0)
+        self.heads = heads
+        self.converter = nn.Conv3d(nf, nf, kernel_size=(heads, 1, 1), stride=(heads, 1, 1))  # kernel spans all `heads` along the depth axis
+
+    # Accepts input of shape (b, heads, f, w, h); the heads are merged by self.converter before the parent RDB forward runs.
+    def forward(self, x):
+        # Permute filter dim to 1, giving (b, f, heads, w, h) so Conv3d sees filters as channels and heads as depth.
+        x = x.permute(0, 2, 1, 3, 4)
+        x = self.converter(x)
+        x = torch.squeeze(x, dim=2)  # drops the head axis (size 1 when the input carried exactly self.heads heads)
+        return super(ResidualDenseBlock_5C_WithMheadConverter, self).forward(x)
+
+
+# Multiple 5-channel residual block that uses learned switching to diversify its outputs. The difference between this
+# block and SwitchedRDB_5C is this block accepts multi-headed inputs of format (b,heads,f,w,h).
+#
+# It does this by applying a Conv3d ahead of the RDB computation, which convolves across all heads and collapses the
+# head dimension to 1. That dimension is then squeezed away and the block behaves identically to SwitchedRDB_5C.
+class SwitchedRDB_5C_MultiHead(SwitchedRDB_5C):
+    def __init__(self, nf=64, gc=32, num_convs=8, num_heads=2, init_temperature=1, collapse_heads=False):
+        rdb5c = functools.partial(ResidualDenseBlock_5C_WithMheadConverter, nf, gc, heads=num_heads)  # block factory handed to the parent via force_block
+        super(SwitchedRDB_5C_MultiHead, self).__init__(
+            nf=nf,
+            gc=gc,
+            num_convs=num_convs,
+            num_heads=num_heads,
+            init_temperature=init_temperature,
+            multi_head_input=True,  # this variant always consumes (b,heads,f,w,h) input
+            collapse_heads=collapse_heads,  # False keeps a (b,heads,f,w,h) output; True yields (b,f,w,h)
+            force_block=rdb5c,  # used instead of the parent's default ResidualDenseBlock_5C partial
+        )
+
+
 class RRDB(nn.Module):
     '''Residual in Residual Dense Block'''
 
@@ -74,13 +129,26 @@ class RRDB(nn.Module):
         out = self.RDB3(out)
         return out * 0.2 + x
 
+
+class LowDimRRDB(RRDB):
+    def __init__(self, nf, gc=32, dimensional_adjustment=4):
+        super(LowDimRRDB, self).__init__(nf * (dimensional_adjustment ** 2), gc * (dimensional_adjustment ** 2))  # nf and gc are both scaled to the unshuffled channel count
+        self.unshuffle = PixelUnshuffle(dimensional_adjustment)
+        self.shuffle = nn.PixelShuffle(dimensional_adjustment)
+
+    def forward(self, x):
+        x = self.unshuffle(x)  # trade spatial resolution for channels: f -> f * dimensional_adjustment ** 2
+        x = super(LowDimRRDB, self).forward(x)  # run the regular RRDB forward on the reduced-resolution tensor
+        return self.shuffle(x)  # reverse the rearrangement: channels back into spatial resolution
+
+
 # RRDB block that uses switching on the individual RDB modules that compose it to increase learning diversity.
 class SwitchedRRDB(RRDB):
-    def __init__(self, nf, gc=32, num_convs=8, init_temperature=1, final_temperature_step=1):
-        super(RRDB, self).__init__()
-        self.RDB1 = SwitchedRDB_5C(nf, gc, num_convs=num_convs, init_temperature=init_temperature)
-        self.RDB2 = SwitchedRDB_5C(nf, gc, num_convs=num_convs, init_temperature=init_temperature)
-        self.RDB3 = SwitchedRDB_5C(nf, gc, num_convs=num_convs, init_temperature=init_temperature)
+    def __init__(self, nf, gc=32, num_convs=8, init_temperature=1, final_temperature_step=1, switching_block=SwitchedRDB_5C):
+        super(SwitchedRRDB, self).__init__(nf, gc)
+        self.RDB1 = switching_block(nf, gc, num_convs=num_convs, init_temperature=init_temperature)
+        self.RDB2 = switching_block(nf, gc, num_convs=num_convs, init_temperature=init_temperature)
+        self.RDB3 = switching_block(nf, gc, num_convs=num_convs, init_temperature=init_temperature)
         self.init_temperature = init_temperature
         self.final_temperature_step = final_temperature_step
         self.running_mean = 0
@@ -116,6 +184,53 @@ class SwitchedRRDB(RRDB):
         self.running_mean = 0
         return val
 
+
+# Identical to LowDimRRDB but wraps an RRDB rather than inheriting from it. TODO: remove LowDimRRDB when backwards
+# compatibility is no longer desired.
+class LowDimRRDBWrapper(nn.Module):
+    # Do not specify nf or gc on the partial_rrdb passed in; the wrapper supplies both, scaled by dimensional_adjustment ** 2.
+    def __init__(self, nf, partial_rrdb, gc=32, dimensional_adjustment=4):
+        super(LowDimRRDBWrapper, self).__init__()
+        self.rrdb = partial_rrdb(nf=nf * (dimensional_adjustment ** 2), gc=gc * (dimensional_adjustment ** 2))  # passed by keyword, so partial_rrdb must accept nf= and gc=
+        self.unshuffle = PixelUnshuffle(dimensional_adjustment)
+        self.shuffle = nn.PixelShuffle(dimensional_adjustment)
+
+    def forward(self, x):
+        x = self.unshuffle(x)  # trade spatial resolution for channels: f -> f * dimensional_adjustment ** 2
+        x = self.rrdb(x)  # wrapped block runs on the reduced-resolution tensor
+        return self.shuffle(x)  # reverse the rearrangement; assumes the wrapped block preserved the channel count
+
+# RRDB block that uses multi-headed switching on multiple individual RDB blocks to improve diversity. Multiple heads
+# are annealed internally. This variant has a depth of 4 RDB blocks, rather than 3 like others above.
+class SwitchedMultiHeadRRDB(SwitchedRRDB):
+    def __init__(self, nf, gc=32, num_convs=8, num_heads=2, init_temperature=1, final_temperature_step=1):
+        super(SwitchedMultiHeadRRDB, self).__init__(nf=nf, gc=gc, num_convs=num_convs, init_temperature=init_temperature, final_temperature_step=final_temperature_step)  # NOTE(review): the parent also builds RDB1-3, which are replaced below
+        self.RDB1 = SwitchedRDB_5C(nf, gc, num_convs=num_convs, num_heads=num_heads, init_temperature=init_temperature, collapse_heads=False)  # standard input, per-head output
+        self.RDB2 = SwitchedRDB_5C_MultiHead(nf, gc, num_convs=num_convs, num_heads=num_heads, init_temperature=init_temperature, collapse_heads=False)
+        self.RDB3 = SwitchedRDB_5C_MultiHead(nf, gc, num_convs=num_convs, num_heads=num_heads, init_temperature=init_temperature, collapse_heads=False)
+        self.RDB4 = SwitchedRDB_5C_MultiHead(nf, gc, num_convs=num_convs, num_heads=num_heads, init_temperature=init_temperature, collapse_heads=True)  # last block collapses the heads again
+
+    def set_temperature(self, temp):
+        # Apply the same attention temperature to every switch of all four RDB blocks.
+        for rdb in (self.RDB1, self.RDB2, self.RDB3, self.RDB4):
+            for sw in rdb.switches:
+                sw.set_attention_temperature(temp)
+
+    def forward(self, x):
+        out, att1 = self.RDB1(x, True)  # True -> each block also returns its attention weights
+        out, att2 = self.RDB2(out, True)
+        out, att3 = self.RDB3(out, True)
+        out, att4 = self.RDB4(out, True)
+
+        a1mean, _ = switched_conv.compute_attention_specificity(att1, 2)
+        a2mean, _ = switched_conv.compute_attention_specificity(att2, 2)
+        a3mean, _ = switched_conv.compute_attention_specificity(att3, 2)
+        a4mean, _ = switched_conv.compute_attention_specificity(att4, 2)
+        self.running_mean += (a1mean + a2mean + a3mean + a4mean) / 4.0  # average of the four terms; the previous divisor of 3.0 did not match
+        self.running_count += 1
+
+        return out * 0.2 + x  # scaled residual connection around the four blocks
+
 # This module performs the majority of the processing done by RRDBNet. It just doesn't have the upsampling at the end.
 class RRDBTrunk(nn.Module):
     def __init__(self, nf_in, nf_out, nb, gc=32, initial_stride=1, rrdb_block_f=None, conv_first_block=None):
@@ -295,21 +410,18 @@ class AssistedRRDBNet(nn.Module):
 
         return (out,)
 
-
 class PixShuffleInitialConv(nn.Module):
     def __init__(self, reduction_factor, nf_out):
         super(PixShuffleInitialConv, self).__init__()
         self.conv = nn.Conv2d(3 * (reduction_factor ** 2), nf_out, 1)
-        self.r = reduction_factor
+        self.unshuffle = PixelUnshuffle(reduction_factor)
 
     def forward(self, x):
         (b, f, w, h) = x.shape
         # This module can only be applied to input images (with 3 channels)
         assert f == 3
-        # Perform a "reverse-pixel-shuffle", reducing the image size and increasing filter count by self.r**2
-        x = x.contiguous().view(b, 3, w // self.r, self.r, h // self.r, self.r)
-        x = x.permute(0, 1, 3, 5, 2, 4).contiguous().view(b, 3 * (self.r ** 2), w // self.r, h // self.r)
-        # Apply the conv to bring the filter account to the desired size.
+        # Perform a "reverse-pixel-shuffle" (shrinks the image, multiplies the filter count by reduction_factor ** 2), then conv to nf_out filters.
+        x = self.unshuffle(x)
         return self.conv(x)
 
 # This class uses a RRDBTrunk to perform processing on an image, then upsamples it.
@@ -346,44 +458,4 @@ class PixShuffleRRDB(RRDBBase):
         fea = self.lrelu(self.upconv2(fea))
         out = self.conv_last(self.lrelu(self.HRconv(fea)))
 
-        return (out,)
-
-
-# This class uses two RRDB trunks to process an image at different resolution levels.
-class MultiRRDBNet(RRDBBase):
-    def __init__(self, nf_base, gc_base, lo_blocks, hi_blocks, scale=2, rrdb_block_f=None):
-        super(MultiRRDBNet, self).__init__()
-
-        # Chained trunks
-        lo_nf = nf_base * 4
-        lo_nf_out = nf_base // 4
-        hi_nf = nf_base
-        self.lo_trunk = RRDBTrunk(nf_base, lo_nf, lo_blocks, gc_base * 2, initial_stride=1, rrdb_block_f=rrdb_block_f, conv_first_block=PixShuffleInitialConv(4, lo_nf))
-        self.skip_conv = nn.Conv2d(3, lo_nf_out, 1)
-        self.hi_trunk = RRDBTrunk(lo_nf_out, hi_nf, hi_blocks, gc_base, initial_stride=1, rrdb_block_f=rrdb_block_f)
-        self.trunks = [self.lo_trunk, self.hi_trunk]
-
-        # Upsampling
-        self.scale = scale
-        self.upconv1 = nn.Conv2d(hi_nf, hi_nf, 5, 1, padding=2, bias=True)
-        self.upconv2 = nn.Conv2d(hi_nf, hi_nf, 5, 1, padding=2, bias=True)
-        self.HRconv = nn.Conv2d(hi_nf, hi_nf, 5, 1, padding=2, bias=True)
-        self.conv_last = nn.Conv2d(hi_nf, 3, 3, 1, 1, bias=True)
-        self.pixel_shuffle = nn.PixelShuffle(4)
-
-        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)
-
-    def forward(self, x):
-        fea_lo = self.lo_trunk(x)
-        fea = self.pixel_shuffle(fea_lo) + self.skip_conv(x)
-        fea = self.hi_trunk(fea)
-
-        # Upsampling.
-        fea = F.interpolate(fea, scale_factor=2, mode='nearest')
-        fea = self.lrelu(self.upconv1(fea))
-        if self.scale >= 4:
-            fea = F.interpolate(fea, scale_factor=2, mode='nearest')
-        fea = self.lrelu(self.upconv2(fea))
-        out = self.conv_last(self.lrelu(self.HRconv(fea)))
-
         return (out,)
\ No newline at end of file
diff --git a/codes/models/archs/arch_util.py b/codes/models/archs/arch_util.py
index 9062bbf5..ecb7be76 100644
--- a/codes/models/archs/arch_util.py
+++ b/codes/models/archs/arch_util.py
@@ -128,3 +128,16 @@ def flow_warp(x, flow, interp_mode='bilinear', padding_mode='zeros'):
     vgrid_scaled = torch.stack((vgrid_x, vgrid_y), dim=3)
     output = F.grid_sample(x, vgrid_scaled, mode=interp_mode, padding_mode=padding_mode)
     return output
+
+
+class PixelUnshuffle(nn.Module):
+    def __init__(self, reduction_factor):
+        super(PixelUnshuffle, self).__init__()
+        self.r = reduction_factor  # each of the two trailing dims shrinks by this factor
+
+    def forward(self, x):
+        (b, f, w, h) = x.shape  # requires a 4-D input; the unpack fails otherwise
+        x = x.contiguous().view(b, f, w // self.r, self.r, h // self.r, self.r)  # split each trailing dim into (blocks, r); view() fails unless both divide evenly by r
+        x = x.permute(0, 1, 3, 5, 2, 4).contiguous().view(b, f * (self.r ** 2), w // self.r, h // self.r)  # fold both r axes into the filter dim: f -> f * r ** 2
+        return x
+
diff --git a/codes/models/networks.py b/codes/models/networks.py
index 89bfc384..44ef9a94 100644
--- a/codes/models/networks.py
+++ b/codes/models/networks.py
@@ -38,14 +38,17 @@ def define_G(opt, net_key='network_G'):
                                     rrdb_block_f=functools.partial(RRDBNet_arch.SwitchedRRDB, nf=opt_net['nf'], gc=opt_net['gc'],
                                                                    init_temperature=opt_net['temperature'],
                                                                    final_temperature_step=opt_net['temperature_final_step']))
-    elif which_model == 'MultiRRDBNet':
-        block_f = None
-        if opt_net['attention']:
-            block_f = functools.partial(RRDBNet_arch.SwitchedRRDB, nf=opt_net['nf'], gc=opt_net['gc'],
-                                        init_temperature=opt_net['temperature'],
-                                        final_temperature_step=opt_net['temperature_final_step'])
-        netG = RRDBNet_arch.MultiRRDBNet(nf_base=opt_net['nf'], gc_base=opt_net['gc'], lo_blocks=opt_net['lo_blocks'],
-                                         hi_blocks=opt_net['hi_blocks'], scale=scale, rrdb_block_f=block_f)
+    elif which_model == 'LowDimRRDBNet':
+        rrdb = functools.partial(RRDBNet_arch.LowDimRRDB, nf=opt_net['nf'], gc=opt_net['gc'], dimensional_adjustment=opt_net['dim'])
+        netG = RRDBNet_arch.RRDBNet(in_nc=opt_net['in_nc'], out_nc=opt_net['out_nc'],
+                                    nf=opt_net['nf'], nb=opt_net['nb'], scale=scale, rrdb_block_f=rrdb)
+    elif which_model == "LowDimRRDBWithMultiHeadSwitching":
+        switcher = functools.partial(RRDBNet_arch.SwitchedMultiHeadRRDB, num_convs=opt_net['num_convs'], num_heads=opt_net['num_heads'],
+                                 init_temperature=opt_net['temperature'], final_temperature_step=opt_net['temperature_final_step'])
+        rrdb = functools.partial(RRDBNet_arch.LowDimRRDBWrapper, nf=opt_net['nf'], gc=opt_net['gc'], dimensional_adjustment=opt_net['dim'],
+                                 partial_rrdb=switcher)
+        netG = RRDBNet_arch.RRDBNet(in_nc=opt_net['in_nc'], out_nc=opt_net['out_nc'],
+                                    nf=opt_net['nf'], nb=opt_net['nb'], scale=scale, rrdb_block_f=rrdb)
     elif which_model == 'PixRRDBNet':
         block_f = None
         if opt_net['attention']: