Fix SPSR calls into SwitchComputer

James Betker 2020-10-13 10:14:47 -06:00
parent bdf4c38899
commit cc915303a5


@@ -271,18 +271,18 @@ class Spsr5(nn.Module):
         x = self.model_fea_conv(x)
         x1 = x
-        x1, a1 = self.sw1(x1, True, identity=x, att_in=(x1, embedding))
+        x1, a1 = self.sw1(x1, identity=x, att_in=(x1, embedding))
         x2 = x1
         x2, nstd = self.noise_ref_join(x2, torch.randn_like(x2))
-        x2, a2 = self.sw2(x2, True, identity=x1, att_in=(x2, embedding))
+        x2, a2 = self.sw2(x2, identity=x1, att_in=(x2, embedding))
         noise_stds.append(nstd)
         x_grad = self.grad_conv(x_grad)
         x_grad_identity = x_grad
         x_grad, nstd = self.noise_ref_join_grad(x_grad, torch.randn_like(x_grad))
         x_grad, grad_fea_std = self.grad_ref_join(x_grad, x1)
-        x_grad, a3 = self.sw_grad(x_grad, True, identity=x_grad_identity, att_in=(x_grad, embedding))
+        x_grad, a3 = self.sw_grad(x_grad, identity=x_grad_identity, att_in=(x_grad, embedding))
         x_grad = self.grad_lr_conv(x_grad)
         x_grad = self.grad_lr_conv2(x_grad)
         x_grad_out = self.upsample_grad(x_grad)
@@ -292,7 +292,7 @@ class Spsr5(nn.Module):
         x_out = x2
         x_out, nstd = self.noise_ref_join_conjoin(x_out, torch.randn_like(x_out))
         x_out, fea_grad_std = self.conjoin_ref_join(x_out, x_grad)
-        x_out, a4 = self.conjoin_sw(x_out, True, identity=x2, att_in=(x_out, embedding))
+        x_out, a4 = self.conjoin_sw(x_out, identity=x2, att_in=(x_out, embedding))
         x_out = self.final_lr_conv(x_out)
         x_out = self.upsample(x_out)
         x_out = self.final_hr_conv1(x_out)
@@ -404,15 +404,15 @@ class Spsr6(nn.Module):
         x = self.model_fea_conv(x)
         x1 = x
-        x1, a1 = self.sw1(x1, True, identity=x)
+        x1, a1 = self.sw1(x1, identity=x)
         x2 = x1
-        x2, a2 = self.sw2(x2, True, identity=x1)
+        x2, a2 = self.sw2(x2, identity=x1)
         x_grad = self.grad_conv(x_grad)
         x_grad_identity = x_grad
         x_grad, grad_fea_std = self.grad_ref_join(x_grad, x1)
-        x_grad, a3 = self.sw_grad(x_grad, True, identity=x_grad_identity)
+        x_grad, a3 = self.sw_grad(x_grad, identity=x_grad_identity)
         x_grad = self.grad_lr_conv(x_grad)
         x_grad = self.grad_lr_conv2(x_grad)
         x_grad_out = self.upsample_grad(x_grad)
@@ -420,7 +420,7 @@ class Spsr6(nn.Module):
         x_out = x2
         x_out, fea_grad_std = self.conjoin_ref_join(x_out, x_grad)
-        x_out, a4 = self.conjoin_sw(x_out, True, identity=x2)
+        x_out, a4 = self.conjoin_sw(x_out, identity=x2)
         x_out = self.final_lr_conv(x_out)
         x_out = checkpoint(self.upsample, x_out)
         x_out = checkpoint(self.final_hr_conv1, x_out)
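
The Spsr6 and Spsr7 hunks route their heavy convolutions and upsamplers through checkpoint, i.e. activation checkpointing: rather than storing a module's intermediate activations for the backward pass, the forward pass is re-run during backprop, trading extra compute for GPU memory. The diff does not show where checkpoint is imported from, so the following is only a minimal sketch of that call pattern under the assumption that it behaves like torch.utils.checkpoint.checkpoint; the tiny conv module is a made-up stand-in, not SPSR code.

import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint

# Made-up stand-in module; any callable works as checkpoint()'s first argument.
conv = nn.Conv2d(8, 8, kernel_size=3, padding=1)
x = torch.randn(1, 8, 16, 16, requires_grad=True)

# Equivalent to y = conv(x), except the activations inside conv are not
# stored; they are recomputed when backward() reaches this node.
y = checkpoint(conv, x)
y.sum().backward()

One caveat with this pattern: at least one tensor argument must carry requires_grad=True, or there is nothing for the recomputed subgraph to differentiate through.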
@@ -543,15 +543,15 @@ class Spsr7(nn.Module):
         x = x + br
         x1 = x
-        x1, a1 = self.sw1(x1, True, identity=x, att_in=(x1, ref_embedding))
+        x1, a1 = self.sw1(x1, identity=x, att_in=(x1, ref_embedding), do_checkpointing=True)
         x2 = x1
-        x2, a2 = self.sw2(x2, True, identity=x1, att_in=(x2, ref_embedding))
+        x2, a2 = self.sw2(x2, identity=x1, att_in=(x2, ref_embedding), do_checkpointing=True)
         x_grad = self.grad_conv(x_grad)
         x_grad_identity = x_grad
         x_grad, grad_fea_std = checkpoint(self.grad_ref_join, x_grad, x1)
-        x_grad, a3 = self.sw_grad(x_grad, True, identity=x_grad_identity, att_in=(x_grad, ref_embedding))
+        x_grad, a3 = self.sw_grad(x_grad, identity=x_grad_identity, att_in=(x_grad, ref_embedding), do_checkpointing=True)
         x_grad = checkpoint(self.grad_lr_conv, x_grad)
         x_grad = checkpoint(self.grad_lr_conv2, x_grad)
         x_grad_out = checkpoint(self.upsample_grad, x_grad)
@@ -559,7 +559,7 @@ class Spsr7(nn.Module):
         x_out = x2
         x_out, fea_grad_std = self.conjoin_ref_join(x_out, x_grad)
-        x_out, a4 = self.conjoin_sw(x_out, True, identity=x2, att_in=(x_out, ref_embedding))
+        x_out, a4 = self.conjoin_sw(x_out, identity=x2, att_in=(x_out, ref_embedding), do_checkpointing=True)
         x_out = checkpoint(self.final_lr_conv, x_out)
         x_out = checkpoint(self.upsample, x_out)
         x_out = checkpoint(self.final_hr_conv1, x_out)
@ -620,9 +620,9 @@ class AttentionBlock(nn.Module):
def forward(self, x, mplex_ref=None, ref=None): def forward(self, x, mplex_ref=None, ref=None):
if self.ref_join is not None: if self.ref_join is not None:
branch, ref_std = self.ref_join(x, ref) branch, ref_std = self.ref_join(x, ref)
return self.switch(branch, True, identity=x, att_in=(branch, mplex_ref)) + (ref_std,) return self.switch(branch, identity=x, att_in=(branch, mplex_ref)) + (ref_std,)
else: else:
return self.switch(x, True, identity=x, att_in=(x, mplex_ref)) return self.switch(x, identity=x, att_in=(x, mplex_ref))
# SPSR7 with incremental improvements and also using the new AttentionBlock to save gpu memory. # SPSR7 with incremental improvements and also using the new AttentionBlock to save gpu memory.
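
Taken together, the hunks make one mechanical change: every call into the switch computer drops the bare positional True it used to pass after the input tensor, and the Spsr7 call sites that want checkpointing now request it explicitly with do_checkpointing=True. The SwitchComputer implementation itself is not part of this diff, so the sketch below is only an assumed illustration of the implied call contract; the class name SwitchComputerSketch, its keyword-only signature, and its trivial body are hypothetical.

import torch
import torch.nn as nn

class SwitchComputerSketch(nn.Module):
    # Hypothetical stand-in, NOT the real SwitchComputer. It only mirrors the
    # call contract implied by the new call sites: everything after the input
    # tensor is keyword-only, so a stray positional True can no longer bind
    # silently to the wrong parameter.
    def forward(self, x, *, identity=None, att_in=None, do_checkpointing=False):
        out = x if identity is None else x + identity
        att = torch.zeros(x.shape[0], 1)  # fake attention map for the 2-tuple return
        return out, att

sw = SwitchComputerSketch()
x = torch.randn(2, 4)

# Pre-commit call shape: sw(x, True, identity=x) -- under a keyword-only
# signature this raises TypeError, which is the kind of mismatch the commit fixes.
# Post-commit call shape:
out, att = sw(x, identity=x, do_checkpointing=True)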