From cc915303a5433623304d9100953f25db68b24afc Mon Sep 17 00:00:00 2001
From: James Betker
Date: Tue, 13 Oct 2020 10:14:47 -0600
Subject: [PATCH] Fix SPSR calls into SwitchComputer

---
 codes/models/archs/SPSR_arch.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/codes/models/archs/SPSR_arch.py b/codes/models/archs/SPSR_arch.py
index b822f1a4..b536490e 100644
--- a/codes/models/archs/SPSR_arch.py
+++ b/codes/models/archs/SPSR_arch.py
@@ -271,18 +271,18 @@ class Spsr5(nn.Module):
         x = self.model_fea_conv(x)
 
         x1 = x
-        x1, a1 = self.sw1(x1, True, identity=x, att_in=(x1, embedding))
+        x1, a1 = self.sw1(x1, identity=x, att_in=(x1, embedding))
 
         x2 = x1
         x2, nstd = self.noise_ref_join(x2, torch.randn_like(x2))
-        x2, a2 = self.sw2(x2, True, identity=x1, att_in=(x2, embedding))
+        x2, a2 = self.sw2(x2, identity=x1, att_in=(x2, embedding))
         noise_stds.append(nstd)
 
         x_grad = self.grad_conv(x_grad)
         x_grad_identity = x_grad
         x_grad, nstd = self.noise_ref_join_grad(x_grad, torch.randn_like(x_grad))
         x_grad, grad_fea_std = self.grad_ref_join(x_grad, x1)
-        x_grad, a3 = self.sw_grad(x_grad, True, identity=x_grad_identity, att_in=(x_grad, embedding))
+        x_grad, a3 = self.sw_grad(x_grad, identity=x_grad_identity, att_in=(x_grad, embedding))
         x_grad = self.grad_lr_conv(x_grad)
         x_grad = self.grad_lr_conv2(x_grad)
         x_grad_out = self.upsample_grad(x_grad)
@@ -292,7 +292,7 @@ class Spsr5(nn.Module):
         x_out = x2
         x_out, nstd = self.noise_ref_join_conjoin(x_out, torch.randn_like(x_out))
         x_out, fea_grad_std = self.conjoin_ref_join(x_out, x_grad)
-        x_out, a4 = self.conjoin_sw(x_out, True, identity=x2, att_in=(x_out, embedding))
+        x_out, a4 = self.conjoin_sw(x_out, identity=x2, att_in=(x_out, embedding))
         x_out = self.final_lr_conv(x_out)
         x_out = self.upsample(x_out)
         x_out = self.final_hr_conv1(x_out)
@@ -404,15 +404,15 @@ class Spsr6(nn.Module):
         x = self.model_fea_conv(x)
 
         x1 = x
-        x1, a1 = self.sw1(x1, True, identity=x)
+        x1, a1 = self.sw1(x1, identity=x)
 
         x2 = x1
-        x2, a2 = self.sw2(x2, True, identity=x1)
+        x2, a2 = self.sw2(x2, identity=x1)
 
         x_grad = self.grad_conv(x_grad)
         x_grad_identity = x_grad
         x_grad, grad_fea_std = self.grad_ref_join(x_grad, x1)
-        x_grad, a3 = self.sw_grad(x_grad, True, identity=x_grad_identity)
+        x_grad, a3 = self.sw_grad(x_grad, identity=x_grad_identity)
         x_grad = self.grad_lr_conv(x_grad)
         x_grad = self.grad_lr_conv2(x_grad)
         x_grad_out = self.upsample_grad(x_grad)
@@ -420,7 +420,7 @@ class Spsr6(nn.Module):
 
         x_out = x2
         x_out, fea_grad_std = self.conjoin_ref_join(x_out, x_grad)
-        x_out, a4 = self.conjoin_sw(x_out, True, identity=x2)
+        x_out, a4 = self.conjoin_sw(x_out, identity=x2)
         x_out = self.final_lr_conv(x_out)
         x_out = checkpoint(self.upsample, x_out)
         x_out = checkpoint(self.final_hr_conv1, x_out)
@@ -543,15 +543,15 @@ class Spsr7(nn.Module):
         x = x + br
 
         x1 = x
-        x1, a1 = self.sw1(x1, True, identity=x, att_in=(x1, ref_embedding))
+        x1, a1 = self.sw1(x1, identity=x, att_in=(x1, ref_embedding), do_checkpointing=True)
 
         x2 = x1
-        x2, a2 = self.sw2(x2, True, identity=x1, att_in=(x2, ref_embedding))
+        x2, a2 = self.sw2(x2, identity=x1, att_in=(x2, ref_embedding), do_checkpointing=True)
 
         x_grad = self.grad_conv(x_grad)
         x_grad_identity = x_grad
         x_grad, grad_fea_std = checkpoint(self.grad_ref_join, x_grad, x1)
-        x_grad, a3 = self.sw_grad(x_grad, True, identity=x_grad_identity, att_in=(x_grad, ref_embedding))
+        x_grad, a3 = self.sw_grad(x_grad, identity=x_grad_identity, att_in=(x_grad, ref_embedding), do_checkpointing=True)
         x_grad = checkpoint(self.grad_lr_conv, x_grad)
         x_grad = checkpoint(self.grad_lr_conv2, x_grad)
         x_grad_out = checkpoint(self.upsample_grad, x_grad)
@@ -559,7 +559,7 @@ class Spsr7(nn.Module):
 
         x_out = x2
         x_out, fea_grad_std = self.conjoin_ref_join(x_out, x_grad)
-        x_out, a4 = self.conjoin_sw(x_out, True, identity=x2, att_in=(x_out, ref_embedding))
+        x_out, a4 = self.conjoin_sw(x_out, identity=x2, att_in=(x_out, ref_embedding), do_checkpointing=True)
         x_out = checkpoint(self.final_lr_conv, x_out)
         x_out = checkpoint(self.upsample, x_out)
         x_out = checkpoint(self.final_hr_conv1, x_out)
@@ -620,9 +620,9 @@ class AttentionBlock(nn.Module):
     def forward(self, x, mplex_ref=None, ref=None):
         if self.ref_join is not None:
             branch, ref_std = self.ref_join(x, ref)
-            return self.switch(branch, True, identity=x, att_in=(branch, mplex_ref)) + (ref_std,)
+            return self.switch(branch, identity=x, att_in=(branch, mplex_ref)) + (ref_std,)
         else:
-            return self.switch(x, True, identity=x, att_in=(x, mplex_ref))
+            return self.switch(x, identity=x, att_in=(x, mplex_ref))
 
 
 # SPSR7 with incremental improvements and also using the new AttentionBlock to save gpu memory.