diff --git a/codes/data/audio/fast_paired_dataset.py b/codes/data/audio/fast_paired_dataset.py
index 8839159f..e968bc45 100644
--- a/codes/data/audio/fast_paired_dataset.py
+++ b/codes/data/audio/fast_paired_dataset.py
@@ -260,7 +260,7 @@ class FastPairedVoiceDebugger:
 
 
 if __name__ == '__main__':
-    batch_sz = 256
+    batch_sz = 16
     params = {
         'mode': 'fast_paired_voice_audio',
         'path': ['y:/libritts/train-other-500/transcribed-oco.tsv',
@@ -268,20 +268,19 @@ if __name__ == '__main__':
            'y:/libritts/train-clean-360/transcribed-oco.tsv',
            'y:/clips/books1/transcribed-w2v.tsv',
            'y:/clips/books2/transcribed-w2v.tsv',
-	       'y:/bigasr_dataset/hifi_tts/transcribed-w2v.tsv'],
+	       'y:/bigasr_dataset/hifi_tts/transcribed-w2v.tsv',
+	       'y:/clips/podcasts-1/transcribed-oco.tsv',],
         'phase': 'train',
         'n_workers': 0,
         'batch_size': batch_sz,
-        'max_wav_length': 163840,
-        'max_text_length': 200,
+        'max_wav_length': 220500,
+        'max_text_length': 500,
         'sample_rate': 22050,
         'load_conditioning': True,
-        'num_conditioning_candidates': 1,
-        'conditioning_length': 66000,
-        'use_bpe_tokenizer': False,
+        'num_conditioning_candidates': 2,
+        'conditioning_length': 102400,
+        'use_bpe_tokenizer': True,
         'load_aligned_codes': False,
-        'needs_collate': False,
-        'produce_ctc_metadata': False,
     }
     from data import create_dataset, create_dataloader
 
@@ -302,6 +301,8 @@ if __name__ == '__main__':
             #max_repeats = max(max_repeats, b['ctc_repeats'].max())
             print(f'{i} {ib} {b["real_text"][ib]}')
             save(b, i, ib, 'wav')
+            save(b, i, ib, 'conditioning', 0)
+            save(b, i, ib, 'conditioning', 1)
             pass
         if i > 15:
             break
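
A quick sanity check on the raised dataset limits (a back-of-the-envelope sketch, not code from this patch): at the configured sample_rate of 22050, the new values work out to 10.0 s of target audio and roughly 4.6 s per conditioning clip, with two conditioning candidates now drawn per sample (hence the two extra save() calls above).

    # sketch: durations implied by the updated fast_paired_voice_audio params
    sample_rate = 22050
    max_wav_length = 220500          # 220500 / 22050 = 10.0 s per training clip
    conditioning_length = 102400     # 102400 / 22050 ~= 4.64 s per conditioning clip
    num_conditioning_candidates = 2  # two reference clips drawn per sample
    print(max_wav_length / sample_rate, conditioning_length / sample_rate)
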
diff --git a/codes/models/audio/tts/unet_diffusion_tts_flat0.py b/codes/models/audio/tts/unet_diffusion_tts_flat0.py
index 77fc0260..c56903c0 100644
--- a/codes/models/audio/tts/unet_diffusion_tts_flat0.py
+++ b/codes/models/audio/tts/unet_diffusion_tts_flat0.py
@@ -4,12 +4,9 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import autocast
-from x_transformers import Encoder
-from x_transformers.x_transformers import RelativePositionBias
 
 from models.diffusion.nn import timestep_embedding, normalization, zero_module, conv_nd, linear
 from models.diffusion.unet_diffusion import AttentionBlock, TimestepEmbedSequential, TimestepBlock
-from models.audio.tts.mini_encoder import AudioMiniEncoder
 from trainer.networks import register_model
 from utils.util import checkpoint
 
 
@@ -189,7 +186,7 @@ class DiffusionTtsFlat(nn.Module):
         }
         return groups
 
-    def forward(self, x, timesteps, aligned_conditioning, conditioning_input, lr_input=None, conditioning_free=False):
+    def forward(self, x, timesteps, aligned_conditioning, conditioning_input, conditioning_free=False):
         """
         Apply the model to an input batch.
 
@@ -197,7 +194,6 @@ class DiffusionTtsFlat(nn.Module):
         :param timesteps: a 1-D batch of timesteps.
         :param aligned_conditioning: an aligned latent or sequence of tokens providing useful data about the sample to be produced.
         :param conditioning_input: a full-resolution audio clip that is used as a reference to the style you want decoded.
-        :param lr_input: for super-sampling models, a guidance audio clip at a lower sampling rate.
         :param conditioning_free: When set, all conditioning inputs (including tokens and conditioning_input) will not be considered.
         :return: an [N x C x ...] Tensor of outputs.
         """
diff --git a/codes/trainer/injectors/audio_injectors.py b/codes/trainer/injectors/audio_injectors.py
index cd9adaaf..24a83bee 100644
--- a/codes/trainer/injectors/audio_injectors.py
+++ b/codes/trainer/injectors/audio_injectors.py
@@ -132,7 +132,7 @@ class DiscreteTokenInjector(Injector):
         super().__init__(opt, env)
         cfg = opt_get(opt, ['dvae_config'], "../experiments/train_diffusion_vocoder_22k_level.yml")
         dvae_name = opt_get(opt, ['dvae_name'], 'dvae')
-        self.dvae = load_model_from_config(cfg, dvae_name, device=env['device']).eval()
+        self.dvae = load_model_from_config(cfg, dvae_name, device=f'cuda:{env["device"]}').eval()
 
     def forward(self, state):
         inp = state[self.input]
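
The DiscreteTokenInjector device fix appears to assume env['device'] holds a bare GPU index (e.g. a DDP rank integer) rather than a full torch device string; a minimal sketch under that assumption, with a hypothetical env:

    import torch

    env = {'device': 0}  # hypothetical: an integer GPU index, as the change implies
    dev = torch.device(f'cuda:{env["device"]}')  # -> device(type='cuda', index=0)
    # Caveat: if env['device'] were already 'cuda:0', the f-string would produce
    # the invalid 'cuda:cuda:0', so the two conventions must not be mixed.
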