diff --git a/codes/data/audio/nv_tacotron_dataset.py b/codes/data/audio/nv_tacotron_dataset.py
index 61a8f9c7..80cf40da 100644
--- a/codes/data/audio/nv_tacotron_dataset.py
+++ b/codes/data/audio/nv_tacotron_dataset.py
@@ -108,9 +108,9 @@ class TextWavLoader(torch.utils.data.Dataset):
             return {
                 'real_text': text,
                 'padded_text': tseq,
-                'input_lengths': torch.tensor(orig_text_len, dtype=torch.long),
+                'text_lengths': torch.tensor(orig_text_len, dtype=torch.long),
                 'wav': wav,
-                'output_lengths': torch.tensor(orig_output, dtype=torch.long),
+                'wav_lengths': torch.tensor(orig_output, dtype=torch.long),
                 'filenames': path
             }
         return tseq, wav, path, text
@@ -159,9 +159,9 @@ class TextMelCollate():
         return {
             'padded_text': text_padded,
-            'input_lengths': input_lengths,
+            'text_lengths': input_lengths,
             'wav': wav_padded,
-            'output_lengths': output_lengths,
+            'wav_lengths': output_lengths,
             'filenames': filenames,
             'real_text': real_text,
         }
 
@@ -171,14 +171,14 @@ if __name__ == '__main__':
     batch_sz = 32
     params = {
         'mode': 'nv_tacotron',
-        'path': 'E:\\audio\\MozillaCommonVoice\\en\\test.tsv',
+        'path': ['Z:\\bigasr_dataset\\libritts\\test-clean_list.txt'],
         'phase': 'train',
-        'n_workers': 0,
+        'n_workers': 1,
         'batch_size': batch_sz,
-        'fetcher_mode': 'mozilla_cv',
+        'fetcher_mode': ['libritts'],
         'needs_collate': True,
-        #'max_wav_length': 256000,
-        #'max_text_length': 200,
+        'max_wav_length': 256000,
+        'max_text_length': 200,
         'sample_rate': 22050,
     }
     from data import create_dataset, create_dataloader
diff --git a/codes/models/gpt_voice/gpt_asr_hf2.py b/codes/models/gpt_voice/gpt_asr_hf2.py
index 95e78e89..b6866166 100644
--- a/codes/models/gpt_voice/gpt_asr_hf2.py
+++ b/codes/models/gpt_voice/gpt_asr_hf2.py
@@ -202,7 +202,8 @@ class GPT2InferenceModel(GPT2PreTrainedModel):
 
 class GptAsrHf2(nn.Module):
     NUMBER_SYMBOLS = len(symbols)
-    NUMBER_TEXT_TOKENS = NUMBER_SYMBOLS+1
+    START_TOKEN = NUMBER_SYMBOLS
+    NUMBER_TEXT_TOKENS = NUMBER_SYMBOLS+2
 
     def __init__(self, layers=8, model_dim=512, heads=8, max_symbols_per_phrase=800, max_mel_frames=3000, checkpointing=True):
         super().__init__()
@@ -230,7 +231,7 @@ class GptAsrHf2(nn.Module):
 
     def get_logits(self, mel_inputs, text_targets, get_attns=False):
         # Pad front remove last element to set up next token prediction. Pad at front is the "START" token.
-        text_targets = F.pad(text_targets, (1,0), value=self.NUMBER_SYMBOLS)[:, :-1]
+        text_targets = F.pad(text_targets, (1,0), value=self.START_TOKEN)[:, :-1]
         text_emb = self.gpt.get_input_embeddings()(text_targets)
         text_emb = text_emb + self.text_pos_embedding(torch.arange(text_emb.shape[1], device=text_targets.device))
         mel_emb = self.mel_encoder(mel_inputs)
diff --git a/codes/trainer/injectors/spec_augment.py b/codes/trainer/injectors/spec_augment.py
index fa03ec71..620f10af 100644
--- a/codes/trainer/injectors/spec_augment.py
+++ b/codes/trainer/injectors/spec_augment.py
@@ -3,6 +3,8 @@
 import numpy as np
 import random
+
+import torch
 import torchvision.utils
 
 from trainer.inject import Injector
 
@@ -34,6 +36,7 @@ def spec_augment(mel_spectrogram, frequency_masking_para=27, time_masking_para=7
     return mel_spectrogram
 
 
+
 class MelMaskInjector(Injector):
     def __init__(self, opt, env):
         super().__init__(opt, env)
@@ -54,7 +57,8 @@ def visualization_spectrogram(spec, title):
     spec = ((spec + 1) / 2).clip(0, 1)
     torchvision.utils.save_image(spec, f'{title}.png')
 
-if __name__ == '__main__':
+
+def test_mel_mask():
     from data.audio.unsupervised_audio_dataset import load_audio
     from trainer.injectors.base_injectors import MelSpectrogramInjector
     spec_maker = MelSpectrogramInjector({'in': 'audio', 'out': 'spec'}, {})
@@ -63,3 +67,63 @@ if __name__ == '__main__':
     visualization_spectrogram(s, 'original spec')
     saug = spec_augment(s, 50, 5, 1, 3)
     visualization_spectrogram(saug, 'modified spec')
+
+
+'''
+Crafty bespoke injector used when training ASR models to create longer sequences, ensuring that the entire
+input length embedding is trained. Does this by concatenating adjacent pairs of batch elements to create longer
+sequences which (theoretically) use similar amounts of GPU memory.
+'''
+class CombineMelInjector(Injector):
+    def __init__(self, opt, env):
+        super().__init__(opt, env)
+        self.audio_key = opt['audio_key']
+        self.text_key = opt['text_key']
+        self.audio_lengths = opt['audio_lengths_key']
+        self.text_lengths = opt['text_lengths_key']
+        from models.tacotron2.text import symbols
+        self.text_separator = len(symbols)+1  # The token one past START_TOKEN in GptAsrHf2's expanded vocabulary. Probably need to allow this to be set by user.
+
+    def forward(self, state):
+        audio = state[self.audio_key]
+        texts = state[self.text_key]
+        audio_lengths = state[self.audio_lengths]
+        text_lengths = state[self.text_lengths]
+        assert audio.shape[0] % 2 == 0  # The batch must contain an even number of elements.
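+        # Walk the batch in adjacent pairs: trim each element to its true length, concatenate the pair,
+        # then re-pad to twice the original max length so the combined batch stays rectangular.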
+        combined_audios = []
+        combined_texts = []
+        for b in range(audio.shape[0]//2):
+            a1 = audio[b*2, :audio_lengths[b*2]]
+            a2 = audio[b*2+1, :audio_lengths[b*2+1]]
+            a = torch.cat([a1, a2], dim=0)
+            a = torch.nn.functional.pad(a, (0, audio.shape[-1]*2-a.shape[-1]))
+            combined_audios.append(a)
+
+            t1 = texts[b*2, :text_lengths[b*2]]
+            # Join the two transcripts with the dedicated separator token.
+            t1 = torch.nn.functional.pad(t1, (0, 1), value=self.text_separator)
+            t2 = texts[b*2+1, :text_lengths[b*2+1]]
+            t = torch.cat([t1, t2], dim=0)
+            t = torch.nn.functional.pad(t, (0, texts.shape[-1]*2-t.shape[-1]))
+            combined_texts.append(t)
+        return {self.audio_key: torch.stack(combined_audios, dim=0),
+                self.text_key: torch.stack(combined_texts, dim=0)}
+
+
+def test_mel_injector():
+    inj = CombineMelInjector({'audio_key': 'a', 'text_key': 't', 'audio_lengths_key': 'alk', 'text_lengths_key': 'tlk'}, {})
+    a = torch.rand((4, 22000))
+    al = torch.tensor([11000, 14000, 22000, 20000])
+    t = torch.randint(0, 120, (4, 250))
+    tl = torch.tensor([100, 120, 200, 250])
+    rs = inj({'a': a, 't': t, 'alk': al, 'tlk': tl})
+    # Two pairs out, each padded to double the original max length (the +1 separator is absorbed by the pad).
+    assert rs['a'].shape == (2, 44000)
+    assert rs['t'].shape == (2, 500)
+
+
+
+if __name__ == '__main__':
+    test_mel_injector()
\ No newline at end of file