diff --git a/codes/data/audio/paired_voice_audio_dataset.py b/codes/data/audio/paired_voice_audio_dataset.py index 3625b3ef..d6123229 100644 --- a/codes/data/audio/paired_voice_audio_dataset.py +++ b/codes/data/audio/paired_voice_audio_dataset.py @@ -106,7 +106,7 @@ class TextWavLoader(torch.utils.data.Dataset): random.shuffle(self.audiopaths_and_text) self.max_wav_len = opt_get(hparams, ['max_wav_length'], None) if self.max_wav_len is not None: - self.max_aligned_codes = self.max_wav_len / self.aligned_codes_to_audio_ratio + self.max_aligned_codes = self.max_wav_len // self.aligned_codes_to_audio_ratio self.max_text_len = opt_get(hparams, ['max_text_length'], None) assert self.max_wav_len is not None and self.max_text_len is not None self.use_bpe_tokenizer = opt_get(hparams, ['use_bpe_tokenizer'], True) @@ -239,7 +239,7 @@ if __name__ == '__main__': 'num_conditioning_candidates': 2, 'conditioning_length': 44000, 'use_bpe_tokenizer': True, - 'load_aligned_codes': False, + 'load_aligned_codes': True, } from data import create_dataset, create_dataloader diff --git a/codes/models/gpt_voice/unet_diffusion_tts.py b/codes/models/gpt_voice/unet_diffusion_tts.py index 4b20e0da..5f2d3cc6 100644 --- a/codes/models/gpt_voice/unet_diffusion_tts.py +++ b/codes/models/gpt_voice/unet_diffusion_tts.py @@ -46,8 +46,7 @@ class DiffusionTts(nn.Module): model_channels, in_channels=1, num_tokens=30, - out_channels=2, # mean and variance - discrete_codes=512, + out_channels=2, # mean and variance dropout=0, # res 1, 2, 4, 8,16,32,64,128,256,512, 1K, 2K channel_mult= (1,1.5,2, 3, 4, 6, 8, 12, 16, 24, 32, 48), @@ -68,7 +67,6 @@ class DiffusionTts(nn.Module): kernel_size=3, scale_factor=2, conditioning_inputs_provided=True, - conditioning_input_dim=80, time_embed_dim_multiplier=4, only_train_dvae_connection_layers=False, ):