diff --git a/codes/data/audio/unsupervised_audio_dataset.py b/codes/data/audio/unsupervised_audio_dataset.py
index a89a1c1e..d95afe05 100644
--- a/codes/data/audio/unsupervised_audio_dataset.py
+++ b/codes/data/audio/unsupervised_audio_dataset.py
@@ -10,7 +10,7 @@ from audio2numpy import open_audio
 from tqdm import tqdm
 
 from data.util import find_files_of_type, is_audio_file, load_paths_from_cache
-from models.audio.tts.tacotron2 import load_wav_to_torch
+from models.audio.tts.tacotron2.taco_utils import load_wav_to_torch
 from utils.util import opt_get
diff --git a/codes/data/audio/wav_aug.py b/codes/data/audio/wav_aug.py
index e935aa2e..cc21b972 100644
--- a/codes/data/audio/wav_aug.py
+++ b/codes/data/audio/wav_aug.py
@@ -3,7 +3,7 @@ import random
 
 import torch
 import torchaudio.sox_effects
-from models.audio.tts.tacotron2 import load_wav_to_torch
+from models.audio.tts.tacotron2.taco_utils import load_wav_to_torch
 
 
 # Returns random double on [l,h] as a string
diff --git a/codes/models/audio/tts/tacotron2/__init__.py b/codes/models/audio/tts/tacotron2/__init__.py
index e69de29b..feeb8381 100644
--- a/codes/models/audio/tts/tacotron2/__init__.py
+++ b/codes/models/audio/tts/tacotron2/__init__.py
@@ -0,0 +1,5 @@
+from models.audio.tts.tacotron2.taco_utils import *
+from models.audio.tts.tacotron2.text import *
+from models.audio.tts.tacotron2.tacotron2 import *
+from models.audio.tts.tacotron2.stft import *
+from models.audio.tts.tacotron2.layers import *
\ No newline at end of file
diff --git a/codes/models/audio/vocoders/univnet/generator.py b/codes/models/audio/vocoders/univnet/generator.py
index a54f7f91..c469cc75 100644
--- a/codes/models/audio/vocoders/univnet/generator.py
+++ b/codes/models/audio/vocoders/univnet/generator.py
@@ -84,19 +84,15 @@ class UnivNetGenerator(nn.Module):
     def inference(self, c, z=None):
         # pad input mel with zeros to cut artifact
         # see https://github.com/seungwonpark/melgan/issues/8
-        zero = torch.full((1, self.mel_channel, 10), -11.5129).to(c.device)
+        zero = torch.full((c.shape[0], self.mel_channel, 10), -11.5129).to(c.device)
         mel = torch.cat((c, zero), dim=2)
 
         if z is None:
-            z = torch.randn(1, self.noise_dim, mel.size(2)).to(mel.device)
+            z = torch.randn(c.shape[0], self.noise_dim, mel.size(2)).to(mel.device)
 
         audio = self.forward(mel, z)
-        audio = audio.squeeze()  # collapse all dimension except time axis
-        audio = audio[:-(self.hop_length * 10)]
-        audio = MAX_WAV_VALUE * audio
-        audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE - 1)
-        audio = audio.short()
-
+        audio = audio[:, :, :-(self.hop_length * 10)]
+        audio = audio.clamp(min=-1, max=1)
         return audio
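The inference() rewrite above changes both the input handling (batch-sized padding and noise instead of a hardcoded batch of 1) and the output contract (float waveforms in [-1, 1] with batch and channel dims intact, rather than a squeezed int16 tensor). A minimal sketch of that shape arithmetic, with forward() faked and common UnivNet defaults (mel_channel=100, noise_dim=64, hop_length=256) assumed rather than taken from this diff:

import torch

B, MEL_CHANNEL, NOISE_DIM, HOP_LENGTH = 4, 100, 64, 256

c = torch.randn(B, MEL_CHANNEL, 50)  # stand-in for a batch of log-mel inputs

# Pad 10 frames of "silence" (-11.5129 is approximately log(1e-5), a common
# log-mel floor) so the vocoder's edge artifact lands in audio trimmed below.
zero = torch.full((c.shape[0], MEL_CHANNEL, 10), -11.5129)
mel = torch.cat((c, zero), dim=2)

# The noise input now tracks the batch size instead of being hardcoded to 1.
z = torch.randn(c.shape[0], NOISE_DIM, mel.size(2))

# forward() maps (B, mel, T_frames) -> (B, 1, T_frames * hop_length);
# fake it here so the trim math can be checked standalone.
audio = torch.randn(B, 1, mel.size(2) * HOP_LENGTH)

# Keep the batch/channel dims (no squeeze()), trim the 10 padded frames, and
# return float audio in [-1, 1] instead of int16 scaled by MAX_WAV_VALUE.
audio = audio[:, :, :-(HOP_LENGTH * 10)]
audio = audio.clamp(min=-1, max=1)
assert audio.shape == (B, 1, c.size(2) * HOP_LENGTH)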
diff --git a/codes/scripts/audio/test_audio_similarity.py b/codes/scripts/audio/test_audio_similarity.py
index 8c48a0e6..a68332e2 100644
--- a/codes/scripts/audio/test_audio_similarity.py
+++ b/codes/scripts/audio/test_audio_similarity.py
@@ -5,7 +5,7 @@ import torch.nn.functional as F
 
 from data.util import is_wav_file, find_files_of_type
 from models.audio_resnet import resnet50
-from models.audio.tts.tacotron2 import load_wav_to_torch
+from models.audio.tts.tacotron2.taco_utils import load_wav_to_torch
 from scripts.byol.byol_extract_wrapped_model import extract_byol_model_from_state_dict
 
 if __name__ == '__main__':
diff --git a/codes/trainer/eval/audio_diffusion_fid.py b/codes/trainer/eval/audio_diffusion_fid.py
index 294656f8..196a642c 100644
--- a/codes/trainer/eval/audio_diffusion_fid.py
+++ b/codes/trainer/eval/audio_diffusion_fid.py
@@ -125,20 +125,21 @@ class AudioDiffusionFid(evaluator.Evaluator):
         real_resampled = torchaudio.functional.resample(audio, 22050, SAMPLE_RATE).unsqueeze(0)
         univnet_mel = wav_to_univnet_mel(audio)  # to be used for a conditioning input
 
-        output_size = real_resampled.shape[-1]
+        output_size = univnet_mel.shape[-1]
         aligned_codes_compression_factor = output_size // mel_codes.shape[-1]
-        padded_size = ceil_multiple(output_size, 2048)
+        padded_size = ceil_multiple(output_size, self.model.alignment_size)
         padding_added = padded_size - output_size
         padding_needed_for_codes = padding_added // aligned_codes_compression_factor
         if padding_needed_for_codes > 0:
             mel_codes = F.pad(mel_codes, (0, padding_needed_for_codes))
-        output_shape = (1, 1, padded_size)
+        output_shape = (1, 100, padded_size)
         gen_mel = self.diffuser.p_sample_loop(self.model, output_shape,
                                               model_kwargs={'aligned_conditioning': mel_codes,
                                                             'conditioning_input': univnet_mel})
-        gen_wav = self.local_modules['vocoder'](gen_mel)
-        return gen_wav, real_resampled, SAMPLE_RATE
+        gen_wav = self.local_modules['vocoder'].inference(gen_mel)
+        real_dec = self.local_modules['vocoder'].inference(univnet_mel)
+        return gen_wav.float(), real_dec, SAMPLE_RATE
 
     def load_projector(self):
         """
@@ -257,9 +258,9 @@ if __name__ == '__main__':
     from utils.util import load_model_from_config
 
-    diffusion = load_model_from_config('X:\\dlas\\experiments\\train_diffusion_tts9.yml', 'generator',
+    diffusion = load_model_from_config('X:\\dlas\\experiments\\train_diffusion_tts9_mel.yml', 'generator',
                                        also_load_savepoint=False,
-                                       load_path='X:\\dlas\\experiments\\train_diffusion_tts9\\models\\7500_generator_ema.pth').cuda()
+                                       load_path='X:\\dlas\\experiments\\train_diffusion_tts9_mel\\models\\10000_generator_ema.pth').cuda()
     opt_eval = {'eval_tsv': 'Y:\\libritts\\test-clean\\transcribed-brief-w2v.tsv', 'diffusion_steps': 100,
                 'conditioning_free': False, 'conditioning_free_k': 1, 'diffusion_schedule': 'linear',
                 'diffusion_type': 'tts9_mel'}
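The evaluator hunk above now sizes the diffusion output in mel space, (1, 100, padded_size), pads the mel length up to the model's alignment_size, pads the aligned codes by the proportional amount, and decodes both the generated and the ground-truth mels through the same vocoder so the two waveforms being compared pass through identical decoding. A worked example of the padding arithmetic, with illustrative numbers and a local stand-in for the repo's ceil_multiple helper:

def ceil_multiple(base: int, multiple: int) -> int:
    # Round base up to the next multiple; local stand-in for the repo helper.
    rem = base % multiple
    return base if rem == 0 else base + (multiple - rem)

# Illustrative values, not taken from the diff:
output_size = 853      # univnet_mel.shape[-1], in mel frames
alignment_size = 256   # stand-in for self.model.alignment_size
codes_len = 213        # mel_codes.shape[-1]

compression = output_size // codes_len                    # mel frames per code token -> 4
padded_size = ceil_multiple(output_size, alignment_size)  # -> 1024
padding_added = padded_size - output_size                 # -> 171
padding_for_codes = padding_added // compression          # -> 42

# The codes get right-padded by 42 so they keep covering the padded mel, and
# the diffusion output shape becomes (1, 100, 1024): 100 mel channels rather
# than a raw waveform, which is why the vocoder decode step follows.
output_shape = (1, 100, padded_size)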