From ef5fb5f5fc3c4b54fd1956e1592dd937c1c10f34 Mon Sep 17 00:00:00 2001 From: Danila Berezin <79604071+faad3@users.noreply.github.com> Date: Tue, 17 May 2022 18:34:54 +0300 Subject: [PATCH] Fix bug in load_voices in audio.py The read.py script did not work with .pth latents, so this fixes a bug in audio.py. In the elif statement, the variables should be clip and clips rather than voice and voices. Also, torch.stack does not work on tuples, so that operation had to be split in two. --- tortoise/utils/audio.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tortoise/utils/audio.py b/tortoise/utils/audio.py index 2422e97..fda6380 100644 --- a/tortoise/utils/audio.py +++ b/tortoise/utils/audio.py @@ -119,14 +119,16 @@ def load_voices(voices): if latent is None: assert len(latents) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." clips.extend(clip) - elif voice is None: - assert len(voices) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." + elif clip is None: + assert len(clips) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this." latents.append(latent) if len(latents) == 0: return clips, None else: - latents = torch.stack(latents, dim=0) - return None, latents.mean(dim=0) + latents_0 = torch.stack([l[0] for l in latents], dim=0).mean(dim=0) + latents_1 = torch.stack([l[1] for l in latents], dim=0).mean(dim=0) + latents = (latents_0,latents_1) + return None, latents class TacotronSTFT(torch.nn.Module): @@ -178,4 +180,4 @@ def wav_to_univnet_mel(wav, do_normalization=False): mel = stft.mel_spectrogram(wav) if do_normalization: mel = normalize_tacotron_mel(mel) - return mel \ No newline at end of file + return mel