Fix bug in load_voices in audio.py

The read.py script did not work with .pth latents, so I fixed a bug in audio.py. In the elif statement, voice and voices should be clip and clips. Also, torch.stack does not work with tuples, so I had to split that operation.
Danila Berezin 2022-05-17 18:34:54 +03:00 committed by GitHub
parent e0329de2c2
commit ef5fb5f5fc

@@ -119,14 +119,16 @@ def load_voices(voices):
         if latent is None:
             assert len(latents) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
             clips.extend(clip)
-        elif voice is None:
-            assert len(voices) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
+        elif clip is None:
+            assert len(clips) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
             latents.append(latent)
     if len(latents) == 0:
         return clips, None
     else:
-        latents = torch.stack(latents, dim=0)
-        return None, latents.mean(dim=0)
+        latents_0 = torch.stack([l[0] for l in latents], dim=0).mean(dim=0)
+        latents_1 = torch.stack([l[1] for l in latents], dim=0).mean(dim=0)
+        latents = (latents_0,latents_1)
+        return None, latents
 
 
 class TacotronSTFT(torch.nn.Module):
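
For context, here is a minimal self-contained sketch of the failure and the fix; the latent shapes below are made up for illustration, the point is only that each voice's .pth latent is a tuple of two tensors, which a plain torch.stack call cannot handle.

import torch

# Hypothetical shapes, for illustration only: each voice contributes a
# tuple of two conditioning tensors rather than a single tensor.
voice_a = (torch.randn(1, 1024), torch.randn(1, 1024))
voice_b = (torch.randn(1, 1024), torch.randn(1, 1024))
latents = [voice_a, voice_b]

# torch.stack(latents, dim=0) raises a TypeError here, because it expects
# a sequence of tensors, not tuples. So each slot of the tuple is stacked
# and averaged separately, then the tuple is rebuilt, as in the diff above.
latents_0 = torch.stack([l[0] for l in latents], dim=0).mean(dim=0)
latents_1 = torch.stack([l[1] for l in latents], dim=0).mean(dim=0)
combined = (latents_0, latents_1)

print(combined[0].shape, combined[1].shape)  # torch.Size([1, 1024]) torch.Size([1, 1024])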