From dc3d7b16676b912858dd085037a6fa113f3bab99 Mon Sep 17 00:00:00 2001
From: Danila Berezin <79604071+faad3@users.noreply.github.com>
Date: Tue, 17 May 2022 18:34:54 +0300
Subject: [PATCH] Fix bug in load_voices in audio.py

The read.py script did not work with pth latents, so I fixed a bug in
audio.py. In the elif statement, voice and voices should be clip and
clips. Also, torch.stack does not work on a list of tuples, so I had
to split that operation into two per-element stacks.
---
 tortoise/utils/audio.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/tortoise/utils/audio.py b/tortoise/utils/audio.py
index 2422e97..fda6380 100644
--- a/tortoise/utils/audio.py
+++ b/tortoise/utils/audio.py
@@ -119,14 +119,16 @@ def load_voices(voices):
         if latent is None:
             assert len(latents) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
             clips.extend(clip)
-        elif voice is None:
-            assert len(voices) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
+        elif clip is None:
+            assert len(clips) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
             latents.append(latent)
     if len(latents) == 0:
         return clips, None
     else:
-        latents = torch.stack(latents, dim=0)
-        return None, latents.mean(dim=0)
+        latents_0 = torch.stack([l[0] for l in latents], dim=0).mean(dim=0)
+        latents_1 = torch.stack([l[1] for l in latents], dim=0).mean(dim=0)
+        latents = (latents_0,latents_1)
+        return None, latents
 
 
 class TacotronSTFT(torch.nn.Module):
@@ -178,4 +180,4 @@ def wav_to_univnet_mel(wav, do_normalization=False):
     mel = stft.mel_spectrogram(wav)
     if do_normalization:
         mel = normalize_tacotron_mel(mel)
-    return mel
\ No newline at end of file
+    return mel
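
For reviewers, a minimal standalone sketch of the failure and of the
per-element averaging used in the patch. The tensor shapes below are
made up for illustration; the real latents come from load_voice and
their shapes are not assumed here.

import torch

# Each voice's latent is a tuple of two tensors; shapes are illustrative only.
latents = [(torch.randn(1, 1024), torch.randn(1, 100)) for _ in range(3)]

# torch.stack expects a sequence of tensors, so passing the list of tuples
# directly raises a TypeError:
#   torch.stack(latents, dim=0)

# Stack and average each tuple element separately instead, which is what the
# patched load_voices does:
averaged = (
    torch.stack([l[0] for l in latents], dim=0).mean(dim=0),
    torch.stack([l[1] for l in latents], dim=0).mean(dim=0),
)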