diff --git a/tortoise/utils/audio.py b/tortoise/utils/audio.py
index fda6380..6cdd496 100644
--- a/tortoise/utils/audio.py
+++ b/tortoise/utils/audio.py
@@ -82,21 +82,23 @@ def dynamic_range_decompression(x, C=1):
     return torch.exp(x) / C
 
 
-def get_voices():
-    subs = os.listdir('tortoise/voices')
+def get_voices(extra_voice_dirs=None):
+    """Map each voice subdirectory name to its .wav/.mp3/.pth paths; dirs scanned later override same-named voices."""
+    dirs = ['tortoise/voices'] + list(extra_voice_dirs or [])  # None default avoids a shared mutable default
     voices = {}
-    for sub in subs:
-        subj = os.path.join('tortoise/voices', sub)
-        if os.path.isdir(subj):
-            voices[sub] = list(glob(f'{subj}/*.wav')) + list(glob(f'{subj}/*.mp3')) + list(glob(f'{subj}/*.pth'))
+    for d in dirs:
+        for sub in os.listdir(d):
+            subj = os.path.join(d, sub)
+            if os.path.isdir(subj):
+                voices[sub] = list(glob(f'{subj}/*.wav')) + list(glob(f'{subj}/*.mp3')) + list(glob(f'{subj}/*.pth'))
     return voices
 
 
-def load_voice(voice):
+def load_voice(voice, extra_voice_dirs=[]):
     if voice == 'random':
         return None, None
 
-    voices = get_voices()
+    voices = get_voices(extra_voice_dirs)
     paths = voices[voice]
     if len(paths) == 1 and paths[0].endswith('.pth'):
         return None, torch.load(paths[0])
@@ -108,14 +110,14 @@ def load_voice(voice):
         return conds, None
 
 
-def load_voices(voices):
+def load_voices(voices, extra_voice_dirs=[]):
     latents = []
     clips = []
     for voice in voices:
         if voice == 'random':
             print("Cannot combine a random voice with a non-random voice. Just using a random voice.")
             return None, None
-        clip, latent = load_voice(voice)
+        clip, latent = load_voice(voice, extra_voice_dirs)
         if latent is None:
             assert len(latents) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."
             clips.extend(clip)