Added storing the loaded model's hash on the TTS object instead of relying on jerry-rigging it in by injection (although I still have to keep that for the weirdos who refuse to update the right way). Also added a parameter when loading voices to load a latent tagged with a model's hash, so latents are now per-model.

2023-03-02 00:44:42 +00:00 · 2023-03-02 00:44:42 +00:00 · a9de016230
commit a9de016230
parent 7b839a4263
2 changed files with 43 additions and 21 deletions
--- a/tortoise/api.py
+++ b/tortoise/api.py
@ -42,6 +42,32 @@ MODELS = {
    'rlg_diffuser.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_diffuser.pth',
 }

+def hash_file(path, algo="md5", buffer_size=0):
+    """Return the hex digest of the file at `path`.
+
+    `algo` is "md5" or "sha1"; anything else raises an Exception.
+    `buffer_size` is the read chunk size in bytes; values <= 0 fall back
+    to a 1 MiB chunk so the file is never held in memory all at once.
+    The digest is identical regardless of the chunk size used.
+    Raises an Exception if `path` does not exist.
+    """
+    import hashlib
+
+    constructors = {"md5": hashlib.md5, "sha1": hashlib.sha1}
+    if algo not in constructors:
+        raise Exception(f'Unknown hash algorithm specified: {algo}')
+    if not os.path.exists(path):
+        raise Exception(f'Path not found: {path}')
+
+    digest = constructors[algo]()
+    chunk_size = buffer_size if buffer_size > 0 else 1024 * 1024
+    with open(path, 'rb') as f:
+        # Stream the file: callers hash model files, which may be large.
+        for chunk in iter(lambda: f.read(chunk_size), b''):
+            digest.update(chunk)
+
+    return digest.hexdigest()
+
 def check_for_kill_signal():
    global STOP_SIGNAL
    if STOP_SIGNAL:
@ -221,16 +247,6 @@ class TextToSpeech:
        if device is None:
            device = get_device(verbose=True)

-        try:
-            import tortoise.utils.torch_intermediary as ml
-            if ml.OVERRIDE_ADAM:
-                print("Using BitsAndBytes ADAMW optimizations")
-            else:
-                print("NOT using BitsAndBytes ADAMW optimizations")
-        except Exception as e:
-            print(e)
-            pass
-
        self.input_sample_rate = input_sample_rate
        self.output_sample_rate = output_sample_rate
        self.minor_optimizations = minor_optimizations
@ -252,6 +268,7 @@ class TextToSpeech:
        self.tokenizer = VoiceBpeTokenizer()

        self.autoregressive_model_path = autoregressive_model_path if autoregressive_model_path and os.path.exists(autoregressive_model_path) else get_model_path('autoregressive.pth', models_dir)
+        self.autoregressive_model_hash = hash_file(self.autoregressive_model_path)

        if os.path.exists(f'{models_dir}/autoregressive.ptt'):
            # Assume this is a traced directory.
@ -295,6 +312,7 @@ class TextToSpeech:
    def load_autoregressive_model(self, autoregressive_model_path):
        previous_path = self.autoregressive_model_path
        self.autoregressive_model_path = autoregressive_model_path if autoregressive_model_path and os.path.exists(autoregressive_model_path) else get_model_path('autoregressive.pth', self.models_dir)
+        self.autoregressive_model_hash = hash_file(self.autoregressive_model_path)

        del self.autoregressive
        self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30,
@ -306,6 +324,7 @@ class TextToSpeech:
        if self.preloaded_tensors:
            self.autoregressive = self.autoregressive.to(self.device)

+
        return previous_path != self.autoregressive_model_path

    def load_cvvp(self):
--- a/tortoise/utils/audio.py
+++ b/tortoise/utils/audio.py
@ -91,25 +91,28 @@ def get_voices(extra_voice_dirs=[], load_latents=True):
    return voices


-def load_voice(voice, extra_voice_dirs=[], load_latents=True, sample_rate=22050, device='cpu'):
+def load_voice(voice, extra_voice_dirs=[], load_latents=True, sample_rate=22050, device='cpu', model_hash=None):
    if voice == 'random':
        return None, None

    voices = get_voices(extra_voice_dirs=extra_voice_dirs, load_latents=load_latents)
-    paths = voices[voice]

+    paths = voices[voice]
    mtime = 0
-    voices = []
+    
    latent = None
-    for file in paths:
-        if file[-16:] == "cond_latents.pth":
-            latent = file
-        elif file[-4:] == ".pth":
-            {}
-            # noop
+    voices = []
+
+    for path in paths:
+        filename = os.path.basename(path)
+        if filename[-4:] == ".pth" and filename[:12] == "cond_latents":
+            if not model_hash and filename == "cond_latents.pth":
+                latent = path
+            elif model_hash and filename == f"cond_latents_{model_hash[:8]}.pth":
+                latent = path
        else:
-            voices.append(file)
-            mtime = max(mtime, os.path.getmtime(file))
+            voices.append(path)
+            mtime = max(mtime, os.path.getmtime(path))

    if load_latents and latent is not None:
        if os.path.getmtime(latent) > mtime: