def hash_file(path, algo="md5", buffer_size=0):
    """Return the hexadecimal digest of the file at *path*.

    The file is always streamed in fixed-size chunks, so hashing a large
    file never requires holding the whole file in memory at once.

    Args:
        path: Filesystem path of the file to hash.
        algo: Digest algorithm name; ``"md5"`` (default) or ``"sha1"``.
        buffer_size: Number of bytes to read per chunk. Values ``<= 0``
            select a 1 MiB default chunk size. The resulting digest does
            not depend on this value.

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        ValueError: If *algo* is not a supported algorithm name.
        FileNotFoundError: If *path* does not exist.
    """
    import hashlib

    if algo == "md5":
        digest = hashlib.md5()
    elif algo == "sha1":
        digest = hashlib.sha1()
    else:
        # ValueError subclasses Exception, so `except Exception` callers
        # keep working; the message text is unchanged.
        raise ValueError(f'Unknown hash algorithm specified: {algo}')

    if not os.path.exists(path):
        # FileNotFoundError also subclasses Exception; message unchanged.
        raise FileNotFoundError(f'Path not found: {path}')

    # A non-positive buffer_size used to mean "read everything in one call";
    # stream with a bounded chunk instead so memory use stays constant.
    chunk_size = buffer_size if buffer_size > 0 else 1024 * 1024

    with open(path, 'rb') as f:
        # iter(callable, sentinel) stops when read() returns b"" at EOF.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)

    return digest.hexdigest()
@@ -295,6 +312,7 @@ class TextToSpeech: def load_autoregressive_model(self, autoregressive_model_path): previous_path = self.autoregressive_model_path self.autoregressive_model_path = autoregressive_model_path if autoregressive_model_path and os.path.exists(autoregressive_model_path) else get_model_path('autoregressive.pth', self.models_dir) + self.autoregressive_model_hash = hash_file(self.autoregressive_model_path) del self.autoregressive self.autoregressive = UnifiedVoice(max_mel_tokens=604, max_text_tokens=402, max_conditioning_inputs=2, layers=30, @@ -306,6 +324,7 @@ class TextToSpeech: if self.preloaded_tensors: self.autoregressive = self.autoregressive.to(self.device) + return previous_path != self.autoregressive_model_path def load_cvvp(self): diff --git a/tortoise/utils/audio.py b/tortoise/utils/audio.py index 8a645df..74060d4 100755 --- a/tortoise/utils/audio.py +++ b/tortoise/utils/audio.py @@ -91,25 +91,28 @@ def get_voices(extra_voice_dirs=[], load_latents=True): return voices -def load_voice(voice, extra_voice_dirs=[], load_latents=True, sample_rate=22050, device='cpu'): +def load_voice(voice, extra_voice_dirs=[], load_latents=True, sample_rate=22050, device='cpu', model_hash=None): if voice == 'random': return None, None voices = get_voices(extra_voice_dirs=extra_voice_dirs, load_latents=load_latents) - paths = voices[voice] + paths = voices[voice] mtime = 0 - voices = [] + latent = None - for file in paths: - if file[-16:] == "cond_latents.pth": - latent = file - elif file[-4:] == ".pth": - {} - # noop + voices = [] + + for path in paths: + filename = os.path.basename(path) + if filename[-4:] == ".pth" and filename[:12] == "cond_latents": + if not model_hash and filename == "cond_latents.pth": + latent = path + elif model_hash and filename == f"cond_latents_{model_hash[:8]}.pth": + latent = path else: - voices.append(file) - mtime = max(mtime, os.path.getmtime(file)) + voices.append(path) + mtime = max(mtime, os.path.getmtime(path)) if load_latents 
and latent is not None: if os.path.getmtime(latent) > mtime: