diff --git a/tortoise/api.py b/tortoise/api.py
index 7ff3621..115cea6 100755
--- a/tortoise/api.py
+++ b/tortoise/api.py
@@ -42,12 +42,15 @@ MODELS = {
     'rlg_diffuser.pth': 'https://huggingface.co/jbetker/tortoise-tts-v2/resolve/main/.models/rlg_diffuser.pth',
 }
 
-def tqdm_override(arr, verbose=False, progress=None, desc=None):
+def check_for_kill_signal():
     global STOP_SIGNAL
     if STOP_SIGNAL:
         STOP_SIGNAL = False
         raise Exception("Kill signal detected")
 
+def tqdm_override(arr, verbose=False, progress=None, desc=None):
+    check_for_kill_signal()
+
     if verbose and desc is not None:
         print(desc)
 
@@ -368,6 +371,7 @@ class TextToSpeech:
             # expand / truncate samples to match the common size
             # required, as tensors need to be of the same length
             for chunk in tqdm_override(chunks, verbose=verbose, progress=progress, desc="Computing conditioning latents..."):
+                check_for_kill_signal()
                 chunk = pad_or_truncate(chunk, chunk_size)
                 cond_mel = wav_to_univnet_mel(chunk.to(device), do_normalization=False, device=device)
                 diffusion_conds.append(cond_mel)
@@ -524,6 +528,7 @@ class TextToSpeech:
             with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p):
                 for b in tqdm_override(range(num_batches), verbose=verbose, progress=progress, desc="Generating autoregressive samples"):
+                    check_for_kill_signal()
                     codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens,
                                                                  do_sample=True,
                                                                  top_p=top_p,
@@ -565,6 +570,7 @@ class TextToSpeech:
         desc = f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%"
 
         for batch in tqdm_override(samples, verbose=verbose, progress=progress, desc=desc):
+            check_for_kill_signal()
             for i in range(batch.shape[0]):
                 batch[i] = fix_autoregressive_output(batch[i], stop_mel_token)
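
For context, the pattern the patch introduces is cooperative cancellation: long-running loops poll a module-level STOP_SIGNAL flag and bail out by raising an exception when it is set. The following is a minimal, self-contained sketch of that pattern outside of tortoise. STOP_SIGNAL and check_for_kill_signal mirror the names in the diff; long_running_job and request_stop are hypothetical names used only for illustration and are not part of the tortoise API.

import threading
import time

STOP_SIGNAL = False

def check_for_kill_signal():
    # Mirrors the helper added in the patch: consume the flag, then abort.
    global STOP_SIGNAL
    if STOP_SIGNAL:
        STOP_SIGNAL = False  # reset so the next run does not abort on a stale flag
        raise Exception("Kill signal detected")

def long_running_job(steps=50):
    # Stand-in for an inference loop; each iteration polls the flag,
    # just as the patched loops call check_for_kill_signal().
    try:
        for _ in range(steps):
            check_for_kill_signal()
            time.sleep(0.1)
        print("Job finished")
    except Exception as e:
        print(f"Job aborted: {e}")

def request_stop():
    # Hypothetical caller-side hook (e.g. a UI "Stop" button handler)
    # that runs in a different thread from the job itself.
    global STOP_SIGNAL
    STOP_SIGNAL = True

if __name__ == "__main__":
    worker = threading.Thread(target=long_running_job)
    worker.start()
    time.sleep(0.5)
    request_stop()
    worker.join()

Resetting STOP_SIGNAL inside check_for_kill_signal is what makes the flag a one-shot request: the stop is consumed by the run it interrupts, so a subsequent generation does not immediately abort.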