added automagic offloading of models to GPU, then back to CPU when they're done, during inference

2024-06-19 17:01:05 -05:00 · 2024-06-19 17:01:05 -05:00 · 73f271fb8a
commit 73f271fb8a
parent 5d24631bfb
7 changed files with 90 additions and 76 deletions
--- a/README.md
+++ b/README.md
@ -40,7 +40,7 @@ For training a LoRA, uncomment the `loras` block in your training YAML.
  - [ ] Reimplement redaction with the Wav2Vec2
 - [X] Implement training support (without DLAS)
  - [X] Feature parity with the VALL-E training setup with preparing a dataset ahead of time
- [ ] Automagic offloading to CPU for unused models (for training and inferencing)
+- [X] Automagic offloading to CPU for unused models (for training and inferencing)
 - [X] Automagic handling of the original weights into compatible weights
 - [ ] Reimplement added features from my original fork:
  - [ ] "Better" conditioning latents calculating
--- a/tortoise_tts/main.py
+++ b/tortoise_tts/main.py
@ -19,7 +19,7 @@ def main():
 	parser.add_argument("--top-k", type=int, default=16)
 	parser.add_argument("--repetition-penalty", type=float, default=1.0)
 	#parser.add_argument("--repetition-penalty-decay", type=float, default=0.0)
-	parser.add_argument("--length-penalty", type=float, default=0.0)
+	parser.add_argument("--length-penalty", type=float, default=1.0)
 	parser.add_argument("--beam-width", type=int, default=0)
 	parser.add_argument("--diffusion-sampler", type=str, default="ddim")
--- a/tortoise_tts/config.py
+++ b/tortoise_tts/config.py
@ -21,8 +21,6 @@ from .tokenizer import VoiceBpeTokenizer
 # Yuck
 from transformers import PreTrainedTokenizerFast
 from tokenizers import Tokenizer
@dataclass()
 class BaseConfig:
@ -472,17 +470,10 @@ class Inference:
 	weight_dtype: str = "float32"
 	amp: bool = False
 	auto_unload: bool = True
 	normalize: bool = False # do NOT enable this unless you know exactly what you're doing
 	# legacy / backwards compat
 	use_vocos: bool = True
 	use_encodec: bool = True
 	use_dac: bool = True
 	# shit that doesn't work
 	recurrent_chunk_size: int = 0
 	recurrent_forward: bool = False
 	@cached_property
 	def dtype(self):
 		if self.weight_dtype == "float16":
--- a/tortoise_tts/inference.py
+++ b/tortoise_tts/inference.py
@ -8,6 +8,7 @@ from pathlib import Path
 from .emb.mel import encode_from_files as encode_mel, trim, trim_random
 from .utils import to_device
 from .utils import wrapper as ml
 from .config import cfg
 from .models import get_models, load_model
@ -110,7 +111,7 @@ class TTS():
 		top_k=0,
 		repetition_penalty=1.0,
 		#repetition_penalty_decay=0.0,
-		length_penalty=0.0,
+		length_penalty=1.0,
 		beam_width=1,
 		#mirostat_tau=0,
 		#mirostat_eta=0.1,
@ -151,6 +152,13 @@ class TTS():
 		if vocoder is None:
 			vocoder = load_model("vocoder", device=cfg.device)
 		# shove everything to cpu
 		if cfg.inference.auto_unload:
 			autoregressive = autoregressive.to("cpu")
 			diffusion = diffusion.to("cpu")
 			clvp = clvp.to("cpu")
 			vocoder = vocoder.to("cpu")
 		wavs = []
 		# other vars
 		calm_token = 832
@ -168,6 +176,7 @@ class TTS():
 			text_lengths = torch.Tensor([ text.shape[0] ]).to(dtype=torch.int32)
 			with torch.autocast("cuda", dtype=cfg.inference.dtype, enabled=cfg.inference.amp):
 				with ml.auto_unload(autoregressive, enabled=cfg.inference.auto_unload):
 					# autoregressive pass
 					codes = autoregressive.inference_speech(
 						autoregressive_latents,
@ -224,6 +233,7 @@ class TTS():
 							break
 				# diffusion pass
 				with ml.auto_unload(diffusion, enabled=cfg.inference.auto_unload):
 					output_seq_len = latents.shape[1] * 4 * 24000 // 22050  # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
 					output_shape = (latents.shape[0], 100, output_seq_len)
 					precomputed_embeddings = diffusion.timestep_independent(latents, diffusion_latents, output_seq_len, False)
@ -240,6 +250,7 @@ class TTS():
 					mels = denormalize_tacotron_mel(mel)[:,:,:output_seq_len]
 				# vocoder pass
 				with ml.auto_unload(vocoder, enabled=cfg.inference.auto_unload):
 					waves = vocoder.inference(mels)
 				for wav in waves:
--- a/tortoise_tts/train.py
+++ b/tortoise_tts/train.py
@ -229,6 +229,10 @@ def run_eval(engines, eval_name, dl):
 	else:
 		_logger.info(f"Validation Metrics: {json.dumps(engines_stats)}.")
 	diffusion = diffusion.to("cpu")
 	clvp = clvp.to("cpu")
 	vocoder = vocoder.to("cpu")
 def train():
 	parser = argparse.ArgumentParser("TorToiSe TTS")
--- a/tortoise_tts/utils/wrapper.py
+++ b/tortoise_tts/utils/wrapper.py
@ -77,6 +77,14 @@ def autocasts(input, from_dtype, to_dtype):
 	else:
 		yield input
@contextmanager
def auto_unload( model, gpu="cuda", cpu="cpu", enabled=True):
	"""
	Context manager that keeps `model` on the `gpu` device for the duration of the `with` body.

	On entry the model is always moved to `gpu` (regardless of `enabled`) and yielded.
	On exit, if `enabled` is true, the model is moved to `cpu`.

	The move back happens in a `finally` clause so the model is still offloaded when
	the body raises; the exception then propagates to the caller unchanged.

	Args:
		model: any object exposing a `.to(device)` method.
		gpu: device passed to `model.to()` on entry.
		cpu: device passed to `model.to()` on exit when `enabled` is true.
		enabled: when false, the model is left on `gpu` after the body finishes.
	"""
	model.to(gpu)
	try:
		yield model
	finally:
		# runs on both normal exit and on error, so a failed pass doesn't strand the model on `gpu`
		if enabled:
			model.to(cpu)
 # handles temporarily upcasting 'index tensors' so torch will stop bitching
 def autocast_forward( func ):
 	def wrapper( self, input, *args, **kwargs ):
--- a/tortoise_tts/webui.py
+++ b/tortoise_tts/webui.py
@ -240,7 +240,7 @@ with ui:
 				with gr.Row():
 					layout["inference"]["inputs"]["repetition-penalty"] = gr.Slider(value=1.0, minimum=-2.0, maximum=2.0, step=0.05, label="Repetition Penalty", info="Incurs a penalty to tokens based on how often they appear in a sequence.")
 					layout["inference"]["inputs"]["repetition-penalty-decay"] = gr.Slider(value=0.0, minimum=-2.0, maximum=2.0, step=0.05, label="Repetition Penalty Length Decay", info="Modifies the reptition penalty based on how far back in time the token appeared in the sequence.")
-					layout["inference"]["inputs"]["length-penalty"] = gr.Slider(value=0.0, minimum=-2.0, maximum=2.0, step=0.05, label="Length Penalty", info="(AR only) Modifies the probability of a stop token based on the current length of the sequence.")
+					layout["inference"]["inputs"]["length-penalty"] = gr.Slider(value=1.0, minimum=-2.0, maximum=2.0, step=0.05, label="Length Penalty", info="(AR only) Modifies the probability of a stop token based on the current length of the sequence.")
 				"""
 				with gr.Row():
 					layout["inference"]["inputs"]["mirostat-tau"] = gr.Slider(value=0.0, minimum=0.0, maximum=8.0, step=0.05, label="Mirostat τ (Tau)", info="The \"surprise\" value when performing mirostat sampling. 0 to disable.")