tweaks

2023-08-15 21:58:16 -05:00 · 2023-08-15 21:58:16 -05:00 · 1e3e1d9315
commit 1e3e1d9315
parent 277c759ab1
3 changed files with 49 additions and 22 deletions
--- a/README.md
+++ b/README.md
@ -126,6 +126,17 @@ Some additional flags you can pass are:
 * `--nar-temp`: sampling temperature to use for the NAR pass.
 * `--device`: device to use (default: `cuda`, examples: `cuda:0`, `cuda:1`, `cpu`)
 ## To-Do
 * properly pass in `modules` names to `weight_quantization` and `activation_quantization`.
 * fix `quit` hanging when using distributed training.
 * train and release a model.
 * extend to multiple languages (VALL-E X) and extend to SpeechX features.
 ## Notice
 - [EnCodec](https://github.com/facebookresearch/encodec) is licensed under CC-BY-NC 4.0. If you use the code to generate audio quantization or perform decoding, it is important to adhere to the terms of their license.
--- a/vall_e/config.py
+++ b/vall_e/config.py
@ -336,7 +336,10 @@ class DeepSpeed:
 				"offload_param": {
 					"device": "cpu",
 					"pin_memory": True
-				}
+				},
 				"zero_quantized_weights": self.use_compression_training,
 				"zero_hpz_partition_size": world_size(),
 				"zero_quantized_gradients": self.use_compression_training,
 			} if self.zero_optimization_level > 0 else None,
 			"comms_logger": {
 				"enabled": False
@ -439,21 +442,24 @@ class Config(_Config):
 		tmp = Config.from_yaml( config_path )
 		self.__dict__.update(tmp.__dict__)
 	def format( self ):
 		self.dataset = Dataset(**self.dataset)
 		self.models = Models(**self.models)
 		self.hyperparameters = Hyperparameters(**self.hyperparameters)
 		self.evaluation = Evaluation(**self.evaluation)
 		self.trainer = Trainer(**self.trainer)
 		self.inference = Inference(**self.inference)
 		self.bitsandbytes = BitsAndBytes(**self.bitsandbytes)
 		self.trainer.deepspeed = DeepSpeed(**self.trainer.deepspeed)
 cfg = Config.from_cli()
 # OmegaConf might not coerce the dicts into the @dataclass decorated classes, so we (try to) coerce them ourselves
 try:
-	cfg.dataset = Dataset(**cfg.dataset)
+	cfg.format()
 	cfg.models = Models(**cfg.models)
 	cfg.hyperparameters = Hyperparameters(**cfg.hyperparameters)
 	cfg.evaluation = Evaluation(**cfg.evaluation)
 	cfg.trainer = Trainer(**cfg.trainer)
 	cfg.inference = Inference(**cfg.inference)
 	cfg.bitsandbytes = BitsAndBytes(**cfg.bitsandbytes)
 	cfg.trainer.deepspeed = DeepSpeed(**cfg.trainer.deepspeed)
 	# cached_property stopped working...
 	if cfg.dataset.use_hdf5:
 		try:
--- a/vall_e/inference.py
+++ b/vall_e/inference.py
@ -8,10 +8,21 @@ from .emb import g2p, qnt
 from .utils import to_device
 from .config import cfg
 from .export import load_models
 from .models import get_models
 from .train import load_engines
 from .data import get_phone_symmap
 import random
 def trim( qnt, trim_length  ):
 	length = qnt.shape[0]
 	start = int(length * random.random())
 	end = start + trim_length
 	if end >= length:
 		start = length - trim_length
 		end = length
 	return qnt[start:end]
 class TTS():
 	def __init__( self, config=None, ar_ckpt=None, nar_ckpt=None, device="cuda" ):
 		self.loading = True 
@ -22,6 +33,8 @@ class TTS():
 		if config:
 			cfg.load_yaml( config )
 		cfg.format()
 		if ar_ckpt and nar_ckpt:
 			self.ar_ckpt = ar_ckpt
@ -45,18 +58,12 @@ class TTS():
 		self.loading = False 
 	def load_models( self ):
-		print("Loading models...")
+		engines = load_engines()
-		models = load_models()
+		for name, engine in engines.items():
 		print("Loaded models")
 		for name in models:
 			model = models[name]
 			if name[:2] == "ar":
-				self.ar = model.to(self.device, dtype=torch.float32)
+				self.ar = engine.module.to(self.device)
 				self.symmap = self.ar.phone_symmap
 			elif name[:3] == "nar":
-				self.nar = model.to(self.device, dtype=torch.float32)
+				self.nar = engine.module.to(self.device)
 			else:
 				print("Unknown:", name)
 	def encode_text( self, text, lang_marker="en" ):
 		text = g2p.encode(text)
@ -66,7 +73,10 @@ class TTS():
 	def encode_audio( self, path ):
 		enc = qnt.encode_from_file( path )
-		return enc[0].t().to(torch.int16)
+		res = enc[0].t().to(torch.int16)
 		if trim:
 			res = trim( res, int( 75 * cfg.dataset.duration_range[1] ) )
 		return res
 	def inference( self, text, reference, mode="both", max_ar_steps=6 * 75, ar_temp=1.0, nar_temp=1.0, out_path="./.tmp.wav" ):