# vall-e/vall_e/inference.py

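"""Inference wrapper for VALL-E.

Builds a TTS object that loads the AR (autoregressive) and NAR
(non-autoregressive) models, either from explicit checkpoints or via the
training engine loader, then synthesizes speech from a text string and one
or more reference audio prompts.
"""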
import torch
import torchaudio
import soundfile

from torch import Tensor
from einops import rearrange
from pathlib import Path

from .emb import g2p, qnt
from .emb.qnt import trim, trim_random
from .utils import to_device
from .config import cfg
from .models import get_models
from .train import load_engines
from .data import get_phone_symmap, _load_quants

class TTS():
    def __init__( self, config=None, ar_ckpt=None, nar_ckpt=None, device=None, amp=None, dtype=None ):
        self.loading = True

        self.input_sample_rate = 24000
        self.output_sample_rate = 24000

        if config:
            cfg.load_yaml( config )
            cfg.dataset.use_hdf5 = False # could use cfg.load_hdf5(), but it is never needed for inferencing

        try:
            cfg.format()
        except Exception as e:
            # formatting can fail on a partial config; inference can still
            # proceed with whatever values are already set
            pass

        # fall back to the config's inference settings for anything not passed in
        if amp is None:
            amp = cfg.inference.amp
        if dtype is None:
            dtype = cfg.inference.dtype
        if device is None:
            device = cfg.device

        cfg.mode = "inferencing"
        cfg.device = device
        cfg.trainer.load_state_dict = True
        cfg.trainer.backend = "local"
        cfg.trainer.weight_dtype = dtype
        cfg.inference.weight_dtype = dtype

        self.device = device
        self.dtype = cfg.inference.dtype
        self.amp = amp

        self.symmap = None
        if ar_ckpt and nar_ckpt:
            self.ar_ckpt = ar_ckpt
            self.nar_ckpt = nar_ckpt

            def load_checkpoint( model, path ):
                # grab the phoneme symmap stored alongside the weights, if any,
                # and unwrap DeepSpeed-style { "module": ... } state dicts
                state = torch.load( path )
                if "symmap" in state:
                    self.symmap = state['symmap']
                if "module" in state:
                    state = state['module']
                model.load_state_dict( state )
                # keep weights in fp32 under AMP and let autocast downcast instead
                return model.to( self.device, dtype=self.dtype if not self.amp else torch.float32 )

            models = get_models(cfg.models.get())
            for name, model in models.items():
                # check "ar+nar" before "ar", since the shorter prefix also matches it
                if name.startswith("ar+nar"):
                    # a monolithic model serves both the AR and NAR passes
                    self.ar = load_checkpoint( model, self.ar_ckpt )
                    self.nar = self.ar
                elif name.startswith("ar"):
                    self.ar = load_checkpoint( model, self.ar_ckpt )
                elif name.startswith("nar"):
                    self.nar = load_checkpoint( model, self.nar_ckpt )
        else:
            self.load_models()

        if self.symmap is None:
            self.symmap = get_phone_symmap()

        self.ar.eval()
        self.nar.eval()

        self.loading = False
    def load_models( self ):
        # restore weights through the training engine loader (local backend)
        engines = load_engines()
        for name, engine in engines.items():
            # as above, test the "ar+nar" prefix before the bare "ar" prefix
            if name.startswith("ar+nar"):
                self.ar = engine.module.to(self.device, dtype=self.dtype if not self.amp else torch.float32)
                self.nar = self.ar
            elif name.startswith("ar"):
                self.ar = engine.module.to(self.device, dtype=self.dtype if not self.amp else torch.float32)
            elif name.startswith("nar"):
                self.nar = engine.module.to(self.device, dtype=self.dtype if not self.amp else torch.float32)

    def encode_text( self, text, language="en" ):
        # already a tensor, return it
        if isinstance( text, Tensor ):
            return text

        content = g2p.encode(text, language=language)

        # prefer explicit "<s>"/"</s>" entries from the symmap; if the lookup
        # fails (e.g. a symmap without those tokens), fall back to the
        # hard-coded start/stop indices 1 and 2
        try:
            phones = ["<s>"] + [ " " if not p else p for p in content ] + ["</s>"]
            return torch.tensor([*map(self.symmap.get, phones)])
        except Exception as e:
            pass

        phones = [ " " if not p else p for p in content ]
        return torch.tensor([ 1 ] + [*map(self.symmap.get, phones)] + [ 2 ])
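
    # Illustrative only: encode_text("Hello.") runs grapheme-to-phoneme
    # conversion and returns a 1D tensor of symmap indices bracketed by the
    # start/stop markers (either symmap["<s>"]/symmap["</s>"] or 1/2).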

    def encode_audio( self, paths, trim_length=0.0 ):
        # already a tensor, return it
        if isinstance( paths, Tensor ):
            return paths

        # split a ";"-delimited string into individual paths
        if isinstance( paths, str ):
            paths = [ Path(p) for p in paths.split(";") ]

        # encode each reference clip and concatenate the resulting
        # (frames, quantizer levels) code matrices along the time axis
        res = torch.cat([qnt.encode_from_file(path)[0].t().to(torch.int16) for path in paths])

        if trim_length:
            # EnCodec at 24 kHz emits 75 code frames per second
            res = trim( res, int( 75 * trim_length ) )

        return res
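
    # Illustrative only: encode_audio("a.wav;b.wav", trim_length=3.0) merges
    # two reference clips into one prompt and trims it to 225 (= 75 * 3) frames.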

    @torch.inference_mode()
    def inference( self, text, references, max_ar_steps=6 * 75, max_nar_levels=7, input_prompt_length=0.0, ar_temp=0.95, nar_temp=0.5, top_p=1.0, top_k=0, repetition_penalty=1.0, repetition_penalty_decay=0.0, length_penalty=0.0, out_path=None ):
        if out_path is None:
            out_path = f"./data/{cfg.start_time}.wav"

        prom = self.encode_audio( references, trim_length=input_prompt_length )
        phns = self.encode_text( text )

        prom = to_device(prom, self.device).to(torch.int16)
        phns = to_device(phns, self.device).to(torch.uint8 if len(self.symmap) < 256 else torch.int16)

        # note: autocast is pinned to CUDA here; with amp disabled it is a no-op
        with torch.autocast("cuda", dtype=self.dtype, enabled=self.amp):
            # AR pass: sample the first residual quantizer level autoregressively
            resps_list = self.ar(text_list=[phns], proms_list=[prom], max_steps=max_ar_steps, sampling_temperature=ar_temp, sampling_top_p=top_p, sampling_top_k=top_k, sampling_repetition_penalty=repetition_penalty, sampling_repetition_penalty_decay=repetition_penalty_decay, sampling_length_penalty=length_penalty)
            resps_list = [r.unsqueeze(-1) for r in resps_list]
            # NAR pass: fill in the remaining quantizer levels non-autoregressively
            resps_list = self.nar(text_list=[phns], proms_list=[prom], resps_list=resps_list, max_levels=max_nar_levels, sampling_temperature=nar_temp, sampling_top_p=top_p, sampling_top_k=top_k, sampling_repetition_penalty=repetition_penalty, sampling_repetition_penalty_decay=repetition_penalty_decay, sampling_length_penalty=length_penalty)

        wav, sr = qnt.decode_to_file(resps_list[0], out_path, device=self.device)

        return (wav, sr)
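

# A minimal usage sketch, not part of the module proper; the config and audio
# paths are hypothetical placeholders.
if __name__ == "__main__":
    tts = TTS( config="./data/config.yaml" )      # hypothetical YAML config
    wav, sr = tts.inference(
        "Hello world.",                           # text to synthesize
        "./reference.wav",                        # hypothetical speaker reference
        input_prompt_length=3.0,                  # seconds of prompt to keep
        out_path="./output.wav",
    )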