# vall-e/vall_e/inference.py

import torch
import torchaudio
import soundfile

from torch import Tensor
from einops import rearrange
from pathlib import Path

from .emb import g2p, qnt
from .emb.qnt import trim, trim_random
from .utils import to_device

from .config import cfg
from .models import get_models
from .engines import load_engines, deepspeed_available
from .data import get_phone_symmap, get_lang_symmap, _load_quants, _cleanup_phones

if deepspeed_available:
	import deepspeed

class TTS():
	def __init__( self, config=None, ar_ckpt=None, nar_ckpt=None, device=None, amp=None, dtype=None ):
		"""Prepare the AR and NAR models for inferencing.

		Args:
			config: optional YAML path handed to `cfg.load_yaml`.
			ar_ckpt: checkpoint for the AR model. Only honoured when `nar_ckpt` is
				given as well; otherwise the models come from `load_models()`.
			nar_ckpt: checkpoint for the NAR model. Only honoured when `ar_ckpt` is
				given as well.
			device: device the models are moved to; defaults to `cfg.device`.
			amp: whether `inference` runs under autocast; defaults to `cfg.inference.amp`.
			dtype: weight dtype; `None` or "auto" defers to `cfg.inference.weight_dtype`.
		"""
		self.loading = True

		self.input_sample_rate = 24000
		self.output_sample_rate = 24000

		if config:
			cfg.load_yaml( config )
			cfg.dataset.use_hdf5 = False # could use cfg.load_hdf5(), but why would it ever need to be loaded for inferencing

		# best-effort: a failure while formatting the config is deliberately ignored
		try:
			cfg.format()
		except Exception:
			pass

		# anything the caller left unset falls back to the config
		if amp is None:
			amp = cfg.inference.amp
		if dtype is None or dtype == "auto":
			dtype = cfg.inference.weight_dtype
		if device is None:
			device = cfg.device

		# write the resolved settings back into the global config
		cfg.device = device
		cfg.mode = "inferencing"
		cfg.trainer.backend = cfg.inference.backend
		cfg.trainer.weight_dtype = dtype
		cfg.inference.weight_dtype = dtype

		self.device = device
		# read back from the config instead of reusing `dtype` directly
		self.dtype = cfg.inference.dtype
		self.amp = amp

		self.symmap = None

		def parse( name, model, state ):
			# pick up the symmap stored alongside the weights, if there is one
			if "userdata" in state and 'symmap' in state['userdata']:
				self.symmap = state['userdata']['symmap']
			elif "symmap" in state:
				self.symmap = state['symmap']

			# the weights may be nested under a "module" key
			if "module" in state:
				state = state['module']

			model.load_state_dict(state)

			if cfg.inference.backend == "local" and deepspeed_available and cfg.trainer.deepspeed.inferencing:
				model = deepspeed.init_inference(model=model, mp_size=1, replace_with_kernel_inject=True, dtype=dtype if not amp else torch.float32).module

			return model

		if ar_ckpt and nar_ckpt:
			self.ar_ckpt = ar_ckpt
			self.nar_ckpt = nar_ckpt

			models = get_models(cfg.models.get())

			for name, model in models.items():
				# checkpoints are staged on the CPU so that one saved from a GPU still loads on a
				# host without that GPU; load_state_dict copies the values into the model's own tensors
				if name.startswith("ar"):
					state = torch.load(self.ar_ckpt, map_location="cpu")
					self.ar = parse( name, model, state )
				elif name.startswith("nar"):
					state = torch.load(self.nar_ckpt, map_location="cpu")
					self.nar = parse( name, model, state )

				# a combined "ar+nar" model serves both roles
				if name.startswith("ar+nar"):
					self.nar = self.ar
		else:
			self.load_models()

		# int8 models are left where they are; everything else is moved to the device,
		# kept in float32 when AMP is on
		if self.dtype != torch.int8:
			self.ar = self.ar.to(self.device, dtype=self.dtype if not self.amp else torch.float32)
			self.nar = self.nar.to(self.device, dtype=self.dtype if not self.amp else torch.float32)

		self.ar.eval()
		self.nar.eval()

		# no symmap came with the weights, use the default phoneme symmap
		if self.symmap is None:
			self.symmap = get_phone_symmap()

		self.loading = False

	def load_models( self ):
		"""Assign `self.ar` / `self.nar` from the engines returned by `load_engines()`.

		Engines are matched on their name prefix; an engine whose name starts with
		"ar+nar" fills both roles.
		"""
		for name, engine in load_engines().items():
			if name.startswith("nar"):
				self.nar = engine.module
				continue

			# names that are neither "nar*" nor "ar*" are of no interest here
			if not name.startswith("ar"):
				continue

			self.ar = engine.module
			if name.startswith("ar+nar"):
				self.nar = self.ar

	def encode_text( self, text, language="en" ):
		"""Turn text into a tensor of phoneme token ids wrapped in start/stop ids.

		Args:
			text: the text to phonemize, or a tensor, which is returned untouched.
			language: language passed on to `g2p.encode`.

		Returns:
			A tensor of ids looked up in `self.symmap`, with a start id in front
			and a stop id at the end.
		"""
		# already a tensor, return it
		if isinstance( text, Tensor ):
			return text

		content = g2p.encode(text, language=language)
		content = _cleanup_phones( content )

		# empty entries are looked up as a space
		tokens = [*map(self.symmap.get, ( p if p else " " for p in content ))]

		# use the symmap's own "<s>" / "</s>" ids when it has both,
		# otherwise fall back to the fixed ids 1 and 2
		bos = self.symmap.get("<s>")
		eos = self.symmap.get("</s>")
		if bos is None or eos is None:
			bos, eos = 1, 2

		return torch.tensor([ bos ] + tokens + [ eos ])

	def encode_lang( self, language ):
		"""Return the id of `language` as a one-element tensor.

		The id is looked up in `get_lang_symmap()`; a language that is not in
		that map resolves to 0.
		"""
		lang_ids = get_lang_symmap()
		lang_id = lang_ids[language] if language in lang_ids else 0
		return torch.tensor([ lang_id ])

	def encode_audio( self, paths, trim_length=0.0 ):
		"""Encode reference audio files into a single tensor of int16 codes.

		Args:
			paths: a tensor (returned untouched), a single `Path`, a string of
				";"-separated paths, or an iterable of paths.
			trim_length: when non-zero, the merged result is passed through `trim`
				with a length of `int( 75 * trim_length )`.

		Returns:
			The concatenation of what `qnt.encode_from_file` yields for each path,
			transposed and cast to int16, optionally trimmed.
		"""
		# already a tensor, return it
		if isinstance( paths, Tensor ):
			return paths

		if isinstance( paths, str ):
			# split string into paths, skipping blank segments such as the one a trailing ";" leaves behind
			paths = [ Path(p) for p in paths.split(";") if p.strip() ]
		elif isinstance( paths, Path ):
			# a lone Path cannot be iterated over, treat it as a single reference
			paths = [ paths ]

		# merge inputs
		res = torch.cat([ qnt.encode_from_file(path)[0].t().to(torch.int16) for path in paths ])

		if trim_length:
			res = trim( res, int( 75 * trim_length ) )

		return res

	@torch.inference_mode()
	def inference(
		self,
		text,
		references,
		language="en",
		max_ar_steps=6 * 75,
		max_ar_context=-1,
		max_nar_levels=7,
		input_prompt_length=0.0,
		ar_temp=0.95,
		nar_temp=0.5,
		min_ar_temp=0.95,
		min_nar_temp=0.5,
		top_p=1.0,
		top_k=0,
		repetition_penalty=1.0,
		repetition_penalty_decay=0.0,
		length_penalty=0.0,
		beam_width=0,
		mirostat_tau=0,
		mirostat_eta=0.1,
		out_path=None
	):
		"""Generate audio for `text` conditioned on `references` and write it to `out_path`.

		The AR model runs first; each of its outputs gets a trailing axis and is handed
		to the NAR model as `resps_list`. The first entry of the NAR result is decoded
		with `qnt.decode_to_file`.

		Args:
			text: input for `encode_text`.
			references: input for `encode_audio`.
			language: used for both `encode_text` and `encode_lang`.
			max_ar_steps, max_ar_context: forwarded to the AR model as
				`max_steps` / `max_resp_context`.
			max_nar_levels: forwarded to the NAR model as `max_levels`.
			input_prompt_length: forwarded to `encode_audio` as `trim_length`.
			ar_temp, min_ar_temp: temperature settings for the AR model.
			nar_temp, min_nar_temp: temperature settings for the NAR model.
			top_p, top_k, repetition_penalty, repetition_penalty_decay:
				sampling settings forwarded to both models.
			length_penalty, beam_width, mirostat_tau, mirostat_eta:
				sampling settings forwarded to the AR model only.
			out_path: output file; defaults to "./data/<cfg.start_time>.wav".

		Returns:
			The `(wav, sr)` pair returned by `qnt.decode_to_file`.
		"""
		target = out_path if out_path is not None else f"./data/{cfg.start_time}.wav"

		prom = self.encode_audio( references, trim_length=input_prompt_length )
		phns = self.encode_text( text, language=language )
		lang = self.encode_lang( language )

		prom = to_device(prom, self.device).to(torch.int16)
		# a symmap with fewer than 256 entries gets uint8 ids, a larger one int16
		phns = to_device(phns, self.device).to(torch.int16 if len(self.symmap) >= 256 else torch.uint8)
		lang = to_device(lang, self.device).to(torch.uint8)

		def conditioning():
			# builds fresh single-item lists, so each model call receives its own
			return dict( text_list=[phns], proms_list=[prom], lang_list=[lang] )

		# sampling settings both models receive
		common_sampling = dict(
			sampling_top_p=top_p,
			sampling_top_k=top_k,
			sampling_repetition_penalty=repetition_penalty,
			sampling_repetition_penalty_decay=repetition_penalty_decay,
		)

		with torch.autocast("cuda", dtype=self.dtype, enabled=self.amp):
			ar_out = self.ar(
				**conditioning(),
				max_steps=max_ar_steps,
				max_resp_context=max_ar_context,
				sampling_temperature=ar_temp,
				sampling_min_temperature=min_ar_temp,
				**common_sampling,
				sampling_length_penalty=length_penalty,
				sampling_beam_width=beam_width,
				sampling_mirostat_tau=mirostat_tau,
				sampling_mirostat_eta=mirostat_eta,
			)
			# the NAR model is handed the AR output with an extra trailing axis
			ar_out = [ resp.unsqueeze(-1) for resp in ar_out ]
			resps_list = self.nar(
				**conditioning(),
				resps_list=ar_out,
				max_levels=max_nar_levels,
				sampling_temperature=nar_temp,
				sampling_min_temperature=min_nar_temp,
				**common_sampling,
			)

		wav, sr = qnt.decode_to_file(resps_list[0], target, device=self.device)

		return (wav, sr)
Rewrite init 2023-08-02 21:53:35 +00:00			`import torch`
			`import torchaudio`
			`import soundfile`

inferencing cleanup 2023-08-21 02:36:02 +00:00			`from torch import Tensor`
Rewrite init 2023-08-02 21:53:35 +00:00			`from einops import rearrange`
inferencing cleanup 2023-08-21 02:36:02 +00:00			`from pathlib import Path`
Rewrite init 2023-08-02 21:53:35 +00:00
			`from .emb import g2p, qnt`
tweaks, including exporting on save/quit 2023-08-23 21:43:03 +00:00			`from .emb.qnt import trim, trim_random`
Rewrite init 2023-08-02 21:53:35 +00:00			`from .utils import to_device`

			`from .config import cfg`
distributed training works now (hopefully) 2023-08-14 03:07:45 +00:00			`from .models import get_models`
cleanup, use deepspeed inferencing pathway if requested 2023-10-09 20:24:04 +00:00			`from .engines import load_engines, deepspeed_available`
exposed rolling resp context to the web UI, added passing in language to inferencing command line 2023-10-13 04:21:01 +00:00			`from .data import get_phone_symmap, get_lang_symmap, _load_quants, _cleanup_phones`
Rewrite init 2023-08-02 21:53:35 +00:00
cleanup, use deepspeed inferencing pathway if requested 2023-10-09 20:24:04 +00:00			`if deepspeed_available:`
reduced dynamic temperature threshold to > 1.0, as it seems to not quite be useful for audio LMs, sped up any sampling that touches logits by copying them to CPU first, as accessing tensors on the GPU is slow as balls) 2023-10-09 19:46:17 +00:00			`import deepspeed`

Rewrite init 2023-08-02 21:53:35 +00:00			`class TTS():`
added light web UI (need to port the telemetry disabling bandaids from aivc) 2023-09-09 21:17:20 +00:00			`def __init__( self, config=None, ar_ckpt=None, nar_ckpt=None, device=None, amp=None, dtype=None ):`
Rewrite init 2023-08-02 21:53:35 +00:00			`self.loading = True`
added light web UI (need to port the telemetry disabling bandaids from aivc) 2023-09-09 21:17:20 +00:00
Rewrite init 2023-08-02 21:53:35 +00:00			`self.input_sample_rate = 24000`
			`self.output_sample_rate = 24000`
made exporter make more sense 2023-08-14 03:56:28 +00:00
			`if config:`
			`cfg.load_yaml( config )`
tweaks, including exporting on save/quit 2023-08-23 21:43:03 +00:00			`cfg.dataset.use_hdf5 = False # could use cfg.load_hdf5(), but why would it ever need to be loaded for inferencing`
tweaks 2023-08-16 02:58:16 +00:00
I don't know if the lack of start/stop tokens being added was causing my inference tests to fail, but it seems better now 2023-08-21 00:21:54 +00:00			`try:`
			`cfg.format()`
			`except Exception as e:`
			`pass`
somewhat got recurrent forward working (it's as accurate as chunkwise forward: it's not accurate at all), added option to use AMP instead of blanket setting the weight's dtype 2023-09-02 01:58:29 +00:00
added light web UI (need to port the telemetry disabling bandaids from aivc) 2023-09-09 21:17:20 +00:00			`if amp is None:`
			`amp = cfg.inference.amp`
tweaks to try and get deepspeed quantized inferencing, validating bitsandbytes and deepspeed quantization, nothing seems to work 2023-10-13 03:21:43 +00:00			`if dtype is None or dtype == "auto":`
restructured some things with the model to remove dead weights 2023-09-21 00:10:59 +00:00			`dtype = cfg.inference.weight_dtype`
added light web UI (need to port the telemetry disabling bandaids from aivc) 2023-09-09 21:17:20 +00:00			`if device is None:`
			`device = cfg.device`

			`cfg.device = device`
cleanup, use deepspeed inferencing pathway if requested 2023-10-09 20:24:04 +00:00			`cfg.mode = "inferencing"`
			`cfg.trainer.backend = cfg.inference.backend`
added light web UI (need to port the telemetry disabling bandaids from aivc) 2023-09-09 21:17:20 +00:00			`cfg.trainer.weight_dtype = dtype`
			`cfg.inference.weight_dtype = dtype`

			`self.device = device`
			`self.dtype = cfg.inference.dtype`
			`self.amp = amp`

tweaks, including exporting on save/quit 2023-08-23 21:43:03 +00:00			`self.symmap = None`
restructured some things with the model to remove dead weights 2023-09-21 00:10:59 +00:00
			`def parse( name, model, state ):`
			`if "userdata" in state and 'symmap' in state['userdata']:`
			`self.symmap = state['userdata']['symmap']`
			`elif "symmap" in state:`
			`self.symmap = state['symmap']`

			`if "module" in state:`
			`state = state['module']`

			`model.load_state_dict(state)`
cleanup, use deepspeed inferencing pathway if requested 2023-10-09 20:24:04 +00:00
tweaks to try and get deepspeed quantized inferencing, validating bitsandbytes and deepspeed quantization, nothing seems to work 2023-10-13 03:21:43 +00:00			`if cfg.inference.backend == "local" and deepspeed_available and cfg.trainer.deepspeed.inferencing:`
cleanup, use deepspeed inferencing pathway if requested 2023-10-09 20:24:04 +00:00			`model = deepspeed.init_inference(model=model, mp_size=1, replace_with_kernel_inject=True, dtype=dtype if not amp else torch.float32).module`

restructured some things with the model to remove dead weights 2023-09-21 00:10:59 +00:00			`return model`

Rewrite init 2023-08-02 21:53:35 +00:00			`if ar_ckpt and nar_ckpt:`
distributed training works now (hopefully) 2023-08-14 03:07:45 +00:00			`self.ar_ckpt = ar_ckpt`
			`self.nar_ckpt = nar_ckpt`

			`models = get_models(cfg.models.get())`
restructured some things with the model to remove dead weights 2023-09-21 00:10:59 +00:00
distributed training works now (hopefully) 2023-08-14 03:07:45 +00:00			`for name, model in models.items():`
restructured some things with the model to remove dead weights 2023-09-21 00:10:59 +00:00			`if name.startswith("ar"):`
seems that my PromEmbedding/RespEmbedding doesn't actually work all that well, naively using dedicated MultiEmbeddings for AR/NAR in the monolithic model is the best way to go 2023-09-08 06:03:24 +00:00			`state = torch.load(self.ar_ckpt)`
restructured some things with the model to remove dead weights 2023-09-21 00:10:59 +00:00			`self.ar = parse( name, model, state )`
distributed training works now (hopefully) 2023-08-14 03:07:45 +00:00			`elif name.startswith("nar"):`
removed the sampler as it's very misleading 2023-08-18 19:47:48 +00:00			`state = torch.load(self.nar_ckpt)`
restructured some things with the model to remove dead weights 2023-09-21 00:10:59 +00:00			`self.nar = parse( name, model, state )`

			`if name.startswith("ar+nar"):`
			`self.nar = self.ar`
Rewrite init 2023-08-02 21:53:35 +00:00			`else:`
made exporter make more sense 2023-08-14 03:56:28 +00:00			`self.load_models()`
Rewrite init 2023-08-02 21:53:35 +00:00
tweaks to try and get deepspeed quantized inferencing, validating bitsandbytes and deepspeed quantization, nothing seems to work 2023-10-13 03:21:43 +00:00			`if self.dtype != torch.int8:`
			`self.ar = self.ar.to(self.device, dtype=self.dtype if not self.amp else torch.float32)`
			`self.nar = self.nar.to(self.device, dtype=self.dtype if not self.amp else torch.float32)`
restructured some things with the model to remove dead weights 2023-09-21 00:10:59 +00:00
cleanup, use deepspeed inferencing pathway if requested 2023-10-09 20:24:04 +00:00			`self.ar.eval()`
			`self.nar.eval()`
reduced dynamic temperature threshold to > 1.0, as it seems to not quite be useful for audio LMs, sped up any sampling that touches logits by copying them to CPU first, as accessing tensors on the GPU is slow as balls) 2023-10-09 19:46:17 +00:00
tweaks, including exporting on save/quit 2023-08-23 21:43:03 +00:00			`if self.symmap is None:`
			`self.symmap = get_phone_symmap()`

Rewrite init 2023-08-02 21:53:35 +00:00			`self.loading = False`

made exporter make more sense 2023-08-14 03:56:28 +00:00			`def load_models( self ):`
tweaks 2023-08-16 02:58:16 +00:00			`engines = load_engines()`
			`for name, engine in engines.items():`
restructured some things with the model to remove dead weights 2023-09-21 00:10:59 +00:00			`if name.startswith("ar"):`
			`self.ar = engine.module`
			`elif name.startswith("nar"):`
			`self.nar = engine.module`

			`if name.startswith("ar+nar"):`
seems that my PromEmbedding/RespEmbedding doesn't actually work all that well, naively using dedicated MultiEmbeddings for AR/NAR in the monolithic model is the best way to go 2023-09-08 06:03:24 +00:00			`self.nar = self.ar`
Rewrite init 2023-08-02 21:53:35 +00:00
inferencing cleanup 2023-08-21 02:36:02 +00:00			`def encode_text( self, text, language="en" ):`
			`# already a tensor, return it`
			`if isinstance( text, Tensor ):`
			`return text`

			`content = g2p.encode(text, language=language)`
apply phoneme cleanup in inferencing as well 2023-10-11 00:21:19 +00:00			`content = _cleanup_phones( content )`
I think I fixed a bug? 2023-08-25 04:33:36 +00:00			`# ick`
			`try:`
			`phones = ["<s>"] + [ " " if not p else p for p in content ] + ["</s>"]`
			`return torch.tensor([*map(self.symmap.get, phones)])`
			`except Exception as e:`
			`pass`
I don't know if the lack of start/stop tokens being added was causing my inference tests to fail, but it seems better now 2023-08-21 00:21:54 +00:00			`phones = [ " " if not p else p for p in content ]`
			`return torch.tensor([ 1 ] + [*map(self.symmap.get, phones)] + [ 2 ])`
Rewrite init 2023-08-02 21:53:35 +00:00
exposed rolling resp context to the web UI, added passing in language to inferencing command line 2023-10-13 04:21:01 +00:00			`def encode_lang( self, language ):`
			`symmap = get_lang_symmap()`
			`id = 0`
			`if language in symmap:`
			`id = symmap[language]`
			`return torch.tensor([ id ])`

added option to set the trim length for an input prompt 2023-09-09 23:04:44 +00:00			`def encode_audio( self, paths, trim_length=0.0 ):`
inferencing cleanup 2023-08-21 02:36:02 +00:00			`# already a tensor, return it`
			`if isinstance( paths, Tensor ):`
			`return paths`

			`# split string into paths`
			`if isinstance( paths, str ):`
			`paths = [ Path(p) for p in paths.split(";") ]`

			`# merge inputs`
I think I fixed a bug? 2023-08-25 04:33:36 +00:00			`res = torch.cat([qnt.encode_from_file(path)[0][:, :].t().to(torch.int16) for path in paths])`

added option to set the trim length for an input prompt 2023-09-09 23:04:44 +00:00			`if trim_length:`
			`res = trim( res, int( 75 * trim_length ) )`
inferencing cleanup 2023-08-21 02:36:02 +00:00
tweaks 2023-08-16 02:58:16 +00:00			`return res`
Rewrite init 2023-08-02 21:53:35 +00:00
inferencing cleanup 2023-08-21 02:36:02 +00:00			`@torch.inference_mode()`
implemented a naive beam search (I really should be taking a break) 2023-09-13 02:28:07 +00:00			`def inference(`
			`self,`
			`text,`
			`references,`
exposed rolling resp context to the web UI, added passing in language to inferencing command line 2023-10-13 04:21:01 +00:00			`language="en",`
implemented a naive beam search (I really should be taking a break) 2023-09-13 02:28:07 +00:00			`max_ar_steps=6 * 75,`
added initial support for languages (still testing, marked as model version 3), added experimental 'context extend by limiting the resp context' (untested) 2023-10-12 01:38:40 +00:00			`max_ar_context=-1,`
implemented a naive beam search (I really should be taking a break) 2023-09-13 02:28:07 +00:00			`max_nar_levels=7,`
			`input_prompt_length=0.0,`
			`ar_temp=0.95,`
			`nar_temp=0.5,`
changed dynamic temperature trigger to be a min-(n)ar-temp value between [0,(n)ar-temp), flags to set min temp, checkbox in web UI to request it 2023-10-10 22:02:33 +00:00			`min_ar_temp=0.95,`
			`min_nar_temp=0.5,`
implemented a naive beam search (I really should be taking a break) 2023-09-13 02:28:07 +00:00			`top_p=1.0,`
			`top_k=0,`
			`repetition_penalty=1.0,`
			`repetition_penalty_decay=0.0,`
			`length_penalty=0.0,`
			`beam_width=0,`
added mirostat sampling (given a partially trained model, it got far decent output than I expected, need to test on a better trained model) 2023-09-18 23:55:41 +00:00			`mirostat_tau=0,`
			`mirostat_eta=0.1,`
implemented a naive beam search (I really should be taking a break) 2023-09-13 02:28:07 +00:00			`out_path=None`
			`):`
inferencing cleanup 2023-08-21 02:36:02 +00:00			`if out_path is None:`
added lots of sampling options (top-k/top-p, repetition penalty, length penalty) 2023-09-09 01:30:54 +00:00			`out_path = f"./data/{cfg.start_time}.wav"`
Rewrite init 2023-08-02 21:53:35 +00:00
added option to set the trim length for an input prompt 2023-09-09 23:04:44 +00:00			`prom = self.encode_audio( references, trim_length=input_prompt_length )`
exposed rolling resp context to the web UI, added passing in language to inferencing command line 2023-10-13 04:21:01 +00:00			`phns = self.encode_text( text, language=language )`
			`lang = self.encode_lang( language )`
Rewrite init 2023-08-02 21:53:35 +00:00
			`prom = to_device(prom, self.device).to(torch.int16)`
			`phns = to_device(phns, self.device).to(torch.uint8 if len(self.symmap) < 256 else torch.int16)`
exposed rolling resp context to the web UI, added passing in language to inferencing command line 2023-10-13 04:21:01 +00:00			`lang = to_device(lang, self.device).to(torch.uint8)`
Rewrite init 2023-08-02 21:53:35 +00:00
added light web UI (need to port the telemetry disabling bandaids from aivc) 2023-09-09 21:17:20 +00:00			`with torch.autocast("cuda", dtype=self.dtype, enabled=self.amp):`
added mirostat sampling (given a partially trained model, it got far decent output than I expected, need to test on a better trained model) 2023-09-18 23:55:41 +00:00			`resps_list = self.ar(`
exposed rolling resp context to the web UI, added passing in language to inferencing command line 2023-10-13 04:21:01 +00:00			`text_list=[phns], proms_list=[prom], lang_list=[lang], max_steps=max_ar_steps, max_resp_context=max_ar_context,`
added mirostat sampling (given a partially trained model, it got far decent output than I expected, need to test on a better trained model) 2023-09-18 23:55:41 +00:00			`sampling_temperature=ar_temp,`
changed dynamic temperature trigger to be a min-(n)ar-temp value between [0,(n)ar-temp), flags to set min temp, checkbox in web UI to request it 2023-10-10 22:02:33 +00:00			`sampling_min_temperature=min_ar_temp,`
added mirostat sampling (given a partially trained model, it got far decent output than I expected, need to test on a better trained model) 2023-09-18 23:55:41 +00:00			`sampling_top_p=top_p, sampling_top_k=top_k,`
			`sampling_repetition_penalty=repetition_penalty, sampling_repetition_penalty_decay=repetition_penalty_decay,`
			`sampling_length_penalty=length_penalty,`
			`sampling_beam_width=beam_width,`
			`sampling_mirostat_tau=mirostat_tau,`
			`sampling_mirostat_eta=mirostat_eta,`
			`)`
somewhat got recurrent forward working (it's as accurate as chunkwise forward: it's not accurate at all), added option to use AMP instead of blanket setting the weight's dtype 2023-09-02 01:58:29 +00:00			`resps_list = [r.unsqueeze(-1) for r in resps_list]`
added mirostat sampling (given a partially trained model, it got far decent output than I expected, need to test on a better trained model) 2023-09-18 23:55:41 +00:00			`resps_list = self.nar(`
exposed rolling resp context to the web UI, added passing in language to inferencing command line 2023-10-13 04:21:01 +00:00			`text_list=[phns], proms_list=[prom], lang_list=[lang], resps_list=resps_list,`
added mirostat sampling (given a partially trained model, it got far decent output than I expected, need to test on a better trained model) 2023-09-18 23:55:41 +00:00			`max_levels=max_nar_levels,`
			`sampling_temperature=nar_temp,`
changed dynamic temperature trigger to be a min-(n)ar-temp value between [0,(n)ar-temp), flags to set min temp, checkbox in web UI to request it 2023-10-10 22:02:33 +00:00			`sampling_min_temperature=min_nar_temp,`
added mirostat sampling (given a partially trained model, it got far decent output than I expected, need to test on a better trained model) 2023-09-18 23:55:41 +00:00			`sampling_top_p=top_p, sampling_top_k=top_k,`
			`sampling_repetition_penalty=repetition_penalty, sampling_repetition_penalty_decay=repetition_penalty_decay,`
			`)`
Rewrite init 2023-08-02 21:53:35 +00:00
added light web UI (need to port the telemetry disabling bandaids from aivc) 2023-09-09 21:17:20 +00:00			`wav, sr = qnt.decode_to_file(resps_list[0], out_path, device=self.device)`
Rewrite init 2023-08-02 21:53:35 +00:00
			`return (wav, sr)`