forked from mrq/tortoise-tts
Why didn't I also have it use chunks for computing the AR conditioning latents (instead of only for the diffusion conditioning latents)?
This commit is contained in:
parent
97cd58e7eb
commit
65a43deb9e
|
@ -4,10 +4,10 @@ transformers==4.19
|
||||||
tokenizers
|
tokenizers
|
||||||
inflect
|
inflect
|
||||||
progressbar
|
progressbar
|
||||||
einops==0.6.0
|
einops
|
||||||
unidecode
|
unidecode
|
||||||
scipy
|
scipy
|
||||||
librosa==0.8.0
|
librosa==0.8.1
|
||||||
torchaudio
|
torchaudio
|
||||||
threadpoolctl
|
threadpoolctl
|
||||||
appdirs
|
appdirs
|
||||||
|
|
|
@ -442,19 +442,7 @@ class TextToSpeech:
|
||||||
beta=8.555504641634386,
|
beta=8.555504641634386,
|
||||||
).to(device)
|
).to(device)
|
||||||
|
|
||||||
samples = []
|
samples = [resampler(sample) for sample in voice_samples]
|
||||||
auto_conds = []
|
|
||||||
for sample in voice_samples:
|
|
||||||
auto_conds.append(format_conditioning(sample, device=device, sampling_rate=self.input_sample_rate))
|
|
||||||
samples.append(resampler(sample))
|
|
||||||
|
|
||||||
auto_conds = torch.stack(auto_conds, dim=1)
|
|
||||||
|
|
||||||
self.autoregressive = migrate_to_device( self.autoregressive, device )
|
|
||||||
auto_latent = self.autoregressive.get_conditioning(auto_conds)
|
|
||||||
self.autoregressive = migrate_to_device( self.autoregressive, self.device if self.preloaded_tensors else 'cpu' )
|
|
||||||
|
|
||||||
diffusion_conds = []
|
|
||||||
chunks = []
|
chunks = []
|
||||||
|
|
||||||
concat = torch.cat(samples, dim=-1)
|
concat = torch.cat(samples, dim=-1)
|
||||||
|
@ -469,15 +457,22 @@ class TextToSpeech:
|
||||||
|
|
||||||
chunks = torch.chunk(concat, slices, dim=1)
|
chunks = torch.chunk(concat, slices, dim=1)
|
||||||
chunk_size = chunks[0].shape[-1]
|
chunk_size = chunks[0].shape[-1]
|
||||||
|
|
||||||
|
auto_conds = []
|
||||||
|
for chunk in tqdm_override(chunks, verbose=verbose, progress=progress, desc="Computing AR conditioning latents..."):
|
||||||
|
auto_conds.append(format_conditioning(chunk, device=device, sampling_rate=self.input_sample_rate, cond_length=chunk_size))
|
||||||
|
auto_conds = torch.stack(auto_conds, dim=1)
|
||||||
|
|
||||||
|
self.autoregressive = migrate_to_device( self.autoregressive, device )
|
||||||
|
auto_latent = self.autoregressive.get_conditioning(auto_conds)
|
||||||
|
self.autoregressive = migrate_to_device( self.autoregressive, self.device if self.preloaded_tensors else 'cpu' )
|
||||||
|
|
||||||
# expand / truncate samples to match the common size
|
diffusion_conds = []
|
||||||
# required, as tensors need to be of the same length
|
for chunk in tqdm_override(chunks, verbose=verbose, progress=progress, desc="Computing diffusion conditioning latents..."):
|
||||||
for chunk in tqdm_override(chunks, verbose=verbose, progress=progress, desc="Computing conditioning latents..."):
|
|
||||||
check_for_kill_signal()
|
check_for_kill_signal()
|
||||||
chunk = pad_or_truncate(chunk, chunk_size)
|
chunk = pad_or_truncate(chunk, chunk_size)
|
||||||
cond_mel = wav_to_univnet_mel(migrate_to_device( chunk, device ), do_normalization=False, device=device)
|
cond_mel = wav_to_univnet_mel(migrate_to_device( chunk, device ), do_normalization=False, device=device)
|
||||||
diffusion_conds.append(cond_mel)
|
diffusion_conds.append(cond_mel)
|
||||||
|
|
||||||
diffusion_conds = torch.stack(diffusion_conds, dim=1)
|
diffusion_conds = torch.stack(diffusion_conds, dim=1)
|
||||||
|
|
||||||
self.diffusion = migrate_to_device( self.diffusion, device )
|
self.diffusion = migrate_to_device( self.diffusion, device )
|
||||||
|
|
Loading…
Reference in New Issue
Block a user