Resample input audio to the sample rate expected by the AR model / DVAE (22,050 Hz); this might help.
This commit is contained in:
parent
20789a0b8a
commit
80d6494973
|
@ -80,6 +80,8 @@ def format_diffusion_conditioning( sample, device, do_normalization=False ):
|
|||
# encode a wav to conditioning latents + mel codes
|
||||
@torch.inference_mode()
|
||||
def encode(wav: Tensor, sr: int = cfg.sample_rate, device="cuda"):
|
||||
wav = torchaudio.functional.resample(wav, sr, 22050)
|
||||
|
||||
dvae = load_model("dvae", device=device)
|
||||
unified_voice = load_model("unified_voice", device=device)
|
||||
diffusion = load_model("diffusion", device=device)
|
||||
|
|
|
@ -4,7 +4,7 @@ from .config import cfg
|
|||
from .data import create_train_val_dataloader
|
||||
from .emb import mel
|
||||
|
||||
from .utils import setup_logging, to_device, trainer, flatten_dict, do_gc
|
||||
from .utils import setup_logging, to_device, trainer, flatten_dict, do_gc, wrapper as ml
|
||||
from .utils.distributed import is_global_leader
|
||||
|
||||
import auraloss
|
||||
|
@ -165,22 +165,24 @@ def run_eval(engines, eval_name, dl):
|
|||
break
|
||||
|
||||
# diffusion pass
|
||||
output_seq_len = latents.shape[1] * 4 * 24000 // 22050 # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
|
||||
output_shape = (latents.shape[0], 100, output_seq_len)
|
||||
precomputed_embeddings = diffusion.timestep_independent(latents, diffusion_latents, output_seq_len, False)
|
||||
with ml.auto_unload(diffusion, enabled=True):
|
||||
output_seq_len = latents.shape[1] * 4 * 24000 // 22050 # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
|
||||
output_shape = (latents.shape[0], 100, output_seq_len)
|
||||
precomputed_embeddings = diffusion.timestep_independent(latents, diffusion_latents, output_seq_len, False)
|
||||
|
||||
noise = torch.randn(output_shape, device=latents.device) * temperature
|
||||
mel = diffuser.p_sample_loop(
|
||||
diffusion,
|
||||
output_shape,
|
||||
noise=noise,
|
||||
model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings},
|
||||
progress=True
|
||||
)
|
||||
mels = denormalize_tacotron_mel(mel)[:,:,:output_seq_len]
|
||||
noise = torch.randn(output_shape, device=latents.device) * temperature
|
||||
mel = diffuser.p_sample_loop(
|
||||
diffusion,
|
||||
output_shape,
|
||||
noise=noise,
|
||||
model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings},
|
||||
progress=True
|
||||
)
|
||||
mels = denormalize_tacotron_mel(mel)[:,:,:output_seq_len]
|
||||
|
||||
# vocoder pass
|
||||
wavs = vocoder.inference(mels)
|
||||
with ml.auto_unload(vocoder, enabled=True):
|
||||
wavs = vocoder.inference(mels)
|
||||
|
||||
return wavs
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user