might help to resample to the right sample rate for the AR / dvae,,,
commit 80d6494973 (parent 20789a0b8a)
@@ -80,6 +80,8 @@ def format_diffusion_conditioning( sample, device, do_normalization=False ):
 # encode a wav to conditioning latents + mel codes
 @torch.inference_mode()
 def encode(wav: Tensor, sr: int = cfg.sample_rate, device="cuda"):
+	wav = torchaudio.functional.resample(wav, sr, 22050)
+
 	dvae = load_model("dvae", device=device)
 	unified_voice = load_model("unified_voice", device=device)
 	diffusion = load_model("diffusion", device=device)
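The point of the addition: the DVAE and autoregressive model consume 22.05 kHz audio, while `cfg.sample_rate` (the default `sr`) may differ, so `encode()` now resamples the input before computing conditioning latents and mel codes. A minimal standalone sketch of that step, assuming a hypothetical 24 kHz input:

```python
import torch
import torchaudio

# Hypothetical one-second input at 24 kHz; the commit resamples from
# whatever `sr` the caller passes down to the 22.05 kHz the models expect.
wav = torch.randn(1, 24000)
wav_22k = torchaudio.functional.resample(wav, 24000, 22050)
print(wav_22k.shape)  # torch.Size([1, 22050])
```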
@@ -4,7 +4,7 @@ from .config import cfg
 from .data import create_train_val_dataloader
 from .emb import mel
 
-from .utils import setup_logging, to_device, trainer, flatten_dict, do_gc
+from .utils import setup_logging, to_device, trainer, flatten_dict, do_gc, wrapper as ml
 from .utils.distributed import is_global_leader
 
 import auraloss
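This hunk only adds the `wrapper as ml` import; the eval changes below use its `auto_unload` helper so each model occupies the GPU only while it is actually running. The helper's implementation is not part of this commit; a plausible minimal sketch, assuming it is a context manager that loads the model onto the device on entry and evicts it on exit:

```python
from contextlib import contextmanager

import torch

# Hypothetical stand-in for .utils.wrapper.auto_unload; the real
# implementation is not shown in this commit.
@contextmanager
def auto_unload(model, enabled=True, device="cuda"):
    if enabled:
        model.to(device)  # keep weights on the GPU only while needed
    try:
        yield model
    finally:
        if enabled:
            model.to("cpu")           # evict to free VRAM for the next model
            torch.cuda.empty_cache()
```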
@@ -165,22 +165,24 @@ def run_eval(engines, eval_name, dl):
 			break
 
 	# diffusion pass
-	output_seq_len = latents.shape[1] * 4 * 24000 // 22050 # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
-	output_shape = (latents.shape[0], 100, output_seq_len)
-	precomputed_embeddings = diffusion.timestep_independent(latents, diffusion_latents, output_seq_len, False)
+	with ml.auto_unload(diffusion, enabled=True):
+		output_seq_len = latents.shape[1] * 4 * 24000 // 22050 # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
+		output_shape = (latents.shape[0], 100, output_seq_len)
+		precomputed_embeddings = diffusion.timestep_independent(latents, diffusion_latents, output_seq_len, False)
 
-	noise = torch.randn(output_shape, device=latents.device) * temperature
-	mel = diffuser.p_sample_loop(
-		diffusion,
-		output_shape,
-		noise=noise,
-		model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings},
-		progress=True
-	)
-	mels = denormalize_tacotron_mel(mel)[:,:,:output_seq_len]
+		noise = torch.randn(output_shape, device=latents.device) * temperature
+		mel = diffuser.p_sample_loop(
+			diffusion,
+			output_shape,
+			noise=noise,
+			model_kwargs={'precomputed_aligned_embeddings': precomputed_embeddings},
+			progress=True
+		)
+		mels = denormalize_tacotron_mel(mel)[:,:,:output_seq_len]
 
 	# vocoder pass
-	wavs = vocoder.inference(mels)
+	with ml.auto_unload(vocoder, enabled=True):
+		wavs = vocoder.inference(mels)
 
 	return wavs
 
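The `output_seq_len` expression encodes the two facts in its inline comment: each latent frame covers 4 mel frames at the 22.05 kHz code rate, and the diffusion decoder renders a 24 kHz spectrogram, so the frame count is rescaled by 24000/22050. With a hypothetical latent length of 250:

```python
latent_len = 250                                  # hypothetical AR latent frame count
output_seq_len = latent_len * 4 * 24000 // 22050  # -> 1088 output mel frames
```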
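`denormalize_tacotron_mel` comes from upstream TorToiSe rather than from this commit; it maps the sampler's roughly [-1, 1] output back into log-mel amplitude range. A sketch of the upstream definition (assumed here; the bounds are ln(1e-5) and ln(10)):

```python
import torch

# Bounds as used in TorToiSe's diffusion decoder: ln(1e-5) and ln(10).
TACOTRON_MEL_MIN = -11.512925464970227
TACOTRON_MEL_MAX = 2.3026850929940455

def denormalize_tacotron_mel(norm_mel: torch.Tensor) -> torch.Tensor:
    # Rescale [-1, 1] back to [TACOTRON_MEL_MIN, TACOTRON_MEL_MAX].
    return ((norm_mel + 1) / 2) * (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN) + TACOTRON_MEL_MIN
```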