forked from mrq/DL-Art-School
fixes
This commit is contained in:
parent
b754058018
commit
f563a8dd41
|
@ -10,7 +10,7 @@ from audio2numpy import open_audio
|
|||
from tqdm import tqdm
|
||||
|
||||
from data.util import find_files_of_type, is_audio_file, load_paths_from_cache
|
||||
from models.audio.tts.tacotron2 import load_wav_to_torch
|
||||
from models.audio.tts.tacotron2.taco_utils import load_wav_to_torch
|
||||
from utils.util import opt_get
|
||||
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@ import random
|
|||
import torch
|
||||
import torchaudio.sox_effects
|
||||
|
||||
from models.audio.tts.tacotron2 import load_wav_to_torch
|
||||
from models.audio.tts.tacotron2.taco_utils import load_wav_to_torch
|
||||
|
||||
|
||||
# Returns random double on [l,h] as a string
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
from models.audio.tts.tacotron2.taco_utils import *
|
||||
from models.audio.tts.tacotron2.text import *
|
||||
from models.audio.tts.tacotron2.tacotron2 import *
|
||||
from models.audio.tts.tacotron2.stft import *
|
||||
from models.audio.tts.tacotron2.layers import *
|
|
@ -84,19 +84,15 @@ class UnivNetGenerator(nn.Module):
|
|||
def inference(self, c, z=None):
|
||||
# pad input mel with zeros to cut artifact
|
||||
# see https://github.com/seungwonpark/melgan/issues/8
|
||||
zero = torch.full((1, self.mel_channel, 10), -11.5129).to(c.device)
|
||||
zero = torch.full((c.shape[0], self.mel_channel, 10), -11.5129).to(c.device)
|
||||
mel = torch.cat((c, zero), dim=2)
|
||||
|
||||
if z is None:
|
||||
z = torch.randn(1, self.noise_dim, mel.size(2)).to(mel.device)
|
||||
z = torch.randn(c.shape[0], self.noise_dim, mel.size(2)).to(mel.device)
|
||||
|
||||
audio = self.forward(mel, z)
|
||||
audio = audio.squeeze() # collapse all dimension except time axis
|
||||
audio = audio[:-(self.hop_length * 10)]
|
||||
audio = MAX_WAV_VALUE * audio
|
||||
audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE - 1)
|
||||
audio = audio.short()
|
||||
|
||||
audio = audio[:, :, :-(self.hop_length * 10)]
|
||||
audio = audio.clamp(min=-1, max=1)
|
||||
return audio
|
||||
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@ import torch.nn.functional as F
|
|||
|
||||
from data.util import is_wav_file, find_files_of_type
|
||||
from models.audio_resnet import resnet50
|
||||
from models.audio.tts.tacotron2 import load_wav_to_torch
|
||||
from models.audio.tts.tacotron2.taco_utils import load_wav_to_torch
|
||||
from scripts.byol.byol_extract_wrapped_model import extract_byol_model_from_state_dict
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
@ -125,20 +125,21 @@ class AudioDiffusionFid(evaluator.Evaluator):
|
|||
real_resampled = torchaudio.functional.resample(audio, 22050, SAMPLE_RATE).unsqueeze(0)
|
||||
univnet_mel = wav_to_univnet_mel(audio) # to be used for a conditioning input
|
||||
|
||||
output_size = real_resampled.shape[-1]
|
||||
output_size = univnet_mel.shape[-1]
|
||||
aligned_codes_compression_factor = output_size // mel_codes.shape[-1]
|
||||
padded_size = ceil_multiple(output_size, 2048)
|
||||
padded_size = ceil_multiple(output_size, self.model.alignment_size)
|
||||
padding_added = padded_size - output_size
|
||||
padding_needed_for_codes = padding_added // aligned_codes_compression_factor
|
||||
if padding_needed_for_codes > 0:
|
||||
mel_codes = F.pad(mel_codes, (0, padding_needed_for_codes))
|
||||
output_shape = (1, 1, padded_size)
|
||||
output_shape = (1, 100, padded_size)
|
||||
gen_mel = self.diffuser.p_sample_loop(self.model, output_shape,
|
||||
model_kwargs={'aligned_conditioning': mel_codes,
|
||||
'conditioning_input': univnet_mel})
|
||||
|
||||
gen_wav = self.local_modules['vocoder'](gen_mel)
|
||||
return gen_wav, real_resampled, SAMPLE_RATE
|
||||
gen_wav = self.local_modules['vocoder'].inference(gen_mel)
|
||||
real_dec = self.local_modules['vocoder'].inference(univnet_mel)
|
||||
return gen_wav.float(), real_dec, SAMPLE_RATE
|
||||
|
||||
def load_projector(self):
|
||||
"""
|
||||
|
@ -257,9 +258,9 @@ if __name__ == '__main__':
|
|||
if __name__ == '__main__':
|
||||
from utils.util import load_model_from_config
|
||||
|
||||
diffusion = load_model_from_config('X:\\dlas\\experiments\\train_diffusion_tts9.yml', 'generator',
|
||||
diffusion = load_model_from_config('X:\\dlas\\experiments\\train_diffusion_tts9_mel.yml', 'generator',
|
||||
also_load_savepoint=False,
|
||||
load_path='X:\\dlas\\experiments\\train_diffusion_tts9\\models\\7500_generator_ema.pth').cuda()
|
||||
load_path='X:\\dlas\\experiments\\train_diffusion_tts9_mel\\models\\10000_generator_ema.pth').cuda()
|
||||
opt_eval = {'eval_tsv': 'Y:\\libritts\\test-clean\\transcribed-brief-w2v.tsv', 'diffusion_steps': 100,
|
||||
'conditioning_free': False, 'conditioning_free_k': 1,
|
||||
'diffusion_schedule': 'linear', 'diffusion_type': 'tts9_mel'}
|
||||
|
|
Loading…
Reference in New Issue
Block a user