forked from mrq/DL-Art-School
fixes
This commit is contained in:
parent
b754058018
commit
f563a8dd41
|
@ -10,7 +10,7 @@ from audio2numpy import open_audio
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from data.util import find_files_of_type, is_audio_file, load_paths_from_cache
|
from data.util import find_files_of_type, is_audio_file, load_paths_from_cache
|
||||||
from models.audio.tts.tacotron2 import load_wav_to_torch
|
from models.audio.tts.tacotron2.taco_utils import load_wav_to_torch
|
||||||
from utils.util import opt_get
|
from utils.util import opt_get
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,7 @@ import random
|
||||||
import torch
|
import torch
|
||||||
import torchaudio.sox_effects
|
import torchaudio.sox_effects
|
||||||
|
|
||||||
from models.audio.tts.tacotron2 import load_wav_to_torch
|
from models.audio.tts.tacotron2.taco_utils import load_wav_to_torch
|
||||||
|
|
||||||
|
|
||||||
# Returns random double on [l,h] as a string
|
# Returns random double on [l,h] as a string
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
from models.audio.tts.tacotron2.taco_utils import *
|
||||||
|
from models.audio.tts.tacotron2.text import *
|
||||||
|
from models.audio.tts.tacotron2.tacotron2 import *
|
||||||
|
from models.audio.tts.tacotron2.stft import *
|
||||||
|
from models.audio.tts.tacotron2.layers import *
|
|
@ -84,19 +84,15 @@ class UnivNetGenerator(nn.Module):
|
||||||
def inference(self, c, z=None):
|
def inference(self, c, z=None):
|
||||||
# pad input mel with zeros to cut artifact
|
# pad input mel with zeros to cut artifact
|
||||||
# see https://github.com/seungwonpark/melgan/issues/8
|
# see https://github.com/seungwonpark/melgan/issues/8
|
||||||
zero = torch.full((1, self.mel_channel, 10), -11.5129).to(c.device)
|
zero = torch.full((c.shape[0], self.mel_channel, 10), -11.5129).to(c.device)
|
||||||
mel = torch.cat((c, zero), dim=2)
|
mel = torch.cat((c, zero), dim=2)
|
||||||
|
|
||||||
if z is None:
|
if z is None:
|
||||||
z = torch.randn(1, self.noise_dim, mel.size(2)).to(mel.device)
|
z = torch.randn(c.shape[0], self.noise_dim, mel.size(2)).to(mel.device)
|
||||||
|
|
||||||
audio = self.forward(mel, z)
|
audio = self.forward(mel, z)
|
||||||
audio = audio.squeeze() # collapse all dimension except time axis
|
audio = audio[:, :, :-(self.hop_length * 10)]
|
||||||
audio = audio[:-(self.hop_length * 10)]
|
audio = audio.clamp(min=-1, max=1)
|
||||||
audio = MAX_WAV_VALUE * audio
|
|
||||||
audio = audio.clamp(min=-MAX_WAV_VALUE, max=MAX_WAV_VALUE - 1)
|
|
||||||
audio = audio.short()
|
|
||||||
|
|
||||||
return audio
|
return audio
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,7 @@ import torch.nn.functional as F
|
||||||
|
|
||||||
from data.util import is_wav_file, find_files_of_type
|
from data.util import is_wav_file, find_files_of_type
|
||||||
from models.audio_resnet import resnet50
|
from models.audio_resnet import resnet50
|
||||||
from models.audio.tts.tacotron2 import load_wav_to_torch
|
from models.audio.tts.tacotron2.taco_utils import load_wav_to_torch
|
||||||
from scripts.byol.byol_extract_wrapped_model import extract_byol_model_from_state_dict
|
from scripts.byol.byol_extract_wrapped_model import extract_byol_model_from_state_dict
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
|
@ -125,20 +125,21 @@ class AudioDiffusionFid(evaluator.Evaluator):
|
||||||
real_resampled = torchaudio.functional.resample(audio, 22050, SAMPLE_RATE).unsqueeze(0)
|
real_resampled = torchaudio.functional.resample(audio, 22050, SAMPLE_RATE).unsqueeze(0)
|
||||||
univnet_mel = wav_to_univnet_mel(audio) # to be used for a conditioning input
|
univnet_mel = wav_to_univnet_mel(audio) # to be used for a conditioning input
|
||||||
|
|
||||||
output_size = real_resampled.shape[-1]
|
output_size = univnet_mel.shape[-1]
|
||||||
aligned_codes_compression_factor = output_size // mel_codes.shape[-1]
|
aligned_codes_compression_factor = output_size // mel_codes.shape[-1]
|
||||||
padded_size = ceil_multiple(output_size, 2048)
|
padded_size = ceil_multiple(output_size, self.model.alignment_size)
|
||||||
padding_added = padded_size - output_size
|
padding_added = padded_size - output_size
|
||||||
padding_needed_for_codes = padding_added // aligned_codes_compression_factor
|
padding_needed_for_codes = padding_added // aligned_codes_compression_factor
|
||||||
if padding_needed_for_codes > 0:
|
if padding_needed_for_codes > 0:
|
||||||
mel_codes = F.pad(mel_codes, (0, padding_needed_for_codes))
|
mel_codes = F.pad(mel_codes, (0, padding_needed_for_codes))
|
||||||
output_shape = (1, 1, padded_size)
|
output_shape = (1, 100, padded_size)
|
||||||
gen_mel = self.diffuser.p_sample_loop(self.model, output_shape,
|
gen_mel = self.diffuser.p_sample_loop(self.model, output_shape,
|
||||||
model_kwargs={'aligned_conditioning': mel_codes,
|
model_kwargs={'aligned_conditioning': mel_codes,
|
||||||
'conditioning_input': univnet_mel})
|
'conditioning_input': univnet_mel})
|
||||||
|
|
||||||
gen_wav = self.local_modules['vocoder'](gen_mel)
|
gen_wav = self.local_modules['vocoder'].inference(gen_mel)
|
||||||
return gen_wav, real_resampled, SAMPLE_RATE
|
real_dec = self.local_modules['vocoder'].inference(univnet_mel)
|
||||||
|
return gen_wav.float(), real_dec, SAMPLE_RATE
|
||||||
|
|
||||||
def load_projector(self):
|
def load_projector(self):
|
||||||
"""
|
"""
|
||||||
|
@ -257,9 +258,9 @@ if __name__ == '__main__':
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
from utils.util import load_model_from_config
|
from utils.util import load_model_from_config
|
||||||
|
|
||||||
diffusion = load_model_from_config('X:\\dlas\\experiments\\train_diffusion_tts9.yml', 'generator',
|
diffusion = load_model_from_config('X:\\dlas\\experiments\\train_diffusion_tts9_mel.yml', 'generator',
|
||||||
also_load_savepoint=False,
|
also_load_savepoint=False,
|
||||||
load_path='X:\\dlas\\experiments\\train_diffusion_tts9\\models\\7500_generator_ema.pth').cuda()
|
load_path='X:\\dlas\\experiments\\train_diffusion_tts9_mel\\models\\10000_generator_ema.pth').cuda()
|
||||||
opt_eval = {'eval_tsv': 'Y:\\libritts\\test-clean\\transcribed-brief-w2v.tsv', 'diffusion_steps': 100,
|
opt_eval = {'eval_tsv': 'Y:\\libritts\\test-clean\\transcribed-brief-w2v.tsv', 'diffusion_steps': 100,
|
||||||
'conditioning_free': False, 'conditioning_free_k': 1,
|
'conditioning_free': False, 'conditioning_free_k': 1,
|
||||||
'diffusion_schedule': 'linear', 'diffusion_type': 'tts9_mel'}
|
'diffusion_schedule': 'linear', 'diffusion_type': 'tts9_mel'}
|
||||||
|
|
Loading…
Reference in New Issue
Block a user