some stuff
parent ab5acead0e
commit 8202b9f39c
@@ -45,7 +45,7 @@ def is_wav_file(filename):


 def is_audio_file(filename):
-    AUDIO_EXTENSIONS = ['.wav', '.mp3', '.wma', 'm4b']
+    AUDIO_EXTENSIONS = ['.wav', '.mp3', '.wma', '.m4b', '.flac']
     return any(filename.endswith(extension) for extension in AUDIO_EXTENSIONS)

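For reference, a minimal sketch of how the updated helper behaves after this change; the function and extension list are copied from the hunk above, and the example filenames are made up:

AUDIO_EXTENSIONS = ['.wav', '.mp3', '.wma', '.m4b', '.flac']

def is_audio_file(filename):
    return any(filename.endswith(extension) for extension in AUDIO_EXTENSIONS)

print(is_audio_file('song.flac'))   # True (the '.flac' extension is newly accepted)
print(is_audio_file('book.m4b'))    # True (now matched with the leading dot)
print(is_audio_file('notes.txt'))   # False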
@@ -1,10 +1,12 @@
 import argparse

 import torch
 import torchaudio

 from data.audio.unsupervised_audio_dataset import load_audio
 from scripts.audio.gen.speech_synthesis_utils import do_spectrogram_diffusion, \
-    load_discrete_vocoder_diffuser, wav_to_mel, convert_mel_to_codes
+    load_discrete_vocoder_diffuser, wav_to_mel, convert_mel_to_codes, wav_to_univnet_mel, load_univnet_vocoder
+from trainer.injectors.audio_injectors import denormalize_mel
 from utils.audio import plot_spectrogram
 from utils.util import load_model_from_config

@@ -24,28 +26,30 @@ def roundtrip_vocoding(dvae, vocoder, diffuser, clip, cond=None, plot_spec=False

 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
+    parser.add_argument('-codes_file', type=str, help='Which discretes to decode. Should be a path to a pytorch pickle that simply contains the codes.')
+    parser.add_argument('-cond_file', type=str, help='Path to the input audio file.')
     parser.add_argument('-opt', type=str, help='Path to options YAML file used to train the diffusion model',
-                        default='X:\\dlas\\experiments\\train_diffusion_vocoder_22k_level.yml')
+                        default='X:\\dlas\\experiments\\train_diffusion_tts_mel_flat0\\last_train.yml')
     parser.add_argument('-diffusion_model_name', type=str, help='Name of the diffusion model in opt.', default='generator')
-    parser.add_argument('-diffusion_model_path', type=str, help='Diffusion model checkpoint to load.', default='X:\\dlas\\experiments\\train_diffusion_vocoder_22k_level\\models\\2500_generator.pth')
-    parser.add_argument('-dvae_model_name', type=str, help='Name of the DVAE model in opt.', default='dvae')
-    parser.add_argument('-input_file', type=str, help='Path to the input audio file.', default='Y:\\clips\\books1\\3_dchha04 Romancing The Tribes\\00036.wav')
-    parser.add_argument('-cond', type=str, help='Path to the conditioning input audio file.', default='Y:\\clips\\books1\\3042_18_Holden__000000000\\00037.wav')
+    parser.add_argument('-diffusion_model_path', type=str, help='Diffusion model checkpoint to load.', default='X:\\dlas\\experiments\\train_diffusion_tts_mel_flat0\\models\\114000_generator_ema.pth')
     args = parser.parse_args()

-    print("Loading DVAE..")
-    dvae = load_model_from_config(args.opt, args.dvae_model_name)
-    print("Loading Diffusion Model..")
-    diffusion = load_model_from_config(args.opt, args.diffusion_model_name, also_load_savepoint=False, load_path=args.diffusion_model_path)
-
-    print("Loading data..")
-    diffuser = load_discrete_vocoder_diffuser()
-    inp = load_audio(args.input_file, 22050).cuda()
-    cond = inp if args.cond is None else load_audio(args.cond, 22050)
-    if cond.shape[-1] > 44100+10000:
-        cond = cond[:,10000:54100]
-    cond = cond.cuda()
+    codes = torch.load(args.codes_file)
+    conds = load_audio(args.cond_file, 24000)
+    conds = conds[:,:102400]
+    cond_mel = wav_to_univnet_mel(conds.to('cuda'), do_normalization=False)
+    output_shape = (1,100,codes.shape[-1]*4)

-    print("Performing inference..")
-    roundtripped = roundtrip_vocoding(dvae, diffusion, diffuser, inp, cond).cpu()
-    torchaudio.save('roundtrip_vocoded_output.wav', roundtripped.squeeze(0), 22050)
+    print("Loading Diffusion Model..")
+    diffusion = load_model_from_config(args.opt, args.diffusion_model_name, also_load_savepoint=False, load_path=args.diffusion_model_path, strict_load=False).cuda().eval()
+    diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=50, schedule='linear', enable_conditioning_free_guidance=True, conditioning_free_k=1)
+    vocoder = load_univnet_vocoder().cuda()
+
+    with torch.no_grad():
+        print("Performing inference..")
+        for i in range(codes.shape[0]):
+            gen_mel = diffuser.p_sample_loop(diffusion, output_shape, model_kwargs={'aligned_conditioning': codes[i].unsqueeze(0), 'conditioning_input': cond_mel})
+            gen_mel = denormalize_mel(gen_mel)
+            genWav = vocoder.inference(gen_mel)
+            torchaudio.save(f'vocoded_{i}.wav', genWav.cpu().squeeze(0), 24000)

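A minimal sketch of preparing the -codes_file input that the rewritten __main__ block expects. The (N, T) layout is inferred from codes[i].unsqueeze(0) and output_shape = (1,100,codes.shape[-1]*4) in the hunk above; the vocabulary size and the script filename in the comment are hypothetical:

import torch

# Hypothetical: 3 code sequences of length 200, drawn from an assumed vocabulary of 8192.
codes = torch.randint(0, 8192, (3, 200), dtype=torch.long)
torch.save(codes, 'codes.pth')

# Invocation sketch (the script's real filename is not shown in this diff):
#   python decode_codes.py -codes_file codes.pth -cond_file cond.wav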
@@ -1,6 +1,7 @@
 import os
 import os.path as osp
 from glob import glob
+from random import shuffle

 import numpy as np
 import torch

@@ -63,6 +64,8 @@ class MusicDiffusionFid(evaluator.Evaluator):
                 self.gap_gen_fn = self.gen_freq_gap
             else:
                 self.gap_gen_fn = self.gen_time_gap
+        elif 'rerender' in mode:
+            self.diffusion_fn = self.perform_rerender
         self.spec_fn = TorchMelSpectrogramInjector({'n_mel_channels': 256, 'mel_fmax': 22000, 'normalize': True, 'in': 'in', 'out': 'out'}, {})

     def load_data(self, path):

@@ -80,7 +83,7 @@ class MusicDiffusionFid(evaluator.Evaluator):
                                           model_kwargs={'aligned_conditioning': mel})
         gen = pixel_shuffle_1d(gen, 16)

-        return gen, real_resampled, self.spec_fn({'in': gen})['out'], mel, sample_rate
+        return gen, real_resampled, normalize_mel(self.spec_fn({'in': gen})['out']), normalize_mel(mel), sample_rate

     def gen_freq_gap(self, mel, band_range=(60,100)):
         gap_start, gap_end = band_range

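The eval path above now passes both the generated and reference spectrograms through normalize_mel so they are compared on the same scale. As a rough sketch only (the real normalize_mel/denormalize_mel in trainer.injectors.audio_injectors may use different bounds), this kind of normalization maps a fixed log-mel range into [-1, 1]:

import torch

# Hypothetical bounds; the injector module defines its own constants.
MEL_MIN, MEL_MAX = -11.5, 2.3

def normalize_mel_sketch(mel: torch.Tensor) -> torch.Tensor:
    # Map a log-mel spectrogram from [MEL_MIN, MEL_MAX] into [-1, 1].
    return 2 * (mel - MEL_MIN) / (MEL_MAX - MEL_MIN) - 1

def denormalize_mel_sketch(mel: torch.Tensor) -> torch.Tensor:
    # Inverse of normalize_mel_sketch.
    return (mel + 1) / 2 * (MEL_MAX - MEL_MIN) + MEL_MIN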
@@ -119,6 +122,38 @@ class MusicDiffusionFid(evaluator.Evaluator):

         return gen, real_resampled, normalize_mel(spec), mel, sample_rate

+    def perform_rerender(self, audio, sample_rate=22050):
+        if sample_rate != sample_rate:
+            real_resampled = torchaudio.functional.resample(audio, 22050, sample_rate).unsqueeze(0)
+        else:
+            real_resampled = audio
+        audio = audio.unsqueeze(0)
+
+        # Fetch the MEL and mask out the requested bands.
+        mel = self.spec_fn({'in': audio})['out']
+        mel = normalize_mel(mel)
+
+        segments = [(0,10),(10,25),(25,45),(45,60),(60,80),(80,100),(100,130),(130,170),(170,210),(210,256)]
+        shuffle(segments)
+        spec = mel
+        for i, segment in enumerate(segments):
+            mel, mask = self.gen_freq_gap(mel, band_range=segment)
+            # Repair the MEL with the given model.
+            spec = self.diffuser.p_sample_loop_with_guidance(self.model, spec, mask, model_kwargs={'truth': spec})
+            torchvision.utils.save_image((spec.unsqueeze(1) + 1) / 2, f"{i}_rerender.png")
+
+        spec = denormalize_mel(spec)
+
+        # Re-convert the resulting MEL back into audio using the spectrogram decoder.
+        output_shape = (1, 16, audio.shape[-1] // 16)
+        self.spec_decoder = self.spec_decoder.to(audio.device)
+        # Cool fact: we can re-use the diffuser for the spectrogram diffuser since it has the same parametrization.
+        gen = self.diffuser.p_sample_loop(self.spec_decoder, output_shape, noise=torch.zeros(*output_shape, device=audio.device),
+                                          model_kwargs={'aligned_conditioning': spec})
+        gen = pixel_shuffle_1d(gen, 16)
+
+        return gen, real_resampled, normalize_mel(spec), mel, sample_rate
+
     def project(self, sample, sample_rate):
         sample = torchaudio.functional.resample(sample, sample_rate, 22050)
         mel = self.spec_fn({'in': sample})['out']

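perform_rerender leans on gen_freq_gap to blank out one mel band at a time before each guided repair pass. A hypothetical illustration of such a band mask, matching the (masked_mel, mask) return shape used at the call site (the actual gen_freq_gap in this class may build it differently); the mask is 1 where the spectrogram is kept and 0 inside the gap:

import torch

def freq_gap_mask(mel: torch.Tensor, band_range=(60, 100)):
    # mel is assumed to be (batch, n_mel_channels, time); zero out the chosen band.
    gap_start, gap_end = band_range
    mask = torch.ones_like(mel)
    mask[:, gap_start:gap_end, :] = 0
    return mel * mask, mask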
@@ -182,12 +217,12 @@ class MusicDiffusionFid(evaluator.Evaluator):


 if __name__ == '__main__':
-    diffusion = load_model_from_config('X:\\dlas\\experiments\\train_music_gap_filler.yml', 'generator',
+    diffusion = load_model_from_config('D:\\dlas\\options\\train_music_waveform_gen3.yml', 'generator',
                                        also_load_savepoint=False,
-                                       load_path='X:\\dlas\\experiments\\train_music_gap_filler2\\models\\20500_generator_ema.pth').cuda()
-    opt_eval = {'path': 'Y:\\split\\yt-music-eval', 'diffusion_steps': 50,
+                                       load_path='D:\\dlas\\experiments\\train_music_waveform_gen\\models\\59000_generator_ema.pth').cuda()
+    opt_eval = {'path': 'Y:\\split\\yt-music-eval', 'diffusion_steps': 400,
                 'conditioning_free': False, 'conditioning_free_k': 1,
-                'diffusion_schedule': 'linear', 'diffusion_type': 'gap_fill_time'}
-    env = {'rank': 0, 'base_path': 'D:\\tmp\\test_eval_music', 'step': 3, 'device': 'cuda', 'opt': {}}
+                'diffusion_schedule': 'linear', 'diffusion_type': 'spec_decode'}
+    env = {'rank': 0, 'base_path': 'D:\\tmp\\test_eval_music', 'step': 20, 'device': 'cuda', 'opt': {}}
     eval = MusicDiffusionFid(diffusion, opt_eval, env)
     print(eval.perform_eval())