2021-12-10 05:59:56 +00:00
import argparse
import torchaudio
from data . audio . unsupervised_audio_dataset import load_audio
2021-12-17 03:47:37 +00:00
from scripts . audio . gen . speech_synthesis_utils import do_spectrogram_diffusion , \
2021-12-10 05:59:56 +00:00
load_discrete_vocoder_diffuser , wav_to_mel , convert_mel_to_codes
from utils . audio import plot_spectrogram
from utils . util import load_model_from_config
def roundtrip_vocoding(dvae, vocoder, diffuser, clip, cond=None, plot_spec=False):
    """Encode an audio clip to discrete codes and decode it back with the diffusion vocoder.

    The clip is converted to a mel spectrogram with ``wav_to_mel``, quantized with
    ``convert_mel_to_codes`` using ``dvae``, and the codes are handed to
    ``do_spectrogram_diffusion`` together with the conditioning signal.

    Args:
        dvae: Model passed to ``convert_mel_to_codes`` and ``do_spectrogram_diffusion``.
        vocoder: Diffusion model passed to ``do_spectrogram_diffusion``.
        diffuser: Diffuser object passed to ``do_spectrogram_diffusion``.
        clip: Audio tensor without a leading batch dimension; one is added here
            via ``unsqueeze(0)``. (Presumably shaped (channels, samples) -- confirm
            against ``load_audio``.)
        cond: Optional conditioning audio tensor, also without a batch dimension.
            When omitted, the batched input clip is used as its own conditioning.
        plot_spec: When true, the first mel spectrogram in the batch is plotted and
            the flag is forwarded to ``do_spectrogram_diffusion`` as ``plt_spec``.

    Returns:
        Whatever ``do_spectrogram_diffusion`` returns for the given codes.
    """
    batched_clip = clip.unsqueeze(0)
    conditioning = batched_clip if cond is None else cond.unsqueeze(0)

    spectrogram = wav_to_mel(batched_clip)
    if plot_spec:
        # Plotting is done on the CPU copy of the first (only) batch element.
        plot_spectrogram(spectrogram[0].cpu())

    return do_spectrogram_diffusion(
        vocoder,
        dvae,
        diffuser,
        convert_mel_to_codes(dvae, spectrogram),
        conditioning,
        spectrogram_compression_factor=256,
        plt_spec=plot_spec,
    )
2021-12-10 05:59:56 +00:00
if __name__ == '__main__':
    # Script entry point: load the DVAE and the diffusion vocoder described by the options
    # file, push one audio clip through roundtrip_vocoding() and save the reconstruction
    # as a wav file in the current working directory.
    #
    # The guard literal, the option flags, the default paths and the output filename must
    # not carry surrounding spaces: a padded '__main__' never matches __name__, and a flag
    # that does not start with '-' is registered by argparse as a required positional.
    parser = argparse.ArgumentParser()
    parser.add_argument('-opt', type=str, help=' Path to options YAML file used to train the diffusion model ',
                        default='X:\\dlas\\experiments\\train_diffusion_vocoder_22k_level.yml')
    parser.add_argument('-diffusion_model_name', type=str, help=' Name of the diffusion model in opt. ',
                        default='generator')
    parser.add_argument('-diffusion_model_path', type=str, help=' Diffusion model checkpoint to load. ',
                        default='X:\\dlas\\experiments\\train_diffusion_vocoder_22k_level\\models\\2500_generator.pth')
    parser.add_argument('-dvae_model_name', type=str, help=' Name of the DVAE model in opt. ',
                        default='dvae')
    parser.add_argument('-input_file', type=str, help=' Path to the input audio file. ',
                        default='Y:\\clips\\books1\\3_dchha04 Romancing The Tribes\\00036.wav')
    parser.add_argument('-cond', type=str, help=' Path to the conditioning input audio file. ',
                        default='Y:\\clips\\books1\\3042_18_Holden__000000000\\00037.wav')
    args = parser.parse_args()

    print(" Loading DVAE.. ")
    dvae = load_model_from_config(args.opt, args.dvae_model_name)

    print(" Loading Diffusion Model.. ")
    diffusion = load_model_from_config(args.opt, args.diffusion_model_name, also_load_savepoint=False,
                                       load_path=args.diffusion_model_path)

    print(" Loading data.. ")
    diffuser = load_discrete_vocoder_diffuser()
    # 22050 is passed to load_audio and later to torchaudio.save as the sample rate.
    inp = load_audio(args.input_file, 22050).cuda()
    # NOTE(review): '-cond' has a non-None default, so the `inp` fallback is only reachable
    # if that default is removed -- confirm whether that is intended.
    cond = inp if args.cond is None else load_audio(args.cond, 22050)
    # Long conditioning clips are cropped to a 44100-sample window starting at sample 10000.
    if cond.shape[-1] > 44100 + 10000:
        cond = cond[:, 10000:54100]
    cond = cond.cuda()

    print(" Performing inference.. ")
    roundtripped = roundtrip_vocoding(dvae, diffusion, diffuser, inp, cond).cpu()
    # squeeze(0) removes the leading dimension of the result before saving.
    torchaudio.save('roundtrip_vocoded_output.wav', roundtripped.squeeze(0), 22050)