diff --git a/codes/scripts/audio/speech_synthesis_utils.py b/codes/scripts/audio/speech_synthesis_utils.py index 046c1af6..af30b307 100644 --- a/codes/scripts/audio/speech_synthesis_utils.py +++ b/codes/scripts/audio/speech_synthesis_utils.py @@ -11,11 +11,11 @@ from trainer.injectors.base_injectors import TorchMelSpectrogramInjector from utils.audio import plot_spectrogram -def wav_to_mel(wav): +def wav_to_mel(wav, mel_norms_file='../experiments/clips_mel_norms.pth'): """ Converts an audio clip into a MEL tensor that the vocoder, DVAE and GptTts models use whenever a MEL is called for. """ - return TorchMelSpectrogramInjector({'in': 'wav', 'out': 'mel'},{})({'wav': wav})['mel'] + return TorchMelSpectrogramInjector({'in': 'wav', 'out': 'mel', 'mel_norm_file': mel_norms_file},{})({'wav': wav})['mel'] def convert_mel_to_codes(dvae_model, mel): diff --git a/codes/trainer/injectors/base_injectors.py b/codes/trainer/injectors/base_injectors.py index e909a31a..d82a8a84 100644 --- a/codes/trainer/injectors/base_injectors.py +++ b/codes/trainer/injectors/base_injectors.py @@ -632,7 +632,7 @@ class TorchMelSpectrogramInjector(Injector): def test_torch_mel_injector(): a = load_audio('D:\\data\\audio\\libritts\\train-clean-100\\19\\198\\19_198_000000_000000.wav', 22050) - inj = TorchMelSpectrogramInjector({'in': 'in', 'out': 'out'}, {}) + inj = TorchMelSpectrogramInjector({'in': 'in', 'out': 'out', 'mel_norm_file': '../experiments/clips_mel_norms.pth'}, {}) f = inj({'in': a.unsqueeze(0)})['out'] plot_spectrogram(f[0]) inj = MelSpectrogramInjector({'in': 'in', 'out': 'out'}, {})