DL-Art-School/codes/scripts/audio/test_audio_segmentor.py
2022-03-15 11:06:25 -06:00

105 lines
4.0 KiB
Python

import os.path as osp
import logging
import random
import argparse
import audio2numpy
from munch import munchify
import utils
import utils.options as option
import utils.util as util
from data.audio.nv_tacotron_dataset import save_mel_buffer_to_file
from models.audio.tts.tacotron2 import hparams
from models.audio.tts.tacotron2 import TacotronSTFT
from models.audio.tts.tacotron2 import sequence_to_text
from scripts.audio.use_vocoder import Vocoder
from trainer.ExtensibleTrainer import ExtensibleTrainer
import torch
import numpy as np
from scipy.io import wavfile
def forward_pass(model, data, output_dir, opt, b):
    """Run one evaluation batch through ``model`` and dump its outputs.

    Feeds ``data`` to the model, runs its test pass under
    ``torch.no_grad()``, prints the predicted text sequences and writes the
    first audio entry of the evaluation state to
    ``<output_dir>/<b>_clip.wav``.

    Args:
        model: Object exposing ``feed_data``, ``test`` and an ``eval_state``
            mapping.
        data: Batch handed to ``model.feed_data``; also indexed with
            ``opt['eval']['real_text']`` when that option is present.
        output_dir: Directory the audio clip is written into.
        opt: Options mapping; ``opt['eval']`` provides the ``gen_text`` and
            ``audio`` keys and, optionally, ``real_text``.
        b: Batch identifier used as a prefix in the printed lines and in the
            output file name.
    """
    with torch.no_grad():
        model.feed_data(data, 0)
        model.test()

    eval_opt = opt['eval']

    # The ground-truth text is optional; print it only when configured.
    if 'real_text' in eval_opt:
        real = data[eval_opt['real_text']][0]
        print(f'{b} Real text: "{real}"')

    # Only the first element of each evaluation-state entry is inspected.
    pred_seq = model.eval_state[eval_opt['gen_text']][0]
    pred_text = [sequence_to_text(ts) for ts in pred_seq]

    audio = model.eval_state[eval_opt['audio']][0].cpu().numpy()
    # NOTE(review): 22050 Hz is hard-coded here; presumably it matches the
    # sample rate of the generated audio -- confirm against the model config.
    wavfile.write(osp.join(output_dir, f'{b}_clip.wav'), 22050, audio)

    for i, text in enumerate(pred_text):
        print(f'{b} Predicted text {i}: "{text}"')
if __name__ == "__main__":
    # Script entry point: load a single-network trainer config, convert one
    # audio file to a mel spectrogram, slide the network over it to find
    # predicted segment ends, and write each accepted segment to the current
    # working directory as '<n>.npy' (mel) and '<n>.wav' (vocoded audio).

    # Default clip to segment; can be overridden with ``-input``.
    input_file = "E:\\audio\\books\\Roald Dahl Audiobooks\\Roald Dahl - The BFG\\(Roald Dahl) The BFG - 07.mp3"
    config = "../options/train_gpt_stop_libritts.yml"
    # Sigmoid score above which a position counts as a detected segment end.
    cutoff_pred_percent = .2

    # Set seeds
    torch.manual_seed(5555)
    random.seed(5555)
    np.random.seed(5555)

    #### options
    torch.backends.cudnn.benchmark = True
    parser = argparse.ArgumentParser()
    parser.add_argument('-opt', type=str, help='Path to options YAML file.', default=config)
    parser.add_argument('-input', type=str, help='Path to the audio file to segment.',
                        default=input_file)
    args = parser.parse_args()
    input_file = args.input
    opt = option.parse(args.opt, is_train=False)
    opt = option.dict_to_nonedict(opt)
    utils.util.loaded_options = opt
    hp = munchify(hparams.create_hparams())

    # Create every configured path except the experiment root and any
    # pretrained-model / resume entries.
    util.mkdirs(
        (path for key, path in opt['path'].items()
         if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key))
    util.setup_logger('base', opt['path']['log'], 'test_' + opt['name'], level=logging.INFO,
                      screen=True, tofile=True)
    logger = logging.getLogger('base')
    logger.info(option.dict2str(opt))

    # The trainer must hold exactly one network; unwrap it and move it to GPU.
    model = ExtensibleTrainer(opt)
    assert len(model.networks) == 1
    model = model.networks[next(iter(model.networks.keys()))].module.to('cuda')
    model.eval()

    vocoder = Vocoder()

    audio, sr = audio2numpy.audio_from_file(input_file)
    if len(audio.shape) == 2:
        # Two-dimensional audio: keep only the first column.
        # (presumably the first channel of a multi-channel file -- confirm)
        audio = audio[:, 0]
    audio = torch.tensor(audio, device='cuda').unsqueeze(0).unsqueeze(0)
    # Resample to the rate configured in the hparams via nearest-neighbour
    # interpolation, then drop the channel dimension added above.
    audio = torch.nn.functional.interpolate(
        audio, scale_factor=hp.sampling_rate/sr, mode='nearest').squeeze(1)
    stft = TacotronSTFT(hp.filter_length, hp.hop_length, hp.win_length, hp.n_mel_channels,
                        hp.sampling_rate, hp.mel_fmin, hp.mel_fmax).to('cuda')
    mels = stft.mel_spectrogram(audio)

    with torch.no_grad():
        sentence_number = 0
        last_detection_start = 0
        start = 0
        clip_size = model.max_mel_frames
        # Slide a clip_size-wide window over the mel frames until it would run
        # past the end of the spectrogram.
        while start+clip_size < mels.shape[-1]:
            clip = mels[:, :, start:start+clip_size]
            # The first output (start predictions) is not used by this script.
            _, pred_ends = model(clip)
            # Squeeze off the batch and sigmoid dimensions, leaving only the sequence dimension.
            pred_ends = torch.sigmoid(pred_ends).squeeze(-1).squeeze(0)
            indices = torch.nonzero(pred_ends > cutoff_pred_percent)
            for i in indices:
                i = i.item()
                sentence = mels[0, :, last_detection_start:start+i]
                # Keep only segments strictly between 400 and 1600 frames long.
                if 400 < sentence.shape[-1] < 1600:
                    save_mel_buffer_to_file(sentence, f'{sentence_number}.npy')
                    wav = vocoder.transform_mel_to_audio(sentence)
                    # NOTE(review): 22050 Hz is hard-coded; presumably the
                    # vocoder's output rate -- confirm.
                    wavfile.write(f'{sentence_number}.wav', 22050, wav[0].cpu().numpy())
                    sentence_number += 1
                # NOTE(review): the source's indentation was lost; the cursor is
                # advanced for every detection (not only accepted ones) so that
                # a rejected segment cannot grow without bound -- confirm.
                last_detection_start = start+i
            # Advance the window by 4 frames, or jump straight to the most
            # recent detection if that lies further ahead.
            start += 4
            if last_detection_start > start:
                start = last_detection_start