From 8332923f5c43cf8fd75d1351db55d6c64e3100d7 Mon Sep 17 00:00:00 2001
From: James Betker <jbetker@gmail.com>
Date: Tue, 17 Aug 2021 09:09:11 -0600
Subject: [PATCH] Two more tools to test the audio segmentor

---
 codes/data/audio/nv_tacotron_dataset.py       |   2 +-
 codes/models/gpt_voice/gpt_audio_segmentor.py |  22 ++--
 codes/scripts/audio/test_audio_segmentor.py   | 106 ++++++++++++++++++
 codes/scripts/audio/use_vocoder.py            |  28 +++++
 codes/train.py                                |   2 +-
 5 files changed, 148 insertions(+), 12 deletions(-)
 create mode 100644 codes/scripts/audio/test_audio_segmentor.py
 create mode 100644 codes/scripts/audio/use_vocoder.py

diff --git a/codes/data/audio/nv_tacotron_dataset.py b/codes/data/audio/nv_tacotron_dataset.py
index 5099cdee..bbfedc2d 100644
--- a/codes/data/audio/nv_tacotron_dataset.py
+++ b/codes/data/audio/nv_tacotron_dataset.py
@@ -219,7 +219,7 @@ class TextMelCollate():
 
 
 def save_mel_buffer_to_file(mel, path):
-    np.save(path, mel.numpy())
+    np.save(path, mel.cpu().numpy())
 
 
 def dump_mels_to_disk():
diff --git a/codes/models/gpt_voice/gpt_audio_segmentor.py b/codes/models/gpt_voice/gpt_audio_segmentor.py
index 10276cf9..e801c533 100644
--- a/codes/models/gpt_voice/gpt_audio_segmentor.py
+++ b/codes/models/gpt_voice/gpt_audio_segmentor.py
@@ -64,24 +64,26 @@ class GptSegmentor(nn.Module):
         self.final_norm = nn.LayerNorm(model_dim)
         self.stop_head = nn.Linear(model_dim, 1)
 
-    def forward(self, mel_inputs, termination_points):
+    def forward(self, mel_inputs, termination_points=None):
         mel_emb = self.mel_encoder(mel_inputs)
         mel_emb = mel_emb.permute(0,2,1).contiguous()
         mel_emb = mel_emb + self.mel_pos_embedding(torch.arange(mel_emb.shape[1], device=mel_emb.device))
 
         enc = self.gpt(mel_emb)
-
-        # The MEL gets decimated to 1/4 the size by the encoder, so we need to do the same to the termination points.
-        termination_points = F.interpolate(termination_points.unsqueeze(1), size=mel_emb.shape[1], mode='area').squeeze()
-        termination_points = (termination_points > 0).float()
-
-        # Compute loss
-        b, s, _ = enc.shape
         stop_logits = self.final_norm(enc)
         stop_logits = self.stop_head(stop_logits)
-        loss = F.binary_cross_entropy_with_logits(stop_logits.squeeze(-1), termination_points)
 
-        return loss.mean()
+        if termination_points is not None:
+            # The MEL gets decimated to 1/4 the size by the encoder, so we need to do the same to the termination points.
+            termination_points = F.interpolate(termination_points.unsqueeze(1), size=mel_emb.shape[1], mode='area').squeeze()
+            termination_points = (termination_points > 0).float()
+
+            # Compute loss
+            loss = F.binary_cross_entropy_with_logits(stop_logits.squeeze(-1), termination_points)
+            return loss.mean()
+        else:
+            return stop_logits
+
 
 
 @register_model
diff --git a/codes/scripts/audio/test_audio_segmentor.py b/codes/scripts/audio/test_audio_segmentor.py
new file mode 100644
index 00000000..bfbdcee6
--- /dev/null
+++ b/codes/scripts/audio/test_audio_segmentor.py
@@ -0,0 +1,106 @@
+import os.path as osp
+import logging
+import random
+import argparse
+
+import audio2numpy
+import torchvision
+from munch import munchify
+
+import utils
+import utils.options as option
+import utils.util as util
+from data.audio.nv_tacotron_dataset import save_mel_buffer_to_file
+from models.tacotron2 import hparams
+from models.tacotron2.layers import TacotronSTFT
+from models.tacotron2.text import sequence_to_text
+from scripts.audio.use_vocoder import Vocoder
+from trainer.ExtensibleTrainer import ExtensibleTrainer
+from data import create_dataset, create_dataloader
+from tqdm import tqdm
+import torch
+import numpy as np
+from scipy.io import wavfile
+
+
+def forward_pass(model, data, output_dir, opt, b):
+    with torch.no_grad():
+        model.feed_data(data, 0)
+        model.test()
+
+    if 'real_text' in opt['eval'].keys():
+        real = data[opt['eval']['real_text']][0]
+        print(f'{b} Real text: "{real}"')
+
+    pred_seq = model.eval_state[opt['eval']['gen_text']][0]
+    pred_text = [sequence_to_text(ts) for ts in pred_seq]
+    audio = model.eval_state[opt['eval']['audio']][0].cpu().numpy()
+    wavfile.write(osp.join(output_dir, f'{b}_clip.wav'), 22050, audio)
+    for i, text in enumerate(pred_text):
+        print(f'{b} Predicted text {i}: "{text}"')
+
+
+if __name__ == "__main__":
+    input_file = "E:\\audio\\books\\Roald Dahl Audiobooks\\Roald Dahl - The BFG\\(Roald Dahl) The BFG - 07.mp3"
+    config = "../options/train_gpt_stop_libritts.yml"
+    cutoff_pred_percent = .2
+
+    # Set seeds
+    torch.manual_seed(5555)
+    random.seed(5555)
+    np.random.seed(5555)
+
+    #### options
+    torch.backends.cudnn.benchmark = True
+    want_metrics = False
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-opt', type=str, help='Path to options YAML file.', default=config)
+    opt = option.parse(parser.parse_args().opt, is_train=False)
+    opt = option.dict_to_nonedict(opt)
+    utils.util.loaded_options = opt
+    hp = munchify(hparams.create_hparams())
+
+    util.mkdirs(
+        (path for key, path in opt['path'].items()
+         if not key == 'experiments_root' and 'pretrain_model' not in key and 'resume' not in key))
+    util.setup_logger('base', opt['path']['log'], 'test_' + opt['name'], level=logging.INFO,
+                      screen=True, tofile=True)
+    logger = logging.getLogger('base')
+    logger.info(option.dict2str(opt))
+
+    model = ExtensibleTrainer(opt)
+    assert len(model.networks) == 1
+    model = model.networks[next(iter(model.networks.keys()))].module.to('cuda')
+    model.eval()
+
+    vocoder = Vocoder()
+
+    audio, sr = audio2numpy.audio_from_file(input_file)
+    if len(audio.shape) == 2:
+        audio = audio[:, 0]
+    audio = torch.tensor(audio, device='cuda').unsqueeze(0).unsqueeze(0)
+    audio = torch.nn.functional.interpolate(audio, scale_factor=hp.sampling_rate/sr, mode='nearest').squeeze(1)
+    stft = TacotronSTFT(hp.filter_length, hp.hop_length, hp.win_length, hp.n_mel_channels, hp.sampling_rate, hp.mel_fmin, hp.mel_fmax).to('cuda')
+    mels = stft.mel_spectrogram(audio)
+
+    with torch.no_grad():
+        sentence_number = 0
+        last_detection_start = 0
+        start = 0
+        clip_size = model.MAX_MEL_FRAMES
+        while start+clip_size < mels.shape[-1]:
+            clip = mels[:, :, start:start+clip_size]
+            preds = torch.nn.functional.sigmoid(model(clip)).squeeze(-1).squeeze(0)  # Squeeze off the batch and sigmoid dimensions, leaving only the sequence dimension.
+            indices = torch.nonzero(preds > cutoff_pred_percent)
+            for i in indices:
+                i = i.item()
+                sentence = mels[0, :, last_detection_start:start+i]
+                if sentence.shape[-1] > 400 and sentence.shape[-1] < 1600:
+                    save_mel_buffer_to_file(sentence, f'{sentence_number}.npy')
+                    wav = vocoder.transform_mel_to_audio(sentence)
+                    wavfile.write(f'{sentence_number}.wav', 22050, wav[0].cpu().numpy())
+                sentence_number += 1
+                last_detection_start = start+i
+            start += 4
+            if last_detection_start > start:
+                start = last_detection_start
diff --git a/codes/scripts/audio/use_vocoder.py b/codes/scripts/audio/use_vocoder.py
new file mode 100644
index 00000000..fecda155
--- /dev/null
+++ b/codes/scripts/audio/use_vocoder.py
@@ -0,0 +1,28 @@
+import numpy
+import torch
+from scipy.io import wavfile
+
+from models.waveglow.waveglow import WaveGlow
+
+
+class Vocoder:
+    def __init__(self):
+        self.model = WaveGlow(n_mel_channels=80, n_flows=12, n_group=8, n_early_size=2, n_early_every=4, WN_config={'n_layers': 8, 'n_channels': 256, 'kernel_size': 3})
+        sd = torch.load('../experiments/waveglow_256channels_universal_v5.pth')
+        self.model.load_state_dict(sd)
+        self.model = self.model.to('cuda')
+        self.model.eval()
+
+    def transform_mel_to_audio(self, mel):
+        if len(mel.shape) == 2:  # Assume it's missing the batch dimension and fix that.
+            mel = mel.unsqueeze(0)
+        with torch.no_grad():
+            return self.model.infer(mel)
+
+
+if __name__ == '__main__':
+    inp = '3.npy'
+    mel = torch.tensor(numpy.load(inp)).to('cuda')
+    vocoder = Vocoder()
+    wav = vocoder.transform_mel_to_audio(mel)
+    wavfile.write(f'{inp}.wav', 22050, wav[0].cpu().numpy())
\ No newline at end of file
diff --git a/codes/train.py b/codes/train.py
index 7b2da295..f129db54 100644
--- a/codes/train.py
+++ b/codes/train.py
@@ -282,7 +282,7 @@ class Trainer:
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_gpt_stop_libritts.yml')
+    parser.add_argument('-opt', type=str, help='Path to option YAML file.', default='../options/train_lrdvae_audio_clips.yml')
     parser.add_argument('--launcher', choices=['none', 'pytorch'], default='none', help='job launcher')
     parser.add_argument('--local_rank', type=int, default=0)
     args = parser.parse_args()