2021-07-06 17:11:35 +00:00
import os
import random
2021-08-11 19:34:31 +00:00
import audio2numpy
2021-07-06 17:11:35 +00:00
import numpy as np
import torch
import torch . utils . data
2021-08-12 21:44:55 +00:00
import torch . nn . functional as F
2021-07-28 02:33:30 +00:00
from tqdm import tqdm
2021-07-06 17:11:35 +00:00
import models . tacotron2 . layers as layers
from models . tacotron2 . taco_utils import load_wav_to_torch , load_filepaths_and_text
from models . tacotron2 . text import text_to_sequence
2021-07-28 02:33:30 +00:00
from utils . util import opt_get
2021-07-06 17:11:35 +00:00
2021-08-11 19:34:31 +00:00
def load_mozilla_cv(filename):
    """Load (audio path, transcript) pairs from a Mozilla Common Voice TSV index.

    The first line of the file is treated as a header and skipped. For every
    remaining non-blank row, the second column is used as the clip file name
    (resolved under a ``clips`` directory located next to ``filename``) and
    the third column is used as the transcript.

    Args:
        filename: Path to the tab-separated index file.

    Returns:
        A list of ``[audio_path, text]`` lists, in file order.

    Raises:
        IndexError: If a non-blank row has fewer than three columns.
    """
    base = os.path.dirname(filename)
    filepaths_and_text = []
    with open(filename, encoding='utf-8') as f:
        next(f, None)  # First line is the header
        for line in f:
            line = line.strip()
            if not line:
                # Blank rows (typically a trailing newline) carry no sample;
                # splitting them would yield [''] and fail on column lookup.
                continue
            component = line.split('\t')
            filepaths_and_text.append([os.path.join(base, f'clips/{component[1]}'), component[2]])
    return filepaths_and_text
2021-08-16 23:13:40 +00:00
def load_voxpopuli(filename):
    """Load (audio path, raw transcript) pairs from a VoxPopuli TSV index.

    The first line of the file is treated as a header and skipped. Each
    remaining non-blank row supplies a file id in its first column and the raw
    transcript in its second column. The audio path is built as
    ``<dir of filename>/<first 4 chars of id>/<id>.ogg``.

    Only the first two columns are read, so rows whose trailing columns are
    empty (and therefore removed by ``strip()``) are still accepted.

    Args:
        filename: Path to the tab-separated index file.

    Returns:
        A list of ``[audio_path, raw_text]`` lists, in file order.

    Raises:
        ValueError: If a non-blank row has fewer than two columns.
    """
    base = os.path.dirname(filename)
    filepaths_and_text = []
    with open(filename, encoding='utf-8') as f:
        next(f, None)  # First line is the header
        for line_no, line in enumerate(f, start=2):
            line = line.strip()
            if not line:
                # ''.split('\t') is [''], not [], so blank rows must be
                # detected before splitting.
                continue
            fields = line.split('\t')
            if len(fields) < 2:
                raise ValueError(f'{filename}:{line_no}: expected at least 2 tab-separated columns, got {len(fields)}')
            file_id, raw_text = fields[0], fields[1]
            year = file_id[:4]
            filepaths_and_text.append([os.path.join(base, year, f'{file_id}.ogg'), raw_text])
    return filepaths_and_text
2021-07-06 17:11:35 +00:00
class TextMelLoader(torch.utils.data.Dataset):
    """
        1) loads audio, text pairs
        2) normalizes text and converts it to integer symbol sequences
        3) computes mel-spectrograms from audio files (or returns a cached
           mel / the raw waveform, depending on configuration).
    """

    # Upper bound on how many samples __getitem__ examines before giving up
    # on finding one that passes the validity/length checks.
    _MAX_FETCH_ATTEMPTS = 1000

    def __init__(self, hparams):
        """Build the sample list and the STFT front-end from ``hparams``.

        ``hparams`` is read both by key (``hparams['path']``, ``opt_get``)
        and by attribute (``hparams.sampling_rate`` etc.), so it must support
        both access styles.

        Options read: ``path`` and ``fetcher_mode`` (a single value or
        equal-length lists), ``text_cleaners``, ``sampling_rate``,
        ``load_mel_from_disk``, ``return_wavs``, ``input_sample_rate``,
        the STFT settings (``filter_length``, ``hop_length``, ``win_length``,
        ``n_mel_channels``, ``mel_fmin``, ``mel_fmax``), ``seed``,
        ``max_mel_length``, ``max_text_length`` and ``needs_collate``.

        Raises:
            NotImplementedError: For an unrecognised ``fetcher_mode``.
        """
        self.path = hparams['path']
        if not isinstance(self.path, list):
            self.path = [self.path]

        fetcher_mode = opt_get(hparams, ['fetcher_mode'], 'lj')
        if not isinstance(fetcher_mode, list):
            fetcher_mode = [fetcher_mode]
        assert len(self.path) == len(fetcher_mode)

        self.audiopaths_and_text = []
        for p, fm in zip(self.path, fetcher_mode):
            if fm == 'lj' or fm == 'libritts':
                fetcher_fn = load_filepaths_and_text
            elif fm == 'mozilla_cv':
                fetcher_fn = load_mozilla_cv
            elif fm == 'voxpopuli':
                fetcher_fn = load_voxpopuli
            else:
                raise NotImplementedError(f'Unsupported fetcher_mode: {fm!r}')
            self.audiopaths_and_text.extend(fetcher_fn(p))

        self.text_cleaners = hparams.text_cleaners
        self.sampling_rate = hparams.sampling_rate
        self.load_mel_from_disk = opt_get(hparams, ['load_mel_from_disk'], False)
        self.return_wavs = opt_get(hparams, ['return_wavs'], False)
        self.input_sample_rate = opt_get(hparams, ['input_sample_rate'], self.sampling_rate)
        # A cached mel cannot be turned back into a waveform.
        assert not (self.load_mel_from_disk and self.return_wavs)
        self.stft = layers.TacotronSTFT(
            hparams.filter_length, hparams.hop_length, hparams.win_length,
            hparams.n_mel_channels, hparams.sampling_rate, hparams.mel_fmin,
            hparams.mel_fmax)
        # NOTE: this seeds the module-level `random` generator, which is also
        # the generator used for replacement sampling in __getitem__.
        random.seed(hparams.seed)
        random.shuffle(self.audiopaths_and_text)

        self.max_mel_len = opt_get(hparams, ['max_mel_length'], None)
        self.max_text_len = opt_get(hparams, ['max_text_length'], None)
        # If needs_collate=False, all outputs will be aligned and padded at maximum length.
        self.needs_collate = opt_get(hparams, ['needs_collate'], True)
        if not self.needs_collate:
            assert self.max_mel_len is not None and self.max_text_len is not None

    def get_mel_text_pair(self, audiopath_and_text):
        """Return ``(text_sequence, mel, raw_text, audio_path)`` for one entry.

        ``mel`` is whatever :meth:`get_mel` returns, including ``None`` for
        audio that was rejected.
        """
        # separate filename and text
        audiopath, text = audiopath_and_text[0], audiopath_and_text[1]
        text_seq = self.get_text(text)
        mel = self.get_mel(audiopath)
        return (text_seq, mel, text, audiopath)

    def _load_audio(self, filename):
        """Decode ``filename`` and return ``(audio_tensor, sampling_rate)``.

        The decoder is chosen from the file extension: ``.wav`` and ``.mp3``
        have dedicated loaders, everything else goes through ``audio2numpy``.
        """
        if filename.endswith('.wav'):
            return load_wav_to_torch(filename)
        if filename.endswith('.mp3'):
            # https://github.com/neonbjb/pyfastmp3decoder - Definitely worth it.
            from pyfastmp3decoder.mp3decoder import load_mp3
            audio, sampling_rate = load_mp3(filename, self.input_sample_rate)
            return torch.FloatTensor(audio), sampling_rate
        audio, sampling_rate = audio2numpy.audio_from_file(filename)
        return torch.tensor(audio), sampling_rate

    def get_mel(self, filename):
        """Produce the acoustic features for ``filename``.

        If ``load_mel_from_disk`` is set and ``<filename>_mel.npy`` exists,
        that array is loaded and returned. Otherwise the audio is decoded,
        resampled to ``input_sample_rate`` if needed, clipped to [-1, 1],
        resampled again to ``sampling_rate`` if that differs, and then either
        returned as a waveform (``return_wavs``) or converted to a mel
        spectrogram.

        Returns:
            A tensor, or ``None`` when the audio's standard deviation exceeds 1
            (treated as a corrupt or badly scaled file).
        """
        cached_mel_path = f'{filename}_mel.npy'
        if self.load_mel_from_disk and os.path.exists(cached_mel_path):
            melspec = torch.from_numpy(np.load(cached_mel_path))
            assert melspec.size(0) == self.stft.n_mel_channels, (
                'Mel dimension mismatch: given {}, expected {}'.format(melspec.size(0), self.stft.n_mel_channels))
            return melspec

        audio, sampling_rate = self._load_audio(filename)

        if sampling_rate != self.input_sample_rate:
            if sampling_rate < self.input_sample_rate:
                print(f'{filename} has a sample rate of {sampling_rate} which is lower than the requested sample rate of {self.input_sample_rate}. This is not a good idea.')
            audio_norm = torch.nn.functional.interpolate(
                audio.unsqueeze(0).unsqueeze(1),
                scale_factor=self.input_sample_rate / sampling_rate,
                mode='nearest', recompute_scale_factor=False).squeeze()
        else:
            audio_norm = audio

        std_dev = audio_norm.std()
        if std_dev > 1:
            print(f"Something is very wrong with the given audio. std_dev={std_dev}. file={filename}")
            return None

        audio_norm.clip_(-1, 1)
        # Add a leading dimension for the STFT / interpolation calls below.
        # (The former torch.autograd.Variable wrapper was a deprecated no-op
        # and has been dropped.)
        audio_norm = audio_norm.unsqueeze(0)
        if self.input_sample_rate != self.sampling_rate:
            ratio = self.sampling_rate / self.input_sample_rate
            audio_norm = torch.nn.functional.interpolate(audio_norm.unsqueeze(0), scale_factor=ratio, mode='area').squeeze(0)

        if self.return_wavs:
            return audio_norm
        melspec = self.stft.mel_spectrogram(audio_norm)
        return torch.squeeze(melspec, 0)

    def get_text(self, text):
        """Convert ``text`` to an ``IntTensor`` using ``text_to_sequence``."""
        text_norm = torch.IntTensor(text_to_sequence(text, self.text_cleaners))
        return text_norm

    def _is_usable(self, tseq, mel):
        """Return True if the sample loaded and fits the configured length limits."""
        if mel is None:
            return False
        if self.max_mel_len is not None and mel.shape[-1] > self.max_mel_len:
            return False
        if self.max_text_len is not None and tseq.shape[0] > self.max_text_len:
            return False
        return True

    def __getitem__(self, index):
        """Return the sample at ``index``, or a random replacement if it is unusable.

        With ``needs_collate`` enabled the result is the tuple
        ``(text_sequence, mel, path, raw_text)``. Otherwise text and mel are
        right-padded with zeros to the configured maximum lengths and returned
        in a dict together with their original lengths.

        Raises:
            RuntimeError: If no usable sample is found within
                ``_MAX_FETCH_ATTEMPTS`` tries.
        """
        for _ in range(self._MAX_FETCH_ATTEMPTS):
            tseq, mel, text, path = self.get_mel_text_pair(self.audiopaths_and_text[index])
            if self._is_usable(tseq, mel):
                break
            # It's hard to handle this situation properly. Best bet is to return a random valid
            # sample and skew the dataset somewhat as a result. A bounded loop is used instead
            # of recursion so a badly configured dataset fails with a clear error rather than
            # a deep RecursionError.
            index = random.randint(0, len(self) - 1)
        else:
            raise RuntimeError(
                f'No usable sample found after {self._MAX_FETCH_ATTEMPTS} attempts; '
                'check the audio files and the max_mel_length/max_text_length settings.')

        orig_output = mel.shape[-1]
        orig_text_len = tseq.shape[0]
        if not self.needs_collate:
            if mel.shape[-1] != self.max_mel_len:
                mel = F.pad(mel, (0, self.max_mel_len - mel.shape[-1]))
            if tseq.shape[0] != self.max_text_len:
                tseq = F.pad(tseq, (0, self.max_text_len - tseq.shape[0]))
            return {
                'real_text': text,
                'padded_text': tseq,
                'input_lengths': torch.tensor(orig_text_len, dtype=torch.long),
                'padded_mel': mel,
                'output_lengths': torch.tensor(orig_output, dtype=torch.long),
                'filenames': path
            }
        return tseq, mel, path, text

    def __len__(self):
        """Number of (audio path, text) entries loaded."""
        return len(self.audiopaths_and_text)
class TextMelCollate():
    """Collate function that zero-pads text and mel tensors to a common size.

    Mel lengths are rounded up to a multiple of ``n_frames_per_step``.
    """

    def __init__(self, n_frames_per_step):
        self.n_frames_per_step = n_frames_per_step

    def __call__(self, batch):
        """Assemble one training batch, ordered by decreasing text length.

        PARAMS
        ------
        batch: sequence of samples indexed as
            [text_normalized, mel_normalized, filename, raw_text]

        RETURNS
        -------
        dict with 'padded_text', 'input_lengths', 'padded_mel', 'padded_gate',
        'output_lengths', 'filenames' and 'real_text'.
        """
        batch_size = len(batch)

        # Order samples by text length, longest first.
        text_lengths = torch.LongTensor([len(sample[0]) for sample in batch])
        input_lengths, ids_sorted_decreasing = torch.sort(text_lengths, dim=0, descending=True)
        ordered = [batch[idx] for idx in ids_sorted_decreasing.tolist()]

        # Right zero-pad all text sequences to the longest one.
        max_input_len = int(input_lengths[0])
        text_padded = torch.zeros(batch_size, max_input_len, dtype=torch.long)
        for row, sample in enumerate(ordered):
            tokens = sample[0]
            text_padded[row, :tokens.size(0)] = tokens
        filenames = [sample[2] for sample in ordered]
        real_text = [sample[3] for sample in ordered]

        # Right zero-pad mel-spec, rounding the target length up to a whole
        # number of decoder steps.
        num_mels = batch[0][1].size(0)
        step = self.n_frames_per_step
        max_target_len = max(sample[1].size(1) for sample in batch)
        remainder = max_target_len % step
        if remainder != 0:
            max_target_len += step - remainder
            assert max_target_len % step == 0

        # include mel padded and gate padded
        mel_padded = torch.zeros(batch_size, num_mels, max_target_len, dtype=torch.float32)
        gate_padded = torch.zeros(batch_size, max_target_len, dtype=torch.float32)
        output_lengths = torch.zeros(batch_size, dtype=torch.long)
        for row, sample in enumerate(ordered):
            mel = sample[1]
            n_frames = mel.size(1)
            mel_padded[row, :, :n_frames] = mel
            # The gate is 1 from the last real frame onwards.
            gate_padded[row, n_frames - 1:] = 1
            output_lengths[row] = n_frames

        return {
            'padded_text': text_padded,
            'input_lengths': input_lengths,
            'padded_mel': mel_padded,
            'padded_gate': gate_padded,
            'output_lengths': output_lengths,
            'filenames': filenames,
            'real_text': real_text,
        }
2021-07-06 17:11:35 +00:00
2021-08-14 04:46:41 +00:00
def save_mel_buffer_to_file(mel, path):
    """Write the tensor ``mel`` to ``path`` in NumPy ``.npy`` format.

    The tensor is detached and moved to the CPU first, so tensors that live on
    another device or that require grad can be saved as well.

    Args:
        mel: Tensor to store.
        path: Destination passed to ``np.save``.
    """
    np.save(path, mel.detach().cpu().numpy())
2021-08-14 04:46:41 +00:00
2021-08-14 00:35:55 +00:00
def dump_mels_to_disk():
    """Pre-compute mels for the configured dataset and cache them next to the audio.

    Every sample produced by the data loader is written to
    ``<audio filename>_mel.npy``, the file name ``TextMelLoader.get_mel``
    looks for when ``load_mel_from_disk`` is enabled.

    The dataset location and options are hard-coded in ``params`` below.
    """
    params = {
        'mode': 'nv_tacotron',
        'path': ['Z:\\mozcv\\en\\train.tsv'],
        'fetcher_mode': ['mozilla_cv'],
        'phase': 'train',
        'n_workers': 8,
        'batch_size': 1,
        'needs_collate': True,
        'max_mel_length': 10000,
        'max_text_length': 1000,
    }
    from data import create_dataset, create_dataloader

    ds, c = create_dataset(params, return_collate=True)
    dl = create_dataloader(ds, params, collate_fn=c)
    for b in tqdm(dl):
        mels = b['padded_mel']
        fnames = b['filenames']
        # When the batch reports true lengths, drop the collate padding so the
        # cached mel contains only real frames.
        lengths = b.get('output_lengths')
        for j, fname in enumerate(fnames):
            mel = mels[j]
            if lengths is not None:
                mel = mel[..., :int(lengths[j])]
            save_mel_buffer_to_file(mel, f'{fname}_mel.npy')
2021-08-14 00:35:55 +00:00
2021-07-06 17:11:35 +00:00
if __name__ == '__main__':
    # Running this module as a script pre-computes and caches mel spectrograms
    # for the dataset configured inside dump_mels_to_disk().
    dump_mels_to_disk()