forked from mrq/DL-Art-School
more stuff
This commit is contained in:
parent
d4218d8443
commit
45804177b8
codes
data/audio
models/audio/tts
trainer/injectors
|
@ -260,7 +260,7 @@ class FastPairedVoiceDebugger:
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
batch_sz = 256
|
batch_sz = 16
|
||||||
params = {
|
params = {
|
||||||
'mode': 'fast_paired_voice_audio',
|
'mode': 'fast_paired_voice_audio',
|
||||||
'path': ['y:/libritts/train-other-500/transcribed-oco.tsv',
|
'path': ['y:/libritts/train-other-500/transcribed-oco.tsv',
|
||||||
|
@ -268,20 +268,19 @@ if __name__ == '__main__':
|
||||||
'y:/libritts/train-clean-360/transcribed-oco.tsv',
|
'y:/libritts/train-clean-360/transcribed-oco.tsv',
|
||||||
'y:/clips/books1/transcribed-w2v.tsv',
|
'y:/clips/books1/transcribed-w2v.tsv',
|
||||||
'y:/clips/books2/transcribed-w2v.tsv',
|
'y:/clips/books2/transcribed-w2v.tsv',
|
||||||
'y:/bigasr_dataset/hifi_tts/transcribed-w2v.tsv'],
|
'y:/bigasr_dataset/hifi_tts/transcribed-w2v.tsv',
|
||||||
|
'y:/clips/podcasts-1/transcribed-oco.tsv',],
|
||||||
'phase': 'train',
|
'phase': 'train',
|
||||||
'n_workers': 0,
|
'n_workers': 0,
|
||||||
'batch_size': batch_sz,
|
'batch_size': batch_sz,
|
||||||
'max_wav_length': 163840,
|
'max_wav_length': 220500,
|
||||||
'max_text_length': 200,
|
'max_text_length': 500,
|
||||||
'sample_rate': 22050,
|
'sample_rate': 22050,
|
||||||
'load_conditioning': True,
|
'load_conditioning': True,
|
||||||
'num_conditioning_candidates': 1,
|
'num_conditioning_candidates': 2,
|
||||||
'conditioning_length': 66000,
|
'conditioning_length': 102400,
|
||||||
'use_bpe_tokenizer': False,
|
'use_bpe_tokenizer': True,
|
||||||
'load_aligned_codes': False,
|
'load_aligned_codes': False,
|
||||||
'needs_collate': False,
|
|
||||||
'produce_ctc_metadata': False,
|
|
||||||
}
|
}
|
||||||
from data import create_dataset, create_dataloader
|
from data import create_dataset, create_dataloader
|
||||||
|
|
||||||
|
@ -302,6 +301,8 @@ if __name__ == '__main__':
|
||||||
#max_repeats = max(max_repeats, b['ctc_repeats'].max())
|
#max_repeats = max(max_repeats, b['ctc_repeats'].max())
|
||||||
print(f'{i} {ib} {b["real_text"][ib]}')
|
print(f'{i} {ib} {b["real_text"][ib]}')
|
||||||
save(b, i, ib, 'wav')
|
save(b, i, ib, 'wav')
|
||||||
|
save(b, i, ib, 'conditioning', 0)
|
||||||
|
save(b, i, ib, 'conditioning', 1)
|
||||||
pass
|
pass
|
||||||
if i > 15:
|
if i > 15:
|
||||||
break
|
break
|
||||||
|
|
|
@ -4,12 +4,9 @@ import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
from torch import autocast
|
from torch import autocast
|
||||||
from x_transformers import Encoder
|
|
||||||
from x_transformers.x_transformers import RelativePositionBias
|
|
||||||
|
|
||||||
from models.diffusion.nn import timestep_embedding, normalization, zero_module, conv_nd, linear
|
from models.diffusion.nn import timestep_embedding, normalization, zero_module, conv_nd, linear
|
||||||
from models.diffusion.unet_diffusion import AttentionBlock, TimestepEmbedSequential, TimestepBlock
|
from models.diffusion.unet_diffusion import AttentionBlock, TimestepEmbedSequential, TimestepBlock
|
||||||
from models.audio.tts.mini_encoder import AudioMiniEncoder
|
|
||||||
from trainer.networks import register_model
|
from trainer.networks import register_model
|
||||||
from utils.util import checkpoint
|
from utils.util import checkpoint
|
||||||
|
|
||||||
|
@ -189,7 +186,7 @@ class DiffusionTtsFlat(nn.Module):
|
||||||
}
|
}
|
||||||
return groups
|
return groups
|
||||||
|
|
||||||
def forward(self, x, timesteps, aligned_conditioning, conditioning_input, lr_input=None, conditioning_free=False):
|
def forward(self, x, timesteps, aligned_conditioning, conditioning_input, conditioning_free=False):
|
||||||
"""
|
"""
|
||||||
Apply the model to an input batch.
|
Apply the model to an input batch.
|
||||||
|
|
||||||
|
@ -197,7 +194,6 @@ class DiffusionTtsFlat(nn.Module):
|
||||||
:param timesteps: a 1-D batch of timesteps.
|
:param timesteps: a 1-D batch of timesteps.
|
||||||
:param aligned_conditioning: an aligned latent or sequence of tokens providing useful data about the sample to be produced.
|
:param aligned_conditioning: an aligned latent or sequence of tokens providing useful data about the sample to be produced.
|
||||||
:param conditioning_input: a full-resolution audio clip that is used as a reference to the style you want decoded.
|
:param conditioning_input: a full-resolution audio clip that is used as a reference to the style you want decoded.
|
||||||
:param lr_input: for super-sampling models, a guidance audio clip at a lower sampling rate.
|
|
||||||
:param conditioning_free: When set, all conditioning inputs (including tokens and conditioning_input) will not be considered.
|
:param conditioning_free: When set, all conditioning inputs (including tokens and conditioning_input) will not be considered.
|
||||||
:return: an [N x C x ...] Tensor of outputs.
|
:return: an [N x C x ...] Tensor of outputs.
|
||||||
"""
|
"""
|
||||||
|
|
|
@ -132,7 +132,7 @@ class DiscreteTokenInjector(Injector):
|
||||||
super().__init__(opt, env)
|
super().__init__(opt, env)
|
||||||
cfg = opt_get(opt, ['dvae_config'], "../experiments/train_diffusion_vocoder_22k_level.yml")
|
cfg = opt_get(opt, ['dvae_config'], "../experiments/train_diffusion_vocoder_22k_level.yml")
|
||||||
dvae_name = opt_get(opt, ['dvae_name'], 'dvae')
|
dvae_name = opt_get(opt, ['dvae_name'], 'dvae')
|
||||||
self.dvae = load_model_from_config(cfg, dvae_name, device=env['device']).eval()
|
self.dvae = load_model_from_config(cfg, dvae_name, device=f'cuda:{env["device"]}').eval()
|
||||||
|
|
||||||
def forward(self, state):
|
def forward(self, state):
|
||||||
inp = state[self.input]
|
inp = state[self.input]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user