diff --git a/codes/data/__init__.py b/codes/data/__init__.py
index a33e6142..97246259 100644
--- a/codes/data/__init__.py
+++ b/codes/data/__init__.py
@@ -1,5 +1,4 @@
 """create dataset and dataloader"""
-import logging
 import torch
 import torch.utils.data
 from munch import munchify
@@ -64,7 +63,7 @@ def create_dataset(dataset_opt, return_collate=False):
     elif mode == 'nv_tacotron':
         from data.audio.nv_tacotron_dataset import TextWavLoader as D
         from data.audio.nv_tacotron_dataset import TextMelCollate as C
-        from models.tacotron2.hparams import create_hparams
+        from models.audio.tts.tacotron2 import create_hparams
         default_params = create_hparams()
         default_params.update(dataset_opt)
         dataset_opt = munchify(default_params)
@@ -72,13 +71,13 @@ def create_dataset(dataset_opt, return_collate=False):
         collate = C()
     elif mode == 'paired_voice_audio':
         from data.audio.paired_voice_audio_dataset import TextWavLoader as D
-        from models.tacotron2.hparams import create_hparams
+        from models.audio.tts.tacotron2 import create_hparams
         default_params = create_hparams()
         default_params.update(dataset_opt)
         dataset_opt = munchify(default_params)
     elif mode == 'fast_paired_voice_audio':
         from data.audio.fast_paired_dataset import FastPairedVoiceDataset as D
-        from models.tacotron2.hparams import create_hparams
+        from models.audio.tts.tacotron2 import create_hparams
         default_params = create_hparams()
         default_params.update(dataset_opt)
         dataset_opt = munchify(default_params)
diff --git a/codes/data/audio/gpt_tts_dataset.py b/codes/data/audio/gpt_tts_dataset.py
index 24a7cee0..a872fd67 100644
--- a/codes/data/audio/gpt_tts_dataset.py
+++ b/codes/data/audio/gpt_tts_dataset.py
@@ -6,9 +6,9 @@ import torch.utils.data
 from torch import LongTensor
 from tqdm import tqdm
 
-from models.tacotron2.taco_utils import load_filepaths_and_text
-from models.tacotron2.text import symbols
-from models.tacotron2.text import text_to_sequence
+from models.audio.tts.tacotron2 import load_filepaths_and_text
+from models.audio.tts.tacotron2 import symbols
+from models.audio.tts.tacotron2 import text_to_sequence
 
 
 class GptTtsDataset(torch.utils.data.Dataset):
diff --git a/codes/data/audio/grand_conjoined_dataset.py b/codes/data/audio/grand_conjoined_dataset.py
index 45f4bb46..7e56b79d 100644
--- a/codes/data/audio/grand_conjoined_dataset.py
+++ b/codes/data/audio/grand_conjoined_dataset.py
@@ -1,7 +1,4 @@
 import os
-import os
-import random
-import shutil
 
 import torch
 import torch.nn.functional as F
@@ -9,19 +6,15 @@ import torch.utils.data
 import torchaudio
 from munch import munchify
 from tqdm import tqdm
-from transformers import GPT2TokenizerFast
 
-from data.audio.unsupervised_audio_dataset import load_audio, UnsupervisedAudioDataset
+from data.audio.unsupervised_audio_dataset import UnsupervisedAudioDataset
 from data.text.hf_datasets_wrapper import HfDataset
-from data.util import find_files_of_type, is_audio_file
-from models.tacotron2.taco_utils import load_filepaths_and_text
-from models.tacotron2.text import text_to_sequence
 from utils.util import opt_get
 
 
 def build_paired_voice_dataset(args):
     from data.audio.paired_voice_audio_dataset import TextWavLoader as D
-    from models.tacotron2.hparams import create_hparams
+    from models.audio.tts.tacotron2 import create_hparams
     default_params = create_hparams()
     default_params.update(args)
     dataset_opt = munchify(default_params)
diff --git a/codes/data/audio/nv_tacotron_dataset.py b/codes/data/audio/nv_tacotron_dataset.py
index b8adf4b7..f94268cb 100644
--- a/codes/data/audio/nv_tacotron_dataset.py
+++ b/codes/data/audio/nv_tacotron_dataset.py
@@ -1,5 +1,4 @@
 import os
-import os
 import random
 
 import torch
@@ -10,8 +9,8 @@ from tqdm import tqdm
 
 from data.audio.unsupervised_audio_dataset import load_audio
 from data.util import find_files_of_type, is_audio_file
-from models.tacotron2.taco_utils import load_filepaths_and_text
-from models.tacotron2.text import text_to_sequence
+from models.audio.tts.tacotron2 import load_filepaths_and_text
+from models.audio.tts.tacotron2 import text_to_sequence
 from utils.util import opt_get
 
 
diff --git a/codes/data/audio/paired_voice_audio_dataset.py b/codes/data/audio/paired_voice_audio_dataset.py
index 9cc0c4bf..e4dfd922 100644
--- a/codes/data/audio/paired_voice_audio_dataset.py
+++ b/codes/data/audio/paired_voice_audio_dataset.py
@@ -1,5 +1,4 @@
 import os
-import os
 import random
 import sys
 
@@ -10,8 +9,8 @@ import torchaudio
 from tqdm import tqdm
 
 from data.audio.unsupervised_audio_dataset import load_audio, load_similar_clips
-from models.tacotron2.taco_utils import load_filepaths_and_text
-from models.tacotron2.text import text_to_sequence, sequence_to_text
+from models.audio.tts.tacotron2 import load_filepaths_and_text
+from models.audio.tts.tacotron2 import text_to_sequence, sequence_to_text
 from utils.util import opt_get
 
 
diff --git a/codes/data/audio/unsupervised_audio_dataset.py b/codes/data/audio/unsupervised_audio_dataset.py
index ab65da62..a89a1c1e 100644
--- a/codes/data/audio/unsupervised_audio_dataset.py
+++ b/codes/data/audio/unsupervised_audio_dataset.py
@@ -1,8 +1,6 @@
 import os
-import pathlib
 import random
 import sys
-from warnings import warn
 
 import torch
 import torch.utils.data
@@ -11,9 +9,8 @@ import torchaudio
 from audio2numpy import open_audio
 from tqdm import tqdm
 
-from data.audio.wav_aug import WavAugmentor
-from data.util import find_files_of_type, is_wav_file, is_audio_file, load_paths_from_cache
-from models.tacotron2.taco_utils import load_wav_to_torch
+from data.util import find_files_of_type, is_audio_file, load_paths_from_cache
+from models.audio.tts.tacotron2 import load_wav_to_torch
 from utils.util import opt_get
 
 
@@ -189,7 +186,7 @@ if __name__ == '__main__':
         'extra_samples': 4,
         'resample_clip': True,
     }
 
-    from data import create_dataset, create_dataloader, util
+    from data import create_dataset, create_dataloader
     ds = create_dataset(params)
     dl = create_dataloader(ds, params)
diff --git a/codes/data/audio/voice_tokenizer.py b/codes/data/audio/voice_tokenizer.py
index 49b16dba..18188592 100644
--- a/codes/data/audio/voice_tokenizer.py
+++ b/codes/data/audio/voice_tokenizer.py
@@ -1,16 +1,14 @@
 import re
 
-import datasets
 import torch
 from tokenizers import Tokenizer
 from tokenizers.models import BPE
 from tokenizers.pre_tokenizers import Whitespace
-from tokenizers.processors import ByteLevel
 from tokenizers.trainers import BpeTrainer
 
 from data.audio.paired_voice_audio_dataset import load_mozilla_cv, load_voxpopuli, load_tsv
-from models.tacotron2.taco_utils import load_filepaths_and_text
-from models.tacotron2.text.cleaners import english_cleaners
+from models.audio.tts.tacotron2 import load_filepaths_and_text
+from models.audio.tts.tacotron2.text.cleaners import english_cleaners
 
 
 def remove_extraneous_punctuation(word):
diff --git a/codes/data/audio/wav_aug.py b/codes/data/audio/wav_aug.py
index 74723a44..e935aa2e 100644
--- a/codes/data/audio/wav_aug.py
+++ b/codes/data/audio/wav_aug.py
@@ -3,7 +3,7 @@
 import random
 import torch
 import torchaudio.sox_effects
-from models.tacotron2.taco_utils import load_wav_to_torch
+from models.audio.tts.tacotron2 import load_wav_to_torch
 
 
 # Returns random double on [l,h] as a string
diff --git a/codes/models/asr/__init__.py b/codes/models/audio/asr/__init__.py
similarity index 100%
rename from codes/models/asr/__init__.py
rename to codes/models/audio/asr/__init__.py
diff --git a/codes/models/gpt_voice/__init__.py b/codes/models/audio/tts/__init__.py
similarity index 100%
rename from codes/models/gpt_voice/__init__.py
rename to codes/models/audio/tts/__init__.py
diff --git a/codes/models/gpt_voice/ctc_code_generator.py b/codes/models/audio/tts/ctc_code_generator.py
similarity index 97%
rename from codes/models/gpt_voice/ctc_code_generator.py
rename to codes/models/audio/tts/ctc_code_generator.py
index 95136c67..653ffcbe 100644
--- a/codes/models/gpt_voice/ctc_code_generator.py
+++ b/codes/models/audio/tts/ctc_code_generator.py
@@ -3,11 +3,11 @@ import json
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from x_transformers import Encoder, XTransformer, TransformerWrapper
+from x_transformers import Encoder, TransformerWrapper
 
-from models.gpt_voice.unet_diffusion_tts6 import CheckpointedLayer
-from models.gpt_voice.unified_voice2 import ConditioningEncoder
-from models.tacotron2.text.cleaners import english_cleaners
+from models.audio.tts.unet_diffusion_tts6 import CheckpointedLayer
+from models.audio.tts.unified_voice2 import ConditioningEncoder
+from models.audio.tts.tacotron2.text.cleaners import english_cleaners
 from trainer.networks import register_model
 from utils.util import opt_get
 
diff --git a/codes/models/gpt_voice/ctc_code_generator2.py b/codes/models/audio/tts/ctc_code_generator2.py
similarity index 91%
rename from codes/models/gpt_voice/ctc_code_generator2.py
rename to codes/models/audio/tts/ctc_code_generator2.py
index 24fa37a1..694afaee 100644
--- a/codes/models/gpt_voice/ctc_code_generator2.py
+++ b/codes/models/audio/tts/ctc_code_generator2.py
@@ -4,17 +4,11 @@ import json
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch.nn import CrossEntropyLoss
-from transformers import T5Config, T5Model, T5PreTrainedModel, T5ForConditionalGeneration
-from transformers.file_utils import replace_return_docstrings
-from transformers.modeling_outputs import Seq2SeqLMOutput, BaseModelOutput
-from transformers.utils.model_parallel_utils import get_device_map, assert_device_map
-from x_transformers import Encoder, XTransformer
+from transformers import T5Config, T5ForConditionalGeneration
 
-from models.gpt_voice.transformer_builders import null_position_embeddings
-from models.gpt_voice.unet_diffusion_tts6 import CheckpointedLayer
-from models.gpt_voice.unified_voice2 import ConditioningEncoder
-from models.tacotron2.text.cleaners import english_cleaners
+from models.audio.tts.transformer_builders import null_position_embeddings
+from models.audio.tts.unified_voice2 import ConditioningEncoder
+from models.audio.tts.tacotron2.text.cleaners import english_cleaners
 from trainer.networks import register_model
 from utils.util import opt_get
 
@@ -146,7 +140,6 @@ def inf():
     model.load_state_dict(sd)
     raw_batch = torch.load('raw_batch.pth')
     with torch.no_grad():
-        from data.audio.unsupervised_audio_dataset import load_audio
         from scripts.audio.gen.speech_synthesis_utils import wav_to_mel
         ref_mel = torch.cat([wav_to_mel(raw_batch['conditioning'][0])[:, :, :256],
                              wav_to_mel(raw_batch['conditioning'][0])[:, :, :256]], dim=0).unsqueeze(0)
diff --git a/codes/models/gpt_voice/lucidrains_dvae.py b/codes/models/audio/tts/lucidrains_dvae.py
similarity index 100%
rename from codes/models/gpt_voice/lucidrains_dvae.py
rename to codes/models/audio/tts/lucidrains_dvae.py
diff --git a/codes/models/gpt_voice/mini_encoder.py b/codes/models/audio/tts/mini_encoder.py
similarity index 100%
rename from codes/models/gpt_voice/mini_encoder.py
rename to codes/models/audio/tts/mini_encoder.py
diff --git a/codes/models/tacotron2/LICENSE b/codes/models/audio/tts/tacotron2/LICENSE
similarity index 100%
rename from codes/models/tacotron2/LICENSE
rename to codes/models/audio/tts/tacotron2/LICENSE
diff --git a/codes/models/tacotron2/__init__.py b/codes/models/audio/tts/tacotron2/__init__.py
similarity index 100%
rename from codes/models/tacotron2/__init__.py
rename to codes/models/audio/tts/tacotron2/__init__.py
diff --git a/codes/models/tacotron2/audio_processing.py b/codes/models/audio/tts/tacotron2/audio_processing.py
similarity index 100%
rename from codes/models/tacotron2/audio_processing.py
rename to codes/models/audio/tts/tacotron2/audio_processing.py
diff --git a/codes/models/tacotron2/hparams.py b/codes/models/audio/tts/tacotron2/hparams.py
similarity index 98%
rename from codes/models/tacotron2/hparams.py
rename to codes/models/audio/tts/tacotron2/hparams.py
index eeae2649..b25df857 100644
--- a/codes/models/tacotron2/hparams.py
+++ b/codes/models/audio/tts/tacotron2/hparams.py
@@ -1,5 +1,5 @@
 #import tensorflow as tf
-from models.tacotron2.text import symbols
+from models.audio.tts.tacotron2.text import symbols
 
 
 def create_hparams(hparams_string=None, verbose=False):
diff --git a/codes/models/tacotron2/layers.py b/codes/models/audio/tts/tacotron2/layers.py
similarity index 93%
rename from codes/models/tacotron2/layers.py
rename to codes/models/audio/tts/tacotron2/layers.py
index baaa7fd6..8b69e0df 100644
--- a/codes/models/tacotron2/layers.py
+++ b/codes/models/audio/tts/tacotron2/layers.py
@@ -1,8 +1,8 @@
 import torch
 from librosa.filters import mel as librosa_mel_fn
-from models.tacotron2.audio_processing import dynamic_range_compression
-from models.tacotron2.audio_processing import dynamic_range_decompression
-from models.tacotron2.stft import STFT
+from models.audio.tts.tacotron2.audio_processing import dynamic_range_compression
+from models.audio.tts.tacotron2.audio_processing import dynamic_range_decompression
+from models.audio.tts.tacotron2.stft import STFT
 
 
 class LinearNorm(torch.nn.Module):
diff --git a/codes/models/tacotron2/loss.py b/codes/models/audio/tts/tacotron2/loss.py
similarity index 100%
rename from codes/models/tacotron2/loss.py
rename to codes/models/audio/tts/tacotron2/loss.py
diff --git a/codes/models/tacotron2/stft.py b/codes/models/audio/tts/tacotron2/stft.py
similarity index 98%
rename from codes/models/tacotron2/stft.py
rename to codes/models/audio/tts/tacotron2/stft.py
index fd7d14f5..f2c3186a 100644
--- a/codes/models/tacotron2/stft.py
+++ b/codes/models/audio/tts/tacotron2/stft.py
@@ -36,7 +36,7 @@ import torch.nn.functional as F
 from torch.autograd import Variable
 from scipy.signal import get_window
 from librosa.util import pad_center, tiny
-from models.tacotron2.audio_processing import window_sumsquare
+from models.audio.tts.tacotron2.audio_processing import window_sumsquare
 
 
 class STFT(torch.nn.Module):
diff --git a/codes/models/tacotron2/taco_utils.py b/codes/models/audio/tts/tacotron2/taco_utils.py
similarity index 100%
rename from codes/models/tacotron2/taco_utils.py
rename to codes/models/audio/tts/tacotron2/taco_utils.py
diff --git a/codes/models/tacotron2/tacotron2.py b/codes/models/audio/tts/tacotron2/tacotron2.py
similarity index 98%
rename from codes/models/tacotron2/tacotron2.py
rename to codes/models/audio/tts/tacotron2/tacotron2.py
index 2ea8b06e..73ecc754 100644
--- a/codes/models/tacotron2/tacotron2.py
+++ b/codes/models/audio/tts/tacotron2/tacotron2.py
@@ -4,11 +4,10 @@ from munch import munchify
 from torch.autograd import Variable
 from torch import nn
 from torch.nn import functional as F
-from models.tacotron2.layers import ConvNorm, LinearNorm
-from models.tacotron2.hparams import create_hparams
+from models.audio.tts.tacotron2.layers import ConvNorm, LinearNorm
+from models.audio.tts.tacotron2.hparams import create_hparams
 from trainer.networks import register_model
-from models.tacotron2.taco_utils import get_mask_from_lengths
-from utils.util import opt_get, checkpoint
+from models.audio.tts.tacotron2.taco_utils import get_mask_from_lengths
 
 
 class LocationLayer(nn.Module):
diff --git a/codes/models/tacotron2/text/LICENSE b/codes/models/audio/tts/tacotron2/text/LICENSE
similarity index 100%
rename from codes/models/tacotron2/text/LICENSE
rename to codes/models/audio/tts/tacotron2/text/LICENSE
diff --git a/codes/models/tacotron2/text/__init__.py b/codes/models/audio/tts/tacotron2/text/__init__.py
similarity index 95%
rename from codes/models/tacotron2/text/__init__.py
rename to codes/models/audio/tts/tacotron2/text/__init__.py
index 34c7c0bc..245432aa 100644
--- a/codes/models/tacotron2/text/__init__.py
+++ b/codes/models/audio/tts/tacotron2/text/__init__.py
@@ -3,8 +3,8 @@ import re
 
 import torch
 
-from models.tacotron2.text import cleaners
-from models.tacotron2.text.symbols import symbols
+from models.audio.tts.tacotron2.text import cleaners
+from models.audio.tts.tacotron2.text.symbols import symbols
 
 
 # Mappings from symbol to numeric ID and vice versa:
diff --git a/codes/models/tacotron2/text/cleaners.py b/codes/models/audio/tts/tacotron2/text/cleaners.py
similarity index 100%
rename from codes/models/tacotron2/text/cleaners.py
rename to codes/models/audio/tts/tacotron2/text/cleaners.py
diff --git a/codes/models/tacotron2/text/cmudict.py b/codes/models/audio/tts/tacotron2/text/cmudict.py
similarity index 100%
rename from codes/models/tacotron2/text/cmudict.py
rename to codes/models/audio/tts/tacotron2/text/cmudict.py
diff --git a/codes/models/tacotron2/text/numbers.py b/codes/models/audio/tts/tacotron2/text/numbers.py
similarity index 100%
rename from codes/models/tacotron2/text/numbers.py
rename to codes/models/audio/tts/tacotron2/text/numbers.py
diff --git a/codes/models/tacotron2/text/symbols.py b/codes/models/audio/tts/tacotron2/text/symbols.py
similarity index 93%
rename from codes/models/tacotron2/text/symbols.py
rename to codes/models/audio/tts/tacotron2/text/symbols.py
index a88227d1..071395be 100644
--- a/codes/models/tacotron2/text/symbols.py
+++ b/codes/models/audio/tts/tacotron2/text/symbols.py
@@ -4,7 +4,7 @@
 Defines the set of symbols used in text input to the model.
 
 The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. '''
-from models.tacotron2.text import cmudict
+from models.audio.tts.tacotron2.text import cmudict
 
 _pad = '_'
 _punctuation = '!\'(),.:;? '
diff --git a/codes/models/tacotron2/wave_tacotron.py b/codes/models/audio/tts/tacotron2/wave_tacotron.py
similarity index 97%
rename from codes/models/tacotron2/wave_tacotron.py
rename to codes/models/audio/tts/tacotron2/wave_tacotron.py
index 7a227529..4c02e05e 100644
--- a/codes/models/tacotron2/wave_tacotron.py
+++ b/codes/models/audio/tts/tacotron2/wave_tacotron.py
@@ -3,16 +3,16 @@
 import torch
 from munch import munchify
 from torch.autograd import Variable
 from torch import nn
-from torch.nn import functional as F, Flatten
+from torch.nn import functional as F
 
 from models.arch_util import ConvGnSilu
 from models.diffusion.unet_diffusion import UNetModel, AttentionPool2d
-from models.tacotron2.layers import ConvNorm, LinearNorm
-from models.tacotron2.hparams import create_hparams
-from models.tacotron2.tacotron2 import Prenet, Attention, Encoder
+from models.audio.tts.tacotron2.layers import LinearNorm
+from models.audio.tts.tacotron2.hparams import create_hparams
+from models.audio.tts.tacotron2.tacotron2 import Attention, Encoder
 from trainer.networks import register_model
-from models.tacotron2.taco_utils import get_mask_from_lengths
-from utils.util import opt_get, checkpoint
+from models.audio.tts.tacotron2.taco_utils import get_mask_from_lengths
+from utils.util import checkpoint
 
 
diff --git a/codes/models/gpt_voice/transformer_builders.py b/codes/models/audio/tts/transformer_builders.py
similarity index 100%
rename from codes/models/gpt_voice/transformer_builders.py
rename to codes/models/audio/tts/transformer_builders.py
diff --git a/codes/models/gpt_voice/unet_diffusion_tts5.py b/codes/models/audio/tts/unet_diffusion_tts5.py
similarity index 99%
rename from codes/models/gpt_voice/unet_diffusion_tts5.py
rename to codes/models/audio/tts/unet_diffusion_tts5.py
index 924118b9..0339877d 100644
--- a/codes/models/gpt_voice/unet_diffusion_tts5.py
+++ b/codes/models/audio/tts/unet_diffusion_tts5.py
@@ -5,12 +5,11 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import autocast
-from x_transformers.x_transformers import AbsolutePositionalEmbedding, AttentionLayers
 
 from models.diffusion.nn import timestep_embedding, normalization, zero_module, conv_nd, linear
 from models.diffusion.unet_diffusion import AttentionBlock, TimestepEmbedSequential, \
     Downsample, Upsample, TimestepBlock
-from models.gpt_voice.mini_encoder import AudioMiniEncoder
+from models.audio.tts.mini_encoder import AudioMiniEncoder
 from scripts.audio.gen.use_diffuse_tts import ceil_multiple
 from trainer.networks import register_model
 from utils.util import checkpoint
diff --git a/codes/models/gpt_voice/unet_diffusion_tts6.py b/codes/models/audio/tts/unet_diffusion_tts6.py
similarity index 99%
rename from codes/models/gpt_voice/unet_diffusion_tts6.py
rename to codes/models/audio/tts/unet_diffusion_tts6.py
index 6dd59254..c3d76625 100644
--- a/codes/models/gpt_voice/unet_diffusion_tts6.py
+++ b/codes/models/audio/tts/unet_diffusion_tts6.py
@@ -1,17 +1,15 @@
 import functools
 import random
-from collections import OrderedDict
 
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch import autocast
-from x_transformers.x_transformers import AbsolutePositionalEmbedding, AttentionLayers
 
 from models.diffusion.nn import timestep_embedding, normalization, zero_module, conv_nd, linear
 from models.diffusion.unet_diffusion import AttentionBlock, TimestepEmbedSequential, \
     Downsample, Upsample, TimestepBlock
-from models.gpt_voice.mini_encoder import AudioMiniEncoder
+from
models.audio.tts.mini_encoder import AudioMiniEncoder from scripts.audio.gen.use_diffuse_tts import ceil_multiple from trainer.networks import register_model from utils.util import checkpoint diff --git a/codes/models/gpt_voice/unet_diffusion_tts7.py b/codes/models/audio/tts/unet_diffusion_tts7.py similarity index 99% rename from codes/models/gpt_voice/unet_diffusion_tts7.py rename to codes/models/audio/tts/unet_diffusion_tts7.py index 73126a06..8075cef5 100644 --- a/codes/models/gpt_voice/unet_diffusion_tts7.py +++ b/codes/models/audio/tts/unet_diffusion_tts7.py @@ -1,17 +1,15 @@ import functools import random -from collections import OrderedDict import torch import torch.nn as nn import torch.nn.functional as F from torch import autocast -from x_transformers.x_transformers import AbsolutePositionalEmbedding, AttentionLayers, CrossAttender from models.diffusion.nn import timestep_embedding, normalization, zero_module, conv_nd, linear from models.diffusion.unet_diffusion import AttentionBlock, TimestepEmbedSequential, \ Downsample, Upsample, TimestepBlock -from models.gpt_voice.mini_encoder import AudioMiniEncoder +from models.audio.tts.mini_encoder import AudioMiniEncoder from scripts.audio.gen.use_diffuse_tts import ceil_multiple from trainer.networks import register_model from utils.util import checkpoint diff --git a/codes/models/gpt_voice/unet_diffusion_tts8.py b/codes/models/audio/tts/unet_diffusion_tts8.py similarity index 97% rename from codes/models/gpt_voice/unet_diffusion_tts8.py rename to codes/models/audio/tts/unet_diffusion_tts8.py index 80fa39f3..13c46246 100644 --- a/codes/models/gpt_voice/unet_diffusion_tts8.py +++ b/codes/models/audio/tts/unet_diffusion_tts8.py @@ -1,20 +1,17 @@ import functools import random -from collections import OrderedDict import torch import torch.nn as nn import torch.nn.functional as F from torch import autocast -from x_transformers.x_transformers import AbsolutePositionalEmbedding, AttentionLayers, CrossAttender from models.diffusion.nn import timestep_embedding, normalization, zero_module, conv_nd, linear -from models.diffusion.unet_diffusion import AttentionBlock, TimestepEmbedSequential, \ - Downsample, Upsample, TimestepBlock -from models.gpt_voice.mini_encoder import AudioMiniEncoder +from models.diffusion.unet_diffusion import TimestepEmbedSequential, \ + Downsample, Upsample +from models.audio.tts.mini_encoder import AudioMiniEncoder from scripts.audio.gen.use_diffuse_tts import ceil_multiple from trainer.networks import register_model -from utils.util import checkpoint from x_transformers import Encoder, ContinuousTransformerWrapper diff --git a/codes/models/gpt_voice/unet_diffusion_tts9.py b/codes/models/audio/tts/unet_diffusion_tts9.py similarity index 99% rename from codes/models/gpt_voice/unet_diffusion_tts9.py rename to codes/models/audio/tts/unet_diffusion_tts9.py index d0ac61fa..1eeca530 100644 --- a/codes/models/gpt_voice/unet_diffusion_tts9.py +++ b/codes/models/audio/tts/unet_diffusion_tts9.py @@ -1,6 +1,4 @@ -import functools import random -from collections import OrderedDict import torch import torch.nn as nn @@ -11,11 +9,11 @@ from x_transformers import Encoder from models.diffusion.nn import timestep_embedding, normalization, zero_module, conv_nd, linear from models.diffusion.unet_diffusion import AttentionBlock, TimestepEmbedSequential, \ Downsample, Upsample, TimestepBlock -from models.gpt_voice.mini_encoder import AudioMiniEncoder -from models.gpt_voice.unet_diffusion_tts7 import CheckpointedXTransformerEncoder +from 
models.audio.tts.mini_encoder import AudioMiniEncoder +from models.audio.tts.unet_diffusion_tts7 import CheckpointedXTransformerEncoder from scripts.audio.gen.use_diffuse_tts import ceil_multiple from trainer.networks import register_model -from utils.util import checkpoint, opt_get +from utils.util import checkpoint def is_latent(t): diff --git a/codes/models/gpt_voice/unet_diffusion_vocoder.py b/codes/models/audio/tts/unet_diffusion_vocoder.py similarity index 100% rename from codes/models/gpt_voice/unet_diffusion_vocoder.py rename to codes/models/audio/tts/unet_diffusion_vocoder.py diff --git a/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py b/codes/models/audio/tts/unet_diffusion_vocoder_with_ref.py similarity index 98% rename from codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py rename to codes/models/audio/tts/unet_diffusion_vocoder_with_ref.py index 4dbc8b96..66dfd52b 100644 --- a/codes/models/gpt_voice/unet_diffusion_vocoder_with_ref.py +++ b/codes/models/audio/tts/unet_diffusion_vocoder_with_ref.py @@ -1,13 +1,11 @@ -from models.diffusion.fp16_util import convert_module_to_f32, convert_module_to_f16 from models.diffusion.nn import timestep_embedding, normalization, zero_module, conv_nd, linear -from models.diffusion.unet_diffusion import AttentionPool2d, AttentionBlock, ResBlock, TimestepEmbedSequential, \ +from models.diffusion.unet_diffusion import AttentionBlock, ResBlock, TimestepEmbedSequential, \ Downsample, Upsample import torch import torch.nn as nn -from models.gpt_voice.mini_encoder import AudioMiniEncoder, EmbeddingCombiner +from models.audio.tts.mini_encoder import AudioMiniEncoder from trainer.networks import register_model -from utils.util import get_mask_from_lengths class DiscreteSpectrogramConditioningBlock(nn.Module): diff --git a/codes/models/gpt_voice/unified_voice2.py b/codes/models/audio/tts/unified_voice2.py similarity index 99% rename from codes/models/gpt_voice/unified_voice2.py rename to codes/models/audio/tts/unified_voice2.py index 10fb0483..f4a3deec 100644 --- a/codes/models/gpt_voice/unified_voice2.py +++ b/codes/models/audio/tts/unified_voice2.py @@ -1,15 +1,12 @@ -import functools - import torch import torch.nn as nn import torch.nn.functional as F -from transformers import GPT2Model, GPT2Config, GPT2PreTrainedModel +from transformers import GPT2Config, GPT2PreTrainedModel from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions from transformers.utils.model_parallel_utils import get_device_map, assert_device_map from models.arch_util import AttentionBlock -from models.gpt_voice.transformer_builders import build_hf_gpt_transformer -from models.tacotron2.text import symbols +from models.audio.tts.transformer_builders import build_hf_gpt_transformer from trainer.networks import register_model from utils.util import opt_get diff --git a/codes/models/gpt_voice/voice_voice_clip.py b/codes/models/audio/tts/voice_voice_clip.py similarity index 96% rename from codes/models/gpt_voice/voice_voice_clip.py rename to codes/models/audio/tts/voice_voice_clip.py index 51b631d6..22fa2d80 100644 --- a/codes/models/gpt_voice/voice_voice_clip.py +++ b/codes/models/audio/tts/voice_voice_clip.py @@ -2,11 +2,9 @@ import random import torch import torch.nn as nn import torch.nn.functional as F -from einops import rearrange from torch import einsum -from models.gpt_voice.mini_encoder import AudioMiniEncoder -from models.lucidrains.dalle.transformer import Transformer +from models.audio.tts.mini_encoder import AudioMiniEncoder from 
trainer.injectors.spec_augment import spec_augment from trainer.networks import register_model from utils.util import opt_get diff --git a/codes/models/gpt_voice/w2v_matcher.py b/codes/models/audio/tts/w2v_matcher.py similarity index 82% rename from codes/models/gpt_voice/w2v_matcher.py rename to codes/models/audio/tts/w2v_matcher.py index bba17f82..a2d261a6 100644 --- a/codes/models/gpt_voice/w2v_matcher.py +++ b/codes/models/audio/tts/w2v_matcher.py @@ -5,7 +5,7 @@ import torch.nn as nn import torch.nn.functional as F from x_transformers import Encoder, Decoder, ContinuousTransformerWrapper -from models.gpt_voice.mini_encoder import AudioMiniEncoder +from models.audio.tts.mini_encoder import AudioMiniEncoder from trainer.networks import register_model @@ -135,6 +135,31 @@ class Wav2VecMatcher(nn.Module): return ce_loss, mse_loss + def find_matching_w2v_logit(self, key, w2v_logit_iterable): + pass + + def sample(self, text_tokens, conditioning_clip, w2v_logit_iterable, audio_clip_iterable): + text_emb = self.text_embedding(text_tokens) + cond_emb = self.conditioning_encoder(conditioning_clip) + enc_inputs = torch.cat([cond_emb.unsqueeze(1), text_emb], dim=1) + dec_context = self.encoder(enc_inputs) + dec_inputs = self.decoder_start_embedding + count = 0 + while count < 400: + dec_out = self.decoder(dec_inputs, context=dec_context) + + # Check if that was EOS. + l2 = F.mse_loss(dec_out[:,-1], self.decoder_stop_embedding) + if l2 < .1: # TODO: fix threshold. + break + + # Find a matching w2v logit from the given iterable. + matching_logit_index = self.find_matching_w2v_logit(dec_out[:,-1], w2v_logit_iterable) + matching_logit = w2v_logit_iterable[matching_logit_index] + dec_inputs = torch.cat([dec_inputs, self.w2v_value_encoder(matching_logit).unsqueeze(1)], dim=1) + produced_audio = torch.cat([produced_audio, audio_clip_iterable[matching_logit_index]], dim=-1) + return produced_audio + @register_model def register_w2v_matcher(opt_net, opt): diff --git a/codes/models/audio/vocoders/waveglow/denoiser.py b/codes/models/audio/vocoders/waveglow/denoiser.py index 600e1412..a9e05967 100644 --- a/codes/models/audio/vocoders/waveglow/denoiser.py +++ b/codes/models/audio/vocoders/waveglow/denoiser.py @@ -1,6 +1,6 @@ import sys -from models.tacotron2.stft import STFT +from models.audio.tts.tacotron2.stft import STFT sys.path.append('tacotron2') import torch diff --git a/codes/models/clip/text_cond_clip.py b/codes/models/clip/text_cond_clip.py index fbff7148..39085202 100644 --- a/codes/models/clip/text_cond_clip.py +++ b/codes/models/clip/text_cond_clip.py @@ -1,10 +1,9 @@ import torch import torch.nn as nn import torch.nn.functional as F -from einops import rearrange from torch import einsum -from models.gpt_voice.unified_voice2 import ConditioningEncoder +from models.audio.tts.unified_voice2 import ConditioningEncoder from models.lucidrains.dalle.transformer import Transformer from trainer.networks import register_model from utils.util import opt_get diff --git a/codes/models/spleeter/estimator.py b/codes/models/spleeter/estimator.py deleted file mode 100644 index 207c4989..00000000 --- a/codes/models/spleeter/estimator.py +++ /dev/null @@ -1,137 +0,0 @@ -import math - -import torch -import torch.nn.functional as F -from torch import nn -from torch import istft - -from .unet import UNet -from .util import tf2pytorch - - -def load_ckpt(model, ckpt): - state_dict = model.state_dict() - for k, v in ckpt.items(): - if k in state_dict: - target_shape = state_dict[k].shape - assert target_shape == 
v.shape - state_dict.update({k: torch.from_numpy(v)}) - else: - print('Ignore ', k) - - model.load_state_dict(state_dict) - return model - - -def pad_and_partition(tensor, T): - """ - pads zero and partition tensor into segments of length T - - Args: - tensor(Tensor): BxCxFxL - - Returns: - tensor of size (B*[L/T] x C x F x T) - """ - old_size = tensor.size(3) - new_size = math.ceil(old_size/T) * T - tensor = F.pad(tensor, [0, new_size - old_size]) - [b, c, t, f] = tensor.shape - split = new_size // T - return torch.cat(torch.split(tensor, T, dim=3), dim=0) - - -class Estimator(nn.Module): - def __init__(self, num_instrumments, checkpoint_path): - super(Estimator, self).__init__() - - # stft config - self.F = 1024 - self.T = 512 - self.win_length = 4096 - self.hop_length = 1024 - self.win = torch.hann_window(self.win_length) - - ckpts = tf2pytorch(checkpoint_path, num_instrumments) - - # filter - self.instruments = nn.ModuleList() - for i in range(num_instrumments): - print('Loading model for instrumment {}'.format(i)) - net = UNet(2) - ckpt = ckpts[i] - net = load_ckpt(net, ckpt) - net.eval() # change mode to eval - self.instruments.append(net) - - def compute_stft(self, wav): - """ - Computes stft feature from wav - - Args: - wav (Tensor): B x L - """ - - stft = torch.stft( - wav, self.win_length, hop_length=self.hop_length, window=self.win.to(wav.device)) - - # only keep freqs smaller than self.F - stft = stft[:, :self.F, :, :] - real = stft[:, :, :, 0] - im = stft[:, :, :, 1] - mag = torch.sqrt(real ** 2 + im ** 2) - - return stft, mag - - def inverse_stft(self, stft): - """Inverses stft to wave form""" - - pad = self.win_length // 2 + 1 - stft.size(1) - stft = F.pad(stft, (0, 0, 0, 0, 0, pad)) - wav = istft(stft, self.win_length, hop_length=self.hop_length, - window=self.win.to(stft.device)) - return wav.detach() - - def separate(self, wav): - """ - Separates stereo wav into different tracks corresponding to different instruments - - Args: - wav (tensor): B x L - """ - - # stft - B X F x L x 2 - # stft_mag - B X F x L - stft, stft_mag = self.compute_stft(wav) - - L = stft.size(2) - - stft_mag = stft_mag.unsqueeze(1).repeat(1,2,1,1) # B x 2 x F x T - stft_mag = pad_and_partition(stft_mag, self.T) # B x 2 x F x T - stft_mag = stft_mag.transpose(2, 3) # B x 2 x T x F - - # compute instruments' mask - masks = [] - for net in self.instruments: - mask = net(stft_mag) - masks.append(mask) - - # compute denominator - mask_sum = sum([m ** 2 for m in masks]) - mask_sum += 1e-10 - - wavs = [] - for mask in masks: - mask = (mask ** 2 + 1e-10/2)/(mask_sum) - mask = mask.transpose(2, 3) # B x 2 X F x T - - mask = torch.cat( - torch.split(mask, 1, dim=0), dim=3) - - mask = mask[:,0,:,:L].unsqueeze(-1) # 2 x F x L x 1 - stft_masked = stft * mask - wav_masked = self.inverse_stft(stft_masked) - - wavs.append(wav_masked) - - return wavs \ No newline at end of file diff --git a/codes/models/spleeter/separator.py b/codes/models/spleeter/separator.py deleted file mode 100644 index bda3e25e..00000000 --- a/codes/models/spleeter/separator.py +++ /dev/null @@ -1,32 +0,0 @@ -import torch -import torch.nn.functional as F - -from models.spleeter.estimator import Estimator - - -class Separator: - def __init__(self, model_path, input_sr=44100, device='cuda'): - self.model = Estimator(2, model_path).to(device) - self.device = device - self.input_sr = input_sr - - def separate(self, npwav, normalize=False): - if not isinstance(npwav, torch.Tensor): - assert len(npwav.shape) == 1 - wav = torch.tensor(npwav, 
device=self.device) - wav = wav.view(1,-1) - else: - assert len(npwav.shape) == 2 # Input should be BxL - wav = npwav.to(self.device) - - if normalize: - wav = wav / (wav.max() + 1e-8) - - # Spleeter expects audio input to be 44.1kHz. - wav = F.interpolate(wav.unsqueeze(1), mode='nearest', scale_factor=44100/self.input_sr).squeeze(1) - res = self.model.separate(wav) - res = [F.interpolate(r.unsqueeze(1), mode='nearest', scale_factor=self.input_sr/44100)[:,0] for r in res] - return { - 'vocals': res[0].cpu().numpy(), - 'accompaniment': res[1].cpu().numpy() - } diff --git a/codes/models/spleeter/unet.py b/codes/models/spleeter/unet.py deleted file mode 100644 index 21c36805..00000000 --- a/codes/models/spleeter/unet.py +++ /dev/null @@ -1,80 +0,0 @@ -import torch -from torch import nn - - -def down_block(in_filters, out_filters): - return nn.Conv2d(in_filters, out_filters, kernel_size=5, - stride=2, padding=2, - ), nn.Sequential( - nn.BatchNorm2d(out_filters, track_running_stats=True, eps=1e-3, momentum=0.01), - nn.LeakyReLU(0.2) - ) - - -def up_block(in_filters, out_filters, dropout=False): - layers = [ - nn.ConvTranspose2d(in_filters, out_filters, kernel_size=5, - stride=2, padding=2, output_padding=1 - ), - nn.ReLU(), - nn.BatchNorm2d(out_filters, track_running_stats=True, eps=1e-3, momentum=0.01) - ] - if dropout: - layers.append(nn.Dropout(0.5)) - - return nn.Sequential(*layers) - - -class UNet(nn.Module): - def __init__(self, in_channels=2): - super(UNet, self).__init__() - self.down1_conv, self.down1_act = down_block(in_channels, 16) - self.down2_conv, self.down2_act = down_block(16, 32) - self.down3_conv, self.down3_act = down_block(32, 64) - self.down4_conv, self.down4_act = down_block(64, 128) - self.down5_conv, self.down5_act = down_block(128, 256) - self.down6_conv, self.down6_act = down_block(256, 512) - - self.up1 = up_block(512, 256, dropout=True) - self.up2 = up_block(512, 128, dropout=True) - self.up3 = up_block(256, 64, dropout=True) - self.up4 = up_block(128, 32) - self.up5 = up_block(64, 16) - self.up6 = up_block(32, 1) - self.up7 = nn.Sequential( - nn.Conv2d(1, 2, kernel_size=4, dilation=2, padding=3), - nn.Sigmoid() - ) - - def forward(self, x): - d1_conv = self.down1_conv(x) - d1 = self.down1_act(d1_conv) - - d2_conv = self.down2_conv(d1) - d2 = self.down2_act(d2_conv) - - d3_conv = self.down3_conv(d2) - d3 = self.down3_act(d3_conv) - - d4_conv = self.down4_conv(d3) - d4 = self.down4_act(d4_conv) - - d5_conv = self.down5_conv(d4) - d5 = self.down5_act(d5_conv) - - d6_conv = self.down6_conv(d5) - d6 = self.down6_act(d6_conv) - - u1 = self.up1(d6) - u2 = self.up2(torch.cat([d5_conv, u1], axis=1)) - u3 = self.up3(torch.cat([d4_conv, u2], axis=1)) - u4 = self.up4(torch.cat([d3_conv, u3], axis=1)) - u5 = self.up5(torch.cat([d2_conv, u4], axis=1)) - u6 = self.up6(torch.cat([d1_conv, u5], axis=1)) - u7 = self.up7(u6) - return u7 * x - - -if __name__ == '__main__': - net = UNet(14) - print(net(torch.rand(1, 14, 20, 48)).shape) \ No newline at end of file diff --git a/codes/models/spleeter/util.py b/codes/models/spleeter/util.py deleted file mode 100644 index 036f5a74..00000000 --- a/codes/models/spleeter/util.py +++ /dev/null @@ -1,91 +0,0 @@ -import numpy as np -import tensorflow as tf - -from .unet import UNet - - -def tf2pytorch(checkpoint_path, num_instrumments): - tf_vars = {} - init_vars = tf.train.list_variables(checkpoint_path) - # print(init_vars) - for name, shape in init_vars: - try: - # print('Loading TF Weight {} with shape {}'.format(name, shape)) - data = 
tf.train.load_variable(checkpoint_path, name) - tf_vars[name] = data - except Exception as e: - print('Load error') - conv_idx = 0 - tconv_idx = 0 - bn_idx = 0 - outputs = [] - for i in range(num_instrumments): - output = {} - outputs.append(output) - - for j in range(1,7): - if conv_idx == 0: - conv_suffix = "" - else: - conv_suffix = "_" + str(conv_idx) - - if bn_idx == 0: - bn_suffix = "" - else: - bn_suffix = "_" + str(bn_idx) - - output['down{}_conv.weight'.format(j)] = np.transpose( - tf_vars["conv2d{}/kernel".format(conv_suffix)], (3, 2, 0, 1)) - # print('conv dtype: ',output['down{}.0.weight'.format(j)].dtype) - output['down{}_conv.bias'.format( - j)] = tf_vars["conv2d{}/bias".format(conv_suffix)] - - output['down{}_act.0.weight'.format( - j)] = tf_vars["batch_normalization{}/gamma".format(bn_suffix)] - output['down{}_act.0.bias'.format( - j)] = tf_vars["batch_normalization{}/beta".format(bn_suffix)] - output['down{}_act.0.running_mean'.format( - j)] = tf_vars['batch_normalization{}/moving_mean'.format(bn_suffix)] - output['down{}_act.0.running_var'.format( - j)] = tf_vars['batch_normalization{}/moving_variance'.format(bn_suffix)] - - conv_idx += 1 - bn_idx += 1 - - # up blocks - for j in range(1, 7): - if tconv_idx == 0: - tconv_suffix = "" - else: - tconv_suffix = "_" + str(tconv_idx) - - if bn_idx == 0: - bn_suffix = "" - else: - bn_suffix= "_" + str(bn_idx) - - output['up{}.0.weight'.format(j)] = np.transpose( - tf_vars["conv2d_transpose{}/kernel".format(tconv_suffix)], (3,2,0, 1)) - output['up{}.0.bias'.format( - j)] = tf_vars["conv2d_transpose{}/bias".format(tconv_suffix)] - output['up{}.2.weight'.format( - j)] = tf_vars["batch_normalization{}/gamma".format(bn_suffix)] - output['up{}.2.bias'.format( - j)] = tf_vars["batch_normalization{}/beta".format(bn_suffix)] - output['up{}.2.running_mean'.format( - j)] = tf_vars['batch_normalization{}/moving_mean'.format(bn_suffix)] - output['up{}.2.running_var'.format( - j)] = tf_vars['batch_normalization{}/moving_variance'.format(bn_suffix)] - tconv_idx += 1 - bn_idx += 1 - - if conv_idx == 0: - suffix = "" - else: - suffix = "_" + str(conv_idx) - output['up7.0.weight'] = np.transpose( - tf_vars['conv2d{}/kernel'.format(suffix)], (3, 2, 0, 1)) - output['up7.0.bias'] = tf_vars['conv2d{}/bias'.format(suffix)] - conv_idx += 1 - - return outputs \ No newline at end of file diff --git a/codes/scripts/audio/test_audio_segmentor.py b/codes/scripts/audio/test_audio_segmentor.py index 62c70897..2aca9edd 100644 --- a/codes/scripts/audio/test_audio_segmentor.py +++ b/codes/scripts/audio/test_audio_segmentor.py @@ -4,20 +4,17 @@ import random import argparse import audio2numpy -import torchvision from munch import munchify import utils import utils.options as option import utils.util as util from data.audio.nv_tacotron_dataset import save_mel_buffer_to_file -from models.tacotron2 import hparams -from models.tacotron2.layers import TacotronSTFT -from models.tacotron2.text import sequence_to_text +from models.audio.tts.tacotron2 import hparams +from models.audio.tts.tacotron2 import TacotronSTFT +from models.audio.tts.tacotron2 import sequence_to_text from scripts.audio.use_vocoder import Vocoder from trainer.ExtensibleTrainer import ExtensibleTrainer -from data import create_dataset, create_dataloader -from tqdm import tqdm import torch import numpy as np from scipy.io import wavfile diff --git a/codes/scripts/audio/test_audio_similarity.py b/codes/scripts/audio/test_audio_similarity.py index 297fb4e4..8c48a0e6 100644 --- 
a/codes/scripts/audio/test_audio_similarity.py +++ b/codes/scripts/audio/test_audio_similarity.py @@ -1,12 +1,11 @@ import os import torch -import torch.nn as nn import torch.nn.functional as F from data.util import is_wav_file, find_files_of_type -from models.audio_resnet import resnet34, resnet50 -from models.tacotron2.taco_utils import load_wav_to_torch +from models.audio_resnet import resnet50 +from models.audio.tts.tacotron2 import load_wav_to_torch from scripts.byol.byol_extract_wrapped_model import extract_byol_model_from_state_dict if __name__ == '__main__': diff --git a/codes/scripts/audio/test_audio_speech_recognition.py b/codes/scripts/audio/test_audio_speech_recognition.py index 2de85809..3eb28c30 100644 --- a/codes/scripts/audio/test_audio_speech_recognition.py +++ b/codes/scripts/audio/test_audio_speech_recognition.py @@ -3,12 +3,10 @@ import logging import random import argparse -import torchvision - import utils import utils.options as option import utils.util as util -from models.tacotron2.text import sequence_to_text +from models.audio.tts.tacotron2 import sequence_to_text from trainer.ExtensibleTrainer import ExtensibleTrainer from data import create_dataset, create_dataloader from tqdm import tqdm diff --git a/codes/scripts/diffusion/diffusion_noise_surfer.py b/codes/scripts/diffusion/diffusion_noise_surfer.py index a4ce40f6..fb1aab28 100644 --- a/codes/scripts/diffusion/diffusion_noise_surfer.py +++ b/codes/scripts/diffusion/diffusion_noise_surfer.py @@ -2,11 +2,8 @@ import os import os.path as osp import logging import random -import time import argparse -from collections import OrderedDict -import numpy from PIL import Image from scipy.io import wavfile from torchvision.transforms import ToTensor @@ -15,10 +12,7 @@ import utils import utils.options as option import utils.util as util from data.audio.unsupervised_audio_dataset import load_audio -from models.tacotron2.taco_utils import load_wav_to_torch from trainer.ExtensibleTrainer import ExtensibleTrainer -from data import create_dataset, create_dataloader -from tqdm import tqdm import torch import numpy as np diff --git a/codes/scripts/hugging_face_hub_upload.py b/codes/scripts/hugging_face_hub_upload.py index cadcce23..a0091899 100644 --- a/codes/scripts/hugging_face_hub_upload.py +++ b/codes/scripts/hugging_face_hub_upload.py @@ -1,10 +1,3 @@ -import json - -import torch - -from models.asr.w2v_wrapper import Wav2VecWrapper -from models.tacotron2.text import tacotron_symbol_mapping - if __name__ == '__main__': """ Utility script for uploading model weights to the HF hub diff --git a/codes/trainer/eval/audio_diffusion_fid.py b/codes/trainer/eval/audio_diffusion_fid.py index 0cd3c97e..2a663124 100644 --- a/codes/trainer/eval/audio_diffusion_fid.py +++ b/codes/trainer/eval/audio_diffusion_fid.py @@ -2,12 +2,10 @@ import os import os.path as osp import torch import torchaudio -import torchvision -from pytorch_fid import fid_score from pytorch_fid.fid_score import calculate_frechet_distance from torch import distributed from tqdm import tqdm -from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor +from transformers import Wav2Vec2ForCTC import torch.nn.functional as F import numpy as np @@ -15,7 +13,7 @@ import trainer.eval.evaluator as evaluator from data.audio.paired_voice_audio_dataset import load_tsv_aligned_codes from data.audio.unsupervised_audio_dataset import load_audio from models.clip.mel_text_clip import MelTextCLIP -from models.tacotron2.text import sequence_to_text, text_to_sequence +from 
models.audio.tts.tacotron2 import text_to_sequence from scripts.audio.gen.speech_synthesis_utils import load_discrete_vocoder_diffuser, wav_to_mel, load_speech_dvae, \ convert_mel_to_codes from utils.util import ceil_multiple, opt_get @@ -116,6 +114,27 @@ class AudioDiffusionFid(evaluator.Evaluator): 'unaligned_input': torch.tensor(text_codes, device=audio.device).unsqueeze(0)}) return gen, real_resampled, sample_rate + + def perform_diffusion_tts9_from_codes(self, audio, codes, text, sample_rate=5500): + mel = wav_to_mel(audio) + mel_codes = convert_mel_to_codes(self.dvae, mel) + text_codes = text_to_sequence(text) + real_resampled = torchaudio.functional.resample(audio, 22050, sample_rate).unsqueeze(0) + + output_size = real_resampled.shape[-1] + aligned_codes_compression_factor = output_size // mel_codes.shape[-1] + padded_size = ceil_multiple(output_size, 2048) + padding_added = padded_size - output_size + padding_needed_for_codes = padding_added // aligned_codes_compression_factor + if padding_needed_for_codes > 0: + mel_codes = F.pad(mel_codes, (0, padding_needed_for_codes)) + output_shape = (1, 1, padded_size) + gen = self.diffuser.p_sample_loop(self.model, output_shape, + model_kwargs={'tokens': mel_codes, + 'conditioning_input': audio.unsqueeze(0), + 'unaligned_input': torch.tensor(text_codes, device=audio.device).unsqueeze(0)}) + return gen, real_resampled, sample_rate + def load_projector(self): """ Builds the CLIP model used to project speech into a latent. This model has fixed parameters and a fixed loading diff --git a/codes/trainer/eval/eval_wer.py b/codes/trainer/eval/eval_wer.py index 8b027139..0c6f5f2f 100644 --- a/codes/trainer/eval/eval_wer.py +++ b/codes/trainer/eval/eval_wer.py @@ -4,12 +4,12 @@ from datasets import load_metric import torch from tqdm import tqdm -from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC +from transformers import Wav2Vec2Processor import trainer.eval.evaluator as evaluator from data import create_dataset, create_dataloader -from models.asr.w2v_wrapper import only_letters, Wav2VecWrapper -from models.tacotron2.text import sequence_to_text, tacotron_symbols +from models.audio.asr.w2v_wrapper import only_letters, Wav2VecWrapper +from models.audio.tts.tacotron2 import sequence_to_text, tacotron_symbols from pyctcdecode import build_ctcdecoder # Librispeech: diff --git a/codes/trainer/eval/mel_evaluator.py b/codes/trainer/eval/mel_evaluator.py index 0044c0bf..3a89b3e2 100644 --- a/codes/trainer/eval/mel_evaluator.py +++ b/codes/trainer/eval/mel_evaluator.py @@ -4,7 +4,7 @@ import trainer.eval.evaluator as evaluator from data import create_dataset from data.audio.nv_tacotron_dataset import TextMelCollate -from models.tacotron2.loss import Tacotron2LossRaw +from models.audio.tts.tacotron2 import Tacotron2LossRaw from torch.utils.data import DataLoader from tqdm import tqdm diff --git a/codes/trainer/injectors/audio_injectors.py b/codes/trainer/injectors/audio_injectors.py index 1f799113..694981d0 100644 --- a/codes/trainer/injectors/audio_injectors.py +++ b/codes/trainer/injectors/audio_injectors.py @@ -1,7 +1,6 @@ import random import torch -import torch.nn as nn import torch.nn.functional as F import torchaudio @@ -12,7 +11,7 @@ from utils.util import opt_get, load_model_from_config class MelSpectrogramInjector(Injector): def __init__(self, opt, env): super().__init__(opt, env) - from models.tacotron2.layers import TacotronSTFT + from models.audio.tts.tacotron2 import TacotronSTFT # These are the default tacotron values for the MEL 
spectrogram. filter_length = opt_get(opt, ['filter_length'], 1024) hop_length = opt_get(opt, ['hop_length'], 256) diff --git a/codes/trainer/injectors/spec_augment.py b/codes/trainer/injectors/spec_augment.py index 6a277216..3f5fc206 100644 --- a/codes/trainer/injectors/spec_augment.py +++ b/codes/trainer/injectors/spec_augment.py @@ -83,7 +83,7 @@ class CombineMelInjector(Injector): self.text_lengths = opt['text_lengths_key'] self.output_audio_key = opt['output_audio_key'] self.output_text_key = opt['output_text_key'] - from models.tacotron2.text import symbols + from models.audio.tts.tacotron2 import symbols self.text_separator = len(symbols)+1 # Probably need to allow this to be set by user. def forward(self, state): diff --git a/codes/trainer/losses.py b/codes/trainer/losses.py index 6ea27b64..0352fc52 100644 --- a/codes/trainer/losses.py +++ b/codes/trainer/losses.py @@ -59,7 +59,7 @@ def create_loss(opt_loss, env): from models.switched_conv.mixture_of_experts import SwitchTransformersLoadBalancingLoss return SwitchTransformersLoadBalancingLoss(opt_loss, env) elif type == 'nv_tacotron2_loss': - from models.tacotron2.loss import Tacotron2Loss + from models.audio.tts.tacotron2 import Tacotron2Loss return Tacotron2Loss(opt_loss, env) else: raise NotImplementedError