diff --git a/eval_multiple.py b/eval_multiple.py
deleted file mode 100644
index 9defa52..0000000
--- a/eval_multiple.py
+++ /dev/null
@@ -1,38 +0,0 @@
-import os
-
-import torchaudio
-
-from api import TextToSpeech
-from utils.audio import load_audio
-
-if __name__ == '__main__':
-    fname = 'Y:\\clips\\books2\\subset512-oco.tsv'
-    stop_after = 128
-    outpath_base = 'D:\\tmp\\tortoise-tts-eval\\audiobooks'
-    outpath_real = 'D:\\tmp\\tortoise-tts-eval\\real'
-
-    os.makedirs(outpath_real, exist_ok=True)
-    with open(fname, 'r', encoding='utf-8') as f:
-        lines = [l.strip().split('\t') for l in f.readlines()]
-
-    tts = TextToSpeech()
-    for k in range(3):
-        outpath = f'{outpath_base}_{k}'
-        os.makedirs(outpath, exist_ok=True)
-        recorder = open(os.path.join(outpath, 'transcript.tsv'), 'w', encoding='utf-8')
-        for e, line in enumerate(lines):
-            if e >= stop_after:
-                break
-            transcript = line[0]
-            path = os.path.join(os.path.dirname(fname), line[1])
-            cond_audio = load_audio(path, 22050)
-            torchaudio.save(os.path.join(outpath_real, os.path.basename(line[1])), cond_audio, 22050)
-            sample = tts.tts_with_preset(transcript, [cond_audio, cond_audio], preset='standard')
-
-            down = torchaudio.functional.resample(sample, 24000, 22050)
-            fout_path = os.path.join(outpath, os.path.basename(line[1]))
-            torchaudio.save(fout_path, down.squeeze(0), 22050)
-
-            recorder.write(f'{transcript}\t{fout_path}\n')
-            recorder.flush()
-        recorder.close()
\ No newline at end of file
diff --git a/models/cvvp.py b/models/cvvp.py
deleted file mode 100644
index 0c9fd35..0000000
--- a/models/cvvp.py
+++ /dev/null
@@ -1,133 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch import einsum
-from torch.utils.checkpoint import checkpoint
-
-from models.arch_util import AttentionBlock
-from models.xtransformers import ContinuousTransformerWrapper, Encoder
-
-
-def exists(val):
-    return val is not None
-
-
-def masked_mean(t, mask):
-    t = t.masked_fill(~mask, 0.)
-    return t.sum(dim = 1) / mask.sum(dim = 1)
-
-
-class CollapsingTransformer(nn.Module):
-    def __init__(self, model_dim, output_dims, heads, dropout, depth, mask_percentage=0, **encoder_kwargs):
-        super().__init__()
-        self.transformer = ContinuousTransformerWrapper(
-            max_seq_len=-1,
-            use_pos_emb=False,
-            attn_layers=Encoder(
-                dim=model_dim,
-                depth=depth,
-                heads=heads,
-                ff_dropout=dropout,
-                ff_mult=1,
-                attn_dropout=dropout,
-                use_rmsnorm=True,
-                ff_glu=True,
-                rotary_pos_emb=True,
-                **encoder_kwargs,
-            ))
-        self.pre_combiner = nn.Sequential(nn.Conv1d(model_dim, output_dims, 1),
-                                          AttentionBlock(output_dims, num_heads=heads, do_checkpoint=False),
-                                          nn.Conv1d(output_dims, output_dims, 1))
-        self.mask_percentage = mask_percentage
-
-    def forward(self, x, **transformer_kwargs):
-        h = self.transformer(x, **transformer_kwargs)
-        h = h.permute(0,2,1)
-        h = checkpoint(self.pre_combiner, h).permute(0,2,1)
-        if self.training:
-            mask = torch.rand_like(h.float()) > self.mask_percentage
-        else:
-            mask = torch.ones_like(h.float()).bool()
-        return masked_mean(h, mask)
-
-
-class ConvFormatEmbedding(nn.Module):
-    def __init__(self, *args, **kwargs):
-        super().__init__()
-        self.emb = nn.Embedding(*args, **kwargs)
-
-    def forward(self, x):
-        y = self.emb(x)
-        return y.permute(0,2,1)
-
-
-class CVVP(nn.Module):
-    def __init__(
-            self,
-            model_dim=512,
-            transformer_heads=8,
-            dropout=.1,
-            conditioning_enc_depth=8,
-            cond_mask_percentage=0,
-            mel_channels=80,
-            mel_codes=None,
-            speech_enc_depth=8,
-            speech_mask_percentage=0,
-            latent_multiplier=1,
-    ):
-        super().__init__()
-        latent_dim = latent_multiplier*model_dim
-        self.temperature = nn.Parameter(torch.tensor(1.))
-
-        self.cond_emb = nn.Sequential(nn.Conv1d(mel_channels, model_dim//2, kernel_size=5, stride=2, padding=2),
-                                      nn.Conv1d(model_dim//2, model_dim, kernel_size=3, stride=2, padding=1))
-        self.conditioning_transformer = CollapsingTransformer(model_dim, model_dim, transformer_heads, dropout, conditioning_enc_depth, cond_mask_percentage)
-        self.to_conditioning_latent = nn.Linear(latent_dim, latent_dim, bias=False)
-
-        if mel_codes is None:
-            self.speech_emb = nn.Conv1d(mel_channels, model_dim, kernel_size=5, padding=2)
-        else:
-            self.speech_emb = ConvFormatEmbedding(mel_codes, model_dim)
-        self.speech_transformer = CollapsingTransformer(model_dim, latent_dim, transformer_heads, dropout, speech_enc_depth, speech_mask_percentage)
-        self.to_speech_latent = nn.Linear(latent_dim, latent_dim, bias=False)
-
-    def get_grad_norm_parameter_groups(self):
-        return {
-            'conditioning': list(self.conditioning_transformer.parameters()),
-            'speech': list(self.speech_transformer.parameters()),
-        }
-
-    def forward(
-            self,
-            mel_cond,
-            mel_input,
-            return_loss=False
-    ):
-        cond_emb = self.cond_emb(mel_cond).permute(0,2,1)
-        enc_cond = self.conditioning_transformer(cond_emb)
-        cond_latents = self.to_conditioning_latent(enc_cond)
-
-        speech_emb = self.speech_emb(mel_input).permute(0,2,1)
-        enc_speech = self.speech_transformer(speech_emb)
-        speech_latents = self.to_speech_latent(enc_speech)
-
-
-        cond_latents, speech_latents = map(lambda t: F.normalize(t, p=2, dim=-1), (cond_latents, speech_latents))
-        temp = self.temperature.exp()
-
-        if not return_loss:
-            sim = einsum('n d, n d -> n', cond_latents, speech_latents) * temp
-            return sim
-
-        sim = einsum('i d, j d -> i j', cond_latents, speech_latents) * temp
-        labels = torch.arange(cond_latents.shape[0], device=mel_input.device)
-        loss = (F.cross_entropy(sim, labels) + F.cross_entropy(sim.t(), labels)) / 2
-
-        return loss
-
-
-if __name__ == '__main__':
-    clvp = CVVP()
-    clvp(torch.randn(2,80,100),
-         torch.randn(2,80,95),
-         return_loss=True)
\ No newline at end of file
diff --git a/sweep.py b/sweep.py
deleted file mode 100644
index bc72fec..0000000
--- a/sweep.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import os
-from random import shuffle
-
-import torchaudio
-
-from api import TextToSpeech
-from utils.audio import load_audio
-
-
-def permutations(args):
-    res = []
-    k = next(iter(args.keys()))
-    vals = args[k]
-    del args[k]
-    if not args:
-        return [{k: v} for v in vals]
-    lower = permutations(args)
-    for v in vals:
-        for l in lower:
-            lc = l.copy()
-            lc[k] = v
-            res.append(lc)
-    return res
-
-
-if __name__ == '__main__':
-    fname = 'Y:\\clips\\books2\\subset512-oco.tsv'
-    stop_after = 512
-    outpath_base = 'D:\\tmp\\tortoise-tts-eval\\sweep-2'
-    outpath_real = 'D:\\tmp\\tortoise-tts-eval\\real'
-
-    arg_ranges = {
-        'top_p': [.8,1],
-        'temperature': [.8,.9,1],
-        'diffusion_temperature': [.8,1],
-        'cond_free_k': [1,2,5,10],
-    }
-    cfgs = permutations(arg_ranges)
-    shuffle(cfgs)
-
-    for cfg in cfgs:
-        cfg_desc = '_'.join([f'{k}-{v}' for k,v in cfg.items()])
-        outpath = os.path.join(outpath_base, f'{cfg_desc}')
-        os.makedirs(outpath, exist_ok=True)
-        os.makedirs(outpath_real, exist_ok=True)
-        with open(fname, 'r', encoding='utf-8') as f:
-            lines = [l.strip().split('\t') for l in f.readlines()]
-
-        recorder = open(os.path.join(outpath, 'transcript.tsv'), 'w', encoding='utf-8')
-        tts = TextToSpeech()
-        for e, line in enumerate(lines):
-            if e >= stop_after:
-                break
-            transcript = line[0]
-            path = os.path.join(os.path.dirname(fname), line[1])
-            cond_audio = load_audio(path, 22050)
-            torchaudio.save(os.path.join(outpath_real, os.path.basename(line[1])), cond_audio, 22050)
-            sample = tts.tts(transcript, [cond_audio, cond_audio], num_autoregressive_samples=32, repetition_penalty=2.0,
-                             k=1, diffusion_iterations=32, length_penalty=1.0, **cfg)
-            down = torchaudio.functional.resample(sample, 24000, 22050)
-            fout_path = os.path.join(outpath, os.path.basename(line[1]))
-            torchaudio.save(fout_path, down.squeeze(0), 22050)
-            recorder.write(f'{transcript}\t{fout_path}\n')
-            recorder.flush()
-        recorder.close()
\ No newline at end of file
diff --git a/utils/__init__.py b/tortoise/__init__.py
similarity index 100%
rename from utils/__init__.py
rename to tortoise/__init__.py
diff --git a/api.py b/tortoise/api.py
similarity index 97%
rename from api.py
rename to tortoise/api.py
index 92e82be..8377b35 100644
--- a/api.py
+++ b/tortoise/api.py
@@ -1,4 +1,3 @@
-import argparse
 import os
 import random
 from urllib import request
@@ -8,19 +8,18 @@ import torch.nn.functional as F
 import progressbar
 import torchaudio
 
-from models.classifier import AudioMiniEncoderWithClassifierHead
-from models.cvvp import CVVP
-from models.diffusion_decoder import DiffusionTts
-from models.autoregressive import UnifiedVoice
+from tortoise.models.classifier import AudioMiniEncoderWithClassifierHead
+from tortoise.models.cvvp import CVVP
+from tortoise.models.diffusion_decoder import DiffusionTts
+from tortoise.models.autoregressive import UnifiedVoice
 from tqdm import tqdm
 
-from models.arch_util import TorchMelSpectrogram
-from models.clvp import CLVP
-from models.vocoder import UnivNetGenerator
-from utils.audio import load_audio, wav_to_univnet_mel, denormalize_tacotron_mel
-from utils.diffusion import SpacedDiffusion, space_timesteps, get_named_beta_schedule
-from utils.tokenizer import VoiceBpeTokenizer, lev_distance
-
+from tortoise.models.arch_util import TorchMelSpectrogram
+from tortoise.models.clvp import CLVP
+from tortoise.models.vocoder import UnivNetGenerator
+from tortoise.utils.audio import wav_to_univnet_mel, denormalize_tacotron_mel
+from tortoise.utils.diffusion import SpacedDiffusion, space_timesteps, get_named_beta_schedule
+from tortoise.utils.tokenizer import VoiceBpeTokenizer
 
 
 pbar = None
diff --git a/do_tts.py b/tortoise/do_tts.py
similarity index 96%
rename from do_tts.py
rename to tortoise/do_tts.py
index fa0347e..12e1356 100644
--- a/do_tts.py
+++ b/tortoise/do_tts.py
@@ -4,7 +4,7 @@ import os
 import torchaudio
 
 from api import TextToSpeech
-from utils.audio import load_audio, get_voices
+from tortoise.utils.audio import load_audio, get_voices
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
diff --git a/is_this_from_tortoise.py b/tortoise/is_this_from_tortoise.py
similarity index 91%
rename from is_this_from_tortoise.py
rename to tortoise/is_this_from_tortoise.py
index 550b33e..4bd6dbe 100644
--- a/is_this_from_tortoise.py
+++ b/tortoise/is_this_from_tortoise.py
@@ -1,7 +1,7 @@
 import argparse
 
 from api import classify_audio_clip
-from utils.audio import load_audio
+from tortoise.utils.audio import load_audio
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
diff --git a/models/arch_util.py b/tortoise/models/arch_util.py
similarity index 99%
rename from models/arch_util.py
rename to tortoise/models/arch_util.py
index 832315c..3a004af 100644
--- a/models/arch_util.py
+++ b/tortoise/models/arch_util.py
@@ -5,7 +5,7 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torchaudio
-from models.xtransformers import ContinuousTransformerWrapper, RelativePositionBias
+from tortoise.models.xtransformers import ContinuousTransformerWrapper, RelativePositionBias
 
 
 def zero_module(module):
diff --git a/models/autoregressive.py b/tortoise/models/autoregressive.py
similarity index 99%
rename from models/autoregressive.py
rename to tortoise/models/autoregressive.py
index 6a91748..28ed39b 100644
--- a/models/autoregressive.py
+++ b/tortoise/models/autoregressive.py
@@ -6,8 +6,8 @@ import torch.nn.functional as F
 from transformers import GPT2Config, GPT2PreTrainedModel, LogitsProcessorList
 from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions
 from transformers.utils.model_parallel_utils import get_device_map, assert_device_map
-from models.arch_util import AttentionBlock
-from utils.typical_sampling import TypicalLogitsWarper
+from tortoise.models.arch_util import AttentionBlock
+from tortoise.utils.typical_sampling import TypicalLogitsWarper
 
 
 def null_position_embeddings(range, dim):
diff --git a/models/classifier.py b/tortoise/models/classifier.py
similarity index 97%
rename from models/classifier.py
rename to tortoise/models/classifier.py
index c899773..ce574ea 100644
--- a/models/classifier.py
+++ b/tortoise/models/classifier.py
@@ -1,9 +1,8 @@
 import torch
 import torch.nn as nn
-import torch.nn.functional as F
 from torch.utils.checkpoint import checkpoint
 
-from models.arch_util import Upsample, Downsample, normalization, zero_module, AttentionBlock
+from tortoise.models.arch_util import Upsample, Downsample, normalization, zero_module, AttentionBlock
 
 
 class ResBlock(nn.Module):
diff --git a/models/clvp.py b/tortoise/models/clvp.py
similarity index 97%
rename from models/clvp.py
rename to tortoise/models/clvp.py
index 1eec06a..00f5011 100644
--- a/models/clvp.py
+++ b/tortoise/models/clvp.py
@@ -3,9 +3,9 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch import einsum
 
-from models.arch_util import CheckpointedXTransformerEncoder
-from models.transformer import Transformer
-from models.xtransformers import Encoder
+from tortoise.models.arch_util import CheckpointedXTransformerEncoder
+from tortoise.models.transformer import Transformer
+from tortoise.models.xtransformers import Encoder
 
 
 def exists(val):
diff --git a/models/diffusion_decoder.py b/tortoise/models/diffusion_decoder.py
similarity index 99%
rename from models/diffusion_decoder.py
rename to tortoise/models/diffusion_decoder.py
index 5fdf7ad..b779324 100644
--- a/models/diffusion_decoder.py
+++ b/tortoise/models/diffusion_decoder.py
@@ -7,7 +7,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch import autocast
 
-from models.arch_util import normalization, AttentionBlock
+from tortoise.models.arch_util import normalization, AttentionBlock
 
 
 def is_latent(t):
diff --git a/models/transformer.py b/tortoise/models/transformer.py
similarity index 100%
rename from models/transformer.py
rename to tortoise/models/transformer.py
diff --git a/models/vocoder.py b/tortoise/models/vocoder.py
similarity index 100%
rename from models/vocoder.py
rename to tortoise/models/vocoder.py
diff --git a/models/xtransformers.py b/tortoise/models/xtransformers.py
similarity index 100%
rename from models/xtransformers.py
rename to tortoise/models/xtransformers.py
diff --git a/read.py b/tortoise/read.py
similarity index 95%
rename from read.py
rename to tortoise/read.py
index 9e4e04c..bd9ea11 100644
--- a/read.py
+++ b/tortoise/read.py
@@ -2,12 +2,10 @@ import argparse
 import os
 
 import torch
-import torch.nn.functional as F
 import torchaudio
 
-from api import TextToSpeech, format_conditioning
-from utils.audio import load_audio, get_voices
-from utils.tokenizer import VoiceBpeTokenizer
+from api import TextToSpeech
+from tortoise.utils.audio import load_audio, get_voices
 
 
 def split_and_recombine_text(texts, desired_length=200, max_len=300):
diff --git a/samples_generator.py b/tortoise/samples_generator.py
similarity index 95%
rename from samples_generator.py
rename to tortoise/samples_generator.py
index e2b36d3..937ba2f 100644
--- a/samples_generator.py
+++ b/tortoise/samples_generator.py
@@ -4,7 +4,7 @@ import os
 if __name__ == '__main__':
     result = "These words were never spoken.[…]Handpicked results[…]"
-    for fv in os.listdir('results/favorites'):
+    for fv in os.listdir('../results/favorites'):
         url = f'https://github.com/neonbjb/tortoise-tts/raw/main/results/favorites/{fv}'
         result = result + f'[…]\n'
@@ -30,7 +30,7 @@ if __name__ == '__main__':
         line = line + f'[…]'
         line = line + "[…]"
         lines.append(line)
-    for txt in os.listdir('results/various/'):
+    for txt in os.listdir('../results/various/'):
         if 'desktop' in txt:
             continue
         line = f'{txt}'
@@ -42,7 +42,7 @@ if __name__ == '__main__':
         line = line + f'[…]'
         line = line + "[…]"
         lines.append(line)
     result = result + '\n'.join(lines) + "[…]"
     result = result + "[…]Longform result for all voices:[…]"
-    for lf in os.listdir('results/riding_hood'):
+    for lf in os.listdir('../results/riding_hood'):
         url = f'https://github.com/neonbjb/tortoise-tts/raw/main/results/riding_hood/{lf}'
         result = result + f'[…]\n'
diff --git a/tortoise/utils/__init__.py b/tortoise/utils/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/utils/audio.py b/tortoise/utils/audio.py
similarity index 99%
rename from utils/audio.py
rename to tortoise/utils/audio.py
index cb86566..3fe558a 100644
--- a/utils/audio.py
+++ b/tortoise/utils/audio.py
@@ -6,7 +6,7 @@ import torchaudio
 import numpy as np
 from scipy.io.wavfile import read
 
-from utils.stft import STFT
+from tortoise.utils.stft import STFT
 
 
 def load_wav_to_torch(full_path):
diff --git a/utils/diffusion.py b/tortoise/utils/diffusion.py
similarity index 100%
rename from utils/diffusion.py
rename to tortoise/utils/diffusion.py
diff --git a/utils/stft.py b/tortoise/utils/stft.py
similarity index 100%
rename from utils/stft.py
rename to tortoise/utils/stft.py
diff --git a/utils/tokenizer.py b/tortoise/utils/tokenizer.py
similarity index 100%
rename from utils/tokenizer.py
rename to tortoise/utils/tokenizer.py
diff --git a/utils/typical_sampling.py b/tortoise/utils/typical_sampling.py
similarity index 100%
rename from utils/typical_sampling.py
rename to tortoise/utils/typical_sampling.py