diff --git a/requirements.txt b/requirements.txt index b971e61..91503ea 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,6 @@ inflect progressbar einops unidecode -entmax \ No newline at end of file +entmax +scipy +librosa \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..3162754 --- /dev/null +++ b/setup.py @@ -0,0 +1,23 @@ +from setuptools import setup, find_packages + +install_requires = [ + "torch", + "torchaudio", + "rotary_embedding_torch", + "transformers", + "tokenizers", + "inflect", + "progressbar", + "einops", + "unidecode", + "entmax", + "scipy", + "librosa" +] + +setup( + name="tortoise_tts", + packages=find_packages(), + package_data={"tortoise_tts": ["data/*.json"]}, + install_requires=install_requires, +) \ No newline at end of file diff --git a/tortoise_tts/__init__.py b/tortoise_tts/__init__.py new file mode 100644 index 0000000..c9cc7ce --- /dev/null +++ b/tortoise_tts/__init__.py @@ -0,0 +1 @@ +from .api import TextToSpeech diff --git a/api.py b/tortoise_tts/api.py similarity index 96% rename from api.py rename to tortoise_tts/api.py index 6aa94cf..aa1f358 100644 --- a/api.py +++ b/tortoise_tts/api.py @@ -8,18 +8,18 @@ import torch.nn.functional as F import progressbar import torchaudio -from models.classifier import AudioMiniEncoderWithClassifierHead -from models.cvvp import CVVP -from models.diffusion_decoder import DiffusionTts -from models.autoregressive import UnifiedVoice +from tortoise_tts.models.classifier import AudioMiniEncoderWithClassifierHead +from tortoise_tts.models.cvvp import CVVP +from tortoise_tts.models.diffusion_decoder import DiffusionTts +from tortoise_tts.models.autoregressive import UnifiedVoice from tqdm import tqdm -from models.arch_util import TorchMelSpectrogram -from models.clvp import CLVP -from models.vocoder import UnivNetGenerator -from utils.audio import load_audio, wav_to_univnet_mel, denormalize_tacotron_mel -from utils.diffusion import SpacedDiffusion, space_timesteps, get_named_beta_schedule
-from utils.tokenizer import VoiceBpeTokenizer, lev_distance +from tortoise_tts.models.arch_util import TorchMelSpectrogram +from tortoise_tts.models.clvp import CLVP +from tortoise_tts.models.vocoder import UnivNetGenerator +from tortoise_tts.utils.audio import load_audio, wav_to_univnet_mel, denormalize_tacotron_mel +from tortoise_tts.utils.diffusion import SpacedDiffusion, space_timesteps, get_named_beta_schedule +from tortoise_tts.utils.tokenizer import VoiceBpeTokenizer, lev_distance pbar = None diff --git a/utils/__init__.py b/tortoise_tts/data/__init__.py similarity index 100% rename from utils/__init__.py rename to tortoise_tts/data/__init__.py diff --git a/data/riding_hood.txt b/tortoise_tts/data/riding_hood.txt similarity index 100% rename from data/riding_hood.txt rename to tortoise_tts/data/riding_hood.txt diff --git a/data/tokenizer.json b/tortoise_tts/data/tokenizer.json similarity index 100% rename from data/tokenizer.json rename to tortoise_tts/data/tokenizer.json diff --git a/do_tts.py b/tortoise_tts/do_tts.py similarity index 100% rename from do_tts.py rename to tortoise_tts/do_tts.py diff --git a/eval_multiple.py b/tortoise_tts/eval_multiple.py similarity index 100% rename from eval_multiple.py rename to tortoise_tts/eval_multiple.py diff --git a/is_this_from_tortoise.py b/tortoise_tts/is_this_from_tortoise.py similarity index 100% rename from is_this_from_tortoise.py rename to tortoise_tts/is_this_from_tortoise.py diff --git a/tortoise_tts/models/__init__.py b/tortoise_tts/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/models/arch_util.py b/tortoise_tts/models/arch_util.py similarity index 99% rename from models/arch_util.py rename to tortoise_tts/models/arch_util.py index 832315c..3390153 100644 --- a/models/arch_util.py +++ b/tortoise_tts/models/arch_util.py @@ -5,7 +5,7 @@ import torch import torch.nn as nn import torch.nn.functional as F import torchaudio -from models.xtransformers import 
ContinuousTransformerWrapper, RelativePositionBias +from tortoise_tts.models.xtransformers import ContinuousTransformerWrapper, RelativePositionBias def zero_module(module): diff --git a/models/autoregressive.py b/tortoise_tts/models/autoregressive.py similarity index 99% rename from models/autoregressive.py rename to tortoise_tts/models/autoregressive.py index 6a91748..aa2393a 100644 --- a/models/autoregressive.py +++ b/tortoise_tts/models/autoregressive.py @@ -6,8 +6,8 @@ import torch.nn.functional as F from transformers import GPT2Config, GPT2PreTrainedModel, LogitsProcessorList from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions from transformers.utils.model_parallel_utils import get_device_map, assert_device_map -from models.arch_util import AttentionBlock -from utils.typical_sampling import TypicalLogitsWarper +from tortoise_tts.models.arch_util import AttentionBlock +from tortoise_tts.utils.typical_sampling import TypicalLogitsWarper def null_position_embeddings(range, dim): diff --git a/models/classifier.py b/tortoise_tts/models/classifier.py similarity index 97% rename from models/classifier.py rename to tortoise_tts/models/classifier.py index c899773..abd0ec8 100644 --- a/models/classifier.py +++ b/tortoise_tts/models/classifier.py @@ -3,7 +3,7 @@ import torch.nn as nn import torch.nn.functional as F from torch.utils.checkpoint import checkpoint -from models.arch_util import Upsample, Downsample, normalization, zero_module, AttentionBlock +from tortoise_tts.models.arch_util import Upsample, Downsample, normalization, zero_module, AttentionBlock class ResBlock(nn.Module): diff --git a/models/clvp.py b/tortoise_tts/models/clvp.py similarity index 96% rename from models/clvp.py rename to tortoise_tts/models/clvp.py index 1eec06a..c054cc6 100644 --- a/models/clvp.py +++ b/tortoise_tts/models/clvp.py @@ -3,9 +3,9 @@ import torch.nn as nn import torch.nn.functional as F from torch import einsum -from models.arch_util import 
CheckpointedXTransformerEncoder -from models.transformer import Transformer -from models.xtransformers import Encoder +from tortoise_tts.models.arch_util import CheckpointedXTransformerEncoder +from tortoise_tts.models.transformer import Transformer +from tortoise_tts.models.xtransformers import Encoder def exists(val): diff --git a/models/cvvp.py b/tortoise_tts/models/cvvp.py similarity index 97% rename from models/cvvp.py rename to tortoise_tts/models/cvvp.py index 0c9fd35..fe441f9 100644 --- a/models/cvvp.py +++ b/tortoise_tts/models/cvvp.py @@ -4,8 +4,8 @@ import torch.nn.functional as F from torch import einsum from torch.utils.checkpoint import checkpoint -from models.arch_util import AttentionBlock -from models.xtransformers import ContinuousTransformerWrapper, Encoder +from tortoise_tts.models.arch_util import AttentionBlock +from tortoise_tts.models.xtransformers import ContinuousTransformerWrapper, Encoder def exists(val): diff --git a/models/diffusion_decoder.py b/tortoise_tts/models/diffusion_decoder.py similarity index 99% rename from models/diffusion_decoder.py rename to tortoise_tts/models/diffusion_decoder.py index 5fdf7ad..d72315f 100644 --- a/models/diffusion_decoder.py +++ b/tortoise_tts/models/diffusion_decoder.py @@ -7,7 +7,7 @@ import torch.nn as nn import torch.nn.functional as F from torch import autocast -from models.arch_util import normalization, AttentionBlock +from tortoise_tts.models.arch_util import normalization, AttentionBlock def is_latent(t): diff --git a/models/transformer.py b/tortoise_tts/models/transformer.py similarity index 100% rename from models/transformer.py rename to tortoise_tts/models/transformer.py diff --git a/models/vocoder.py b/tortoise_tts/models/vocoder.py similarity index 100% rename from models/vocoder.py rename to tortoise_tts/models/vocoder.py diff --git a/models/xtransformers.py b/tortoise_tts/models/xtransformers.py similarity index 100% rename from models/xtransformers.py rename to 
tortoise_tts/models/xtransformers.py diff --git a/read.py b/tortoise_tts/read.py similarity index 100% rename from read.py rename to tortoise_tts/read.py diff --git a/results/.gitattributes b/tortoise_tts/results/.gitattributes similarity index 100% rename from results/.gitattributes rename to tortoise_tts/results/.gitattributes diff --git a/tortoise_tts/results/__init__.py b/tortoise_tts/results/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/results/various/desktop.ini b/tortoise_tts/results/various/desktop.ini similarity index 100% rename from results/various/desktop.ini rename to tortoise_tts/results/various/desktop.ini diff --git a/samples_generator.py b/tortoise_tts/samples_generator.py similarity index 100% rename from samples_generator.py rename to tortoise_tts/samples_generator.py diff --git a/sweep.py b/tortoise_tts/sweep.py similarity index 100% rename from sweep.py rename to tortoise_tts/sweep.py diff --git a/tortoise_tts/utils/__init__.py b/tortoise_tts/utils/__init__.py new file mode 100644 index 0000000..fa1f97f --- /dev/null +++ b/tortoise_tts/utils/__init__.py @@ -0,0 +1,4 @@ +from .audio import ( + load_audio, + get_voices +) diff --git a/utils/audio.py b/tortoise_tts/utils/audio.py similarity index 99% rename from utils/audio.py rename to tortoise_tts/utils/audio.py index aad3a0f..e560c45 100644 --- a/utils/audio.py +++ b/tortoise_tts/utils/audio.py @@ -6,7 +6,7 @@ import torchaudio import numpy as np from scipy.io.wavfile import read -from utils.stft import STFT +from tortoise_tts.utils.stft import STFT def load_wav_to_torch(full_path): diff --git a/utils/diffusion.py b/tortoise_tts/utils/diffusion.py similarity index 100% rename from utils/diffusion.py rename to tortoise_tts/utils/diffusion.py diff --git a/utils/stft.py b/tortoise_tts/utils/stft.py similarity index 100% rename from utils/stft.py rename to tortoise_tts/utils/stft.py diff --git a/utils/tokenizer.py b/tortoise_tts/utils/tokenizer.py similarity index 97% rename 
from utils/tokenizer.py rename to tortoise_tts/utils/tokenizer.py index ed7e4cd..f018abd 100644 --- a/utils/tokenizer.py +++ b/tortoise_tts/utils/tokenizer.py @@ -3,6 +3,7 @@ import re import inflect import torch from tokenizers import Tokenizer +from pathlib import Path # Regular expression matching whitespace: @@ -165,6 +166,7 @@ def lev_distance(s1, s2): class VoiceBpeTokenizer: def __init__(self, vocab_file='data/tokenizer.json'): if vocab_file is not None: + vocab_file = str(Path(__file__).parent.parent / Path(vocab_file)) self.tokenizer = Tokenizer.from_file(vocab_file) diff --git a/utils/typical_sampling.py b/tortoise_tts/utils/typical_sampling.py similarity index 100% rename from utils/typical_sampling.py rename to tortoise_tts/utils/typical_sampling.py diff --git a/tortoise_tts/voices/__init__.py b/tortoise_tts/voices/__init__.py new file mode 100644 index 0000000..e69de29