Compare commits
No commits in common. "master" and "master" have entirely different histories.
2
Dockerfile
Executable file → Normal file
2
Dockerfile
Executable file → Normal file
|
@ -20,7 +20,7 @@ ENV PATH="$HOME/miniconda/bin:$PATH"
|
|||
RUN conda init
|
||||
RUN conda install python=$PYTHON_VERSION
|
||||
RUN python3 -m pip install --upgrade pip
|
||||
RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
||||
RUN pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
|
||||
|
||||
RUN mkdir $HOME/ai-voice-cloning
|
||||
WORKDIR $HOME/ai-voice-cloning
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# AI Voice Cloning
|
||||
|
||||
> **Note** This project is effectively abandonware due to requiring a rewrite. Please use [JarodMica/ai-voice-cloning](https://github.com/JarodMica/ai-voice-cloning).
|
||||
> **Note** This project has been in dire need of being rewritten from the ground up for some time. Apologies for any cruft from my rather spaghetti code.
|
||||
|
||||
This [repo](https://git.ecker.tech/mrq/ai-voice-cloning)/[rentry](https://rentry.org/AI-Voice-Cloning/) aims to serve as both a foolproof guide for setting up AI voice cloning tools for legitimate, local use on Windows/Linux, as well as a stepping stone for anons that genuinely want to play around with [TorToiSe](https://github.com/neonbjb/tortoise-tts).
|
||||
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit bf3b6c87aa825295f64a31d010fd5e896fbcda43
|
||||
Subproject commit b10c58436d6871c26485d30b203e6cfdd4167602
|
|
@ -38,24 +38,10 @@
|
|||
|
||||
],
|
||||
"source":[
|
||||
"!apt install python3.10-venv\n",
|
||||
"!apt install python3.8-venv\n",
|
||||
"!git clone https://git.ecker.tech/mrq/ai-voice-cloning/\n",
|
||||
"%cd /content/ai-voice-cloning\n",
|
||||
"# get local dependencies\n",
|
||||
"!git submodule init\n",
|
||||
"!git submodule update --remote\n",
|
||||
"# setup venv\n",
|
||||
"!python3 -m venv venv\n",
|
||||
"!source ./venv/bin/activate\n",
|
||||
"!python3 -m pip install --upgrade pip # just to be safe\n",
|
||||
"# CUDA\n",
|
||||
"!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
|
||||
"# install requirements\n",
|
||||
"!python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements\n",
|
||||
"!python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe\n",
|
||||
"!python3 -m pip install -r ./modules/dlas/requirements.txt # instal DLAS requirements, last, because whisperx will break a dependency here\n",
|
||||
"!python3 -m pip install -e ./modules/dlas/ # install DLAS\n",
|
||||
"!python3 -m pip install -r ./requirements.txt # install local requirements"
|
||||
"!./setup-cuda.sh"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -129,8 +115,7 @@
|
|||
"cell_type":"code",
|
||||
"source":[
|
||||
"%cd /content/ai-voice-cloning/\n",
|
||||
"!source ./venv/bin/activate\n",
|
||||
"!python3 ./src/main.py --share"
|
||||
"!./start.sh --share"
|
||||
],
|
||||
"metadata":{
|
||||
"id":"QRA8jF3cF-YJ"
|
||||
|
|
|
@ -1,9 +1,5 @@
|
|||
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||
torch>=2.1.0
|
||||
torchvision
|
||||
torchaudio
|
||||
git+https://github.com/openai/whisper.git
|
||||
|
||||
openai-whisper
|
||||
more-itertools
|
||||
ffmpeg-python
|
||||
gradio<=3.23.0
|
||||
|
@ -13,5 +9,3 @@ psutil
|
|||
phonemizer
|
||||
pydantic==1.10.11
|
||||
websockets
|
||||
beartype==0.15.0
|
||||
pykakasi
|
|
@ -4,7 +4,7 @@ git submodule update --remote
|
|||
python -m venv venv
|
||||
call .\venv\Scripts\activate.bat
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
||||
python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
|
||||
python -m pip install -r .\modules\tortoise-tts\requirements.txt
|
||||
python -m pip install -e .\modules\tortoise-tts\
|
||||
python -m pip install -r .\modules\dlas\requirements.txt
|
||||
|
|
|
@ -7,7 +7,7 @@ python3 -m venv venv
|
|||
source ./venv/bin/activate
|
||||
python3 -m pip install --upgrade pip # just to be safe
|
||||
# CUDA
|
||||
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
||||
pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
|
||||
# install requirements
|
||||
python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements
|
||||
python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe
|
||||
|
|
|
@ -7,7 +7,7 @@ python3 -m venv venv
|
|||
source ./venv/bin/activate
|
||||
python3 -m pip install --upgrade pip # just to be safe
|
||||
# ROCM
|
||||
pip3 install torch==1.13.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.2 # 5.4.2 doesn't work for me desu
|
||||
pip3 install torch==1.13.1 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.2 # 5.4.2 doesn't work for me desu
|
||||
# install requirements
|
||||
python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements
|
||||
python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe
|
||||
|
|
42
src/utils.py
42
src/utils.py
|
@ -68,20 +68,8 @@ BARK_ENABLED = False
|
|||
|
||||
VERBOSE_DEBUG = True
|
||||
|
||||
KKS = None
|
||||
PYKAKASI_ENABLED = False
|
||||
|
||||
import traceback
|
||||
|
||||
try:
|
||||
import pykakasi
|
||||
KKS = pykakasi.kakasi()
|
||||
PYKAKASI_ENABLED = True
|
||||
except Exception as e:
|
||||
#if VERBOSE_DEBUG:
|
||||
# print(traceback.format_exc())
|
||||
pass
|
||||
|
||||
try:
|
||||
from whisper.normalizers.english import EnglishTextNormalizer
|
||||
from whisper.normalizers.basic import BasicTextNormalizer
|
||||
|
@ -184,6 +172,8 @@ if BARK_ENABLED:
|
|||
|
||||
try:
|
||||
from hubert.hubert_manager import HuBERTManager
|
||||
from hubert.pre_kmeans_hubert import CustomHubert
|
||||
from hubert.customtokenizer import CustomTokenizer
|
||||
|
||||
hubert_manager = HuBERTManager()
|
||||
hubert_manager.make_sure_hubert_installed()
|
||||
|
@ -254,9 +244,6 @@ if BARK_ENABLED:
|
|||
# generate semantic tokens
|
||||
|
||||
if self.hubert_enabled:
|
||||
from hubert.pre_kmeans_hubert import CustomHubert
|
||||
from hubert.customtokenizer import CustomTokenizer
|
||||
|
||||
wav = wav.to(self.device)
|
||||
|
||||
# Extract discrete codes from EnCodec
|
||||
|
@ -312,7 +299,7 @@ if BARK_ENABLED:
|
|||
semantic_tokens = text_to_semantic(text, history_prompt=voice, temp=text_temp, silent=False)
|
||||
audio_tokens = semantic_to_audio_tokens( semantic_tokens, history_prompt=voice, temp=waveform_temp, silent=False, output_full=False )
|
||||
|
||||
if self.vocos_enabled:
|
||||
if VOCOS_ENABLED:
|
||||
audio_tokens_torch = torch.from_numpy(audio_tokens).to(self.device)
|
||||
features = self.vocos.codes_to_features(audio_tokens_torch)
|
||||
wav = self.vocos.decode(features, bandwidth_id=torch.tensor([2], device=self.device))
|
||||
|
@ -523,7 +510,7 @@ def generate_bark(**kwargs):
|
|||
settings['datetime'] = datetime.now().isoformat()
|
||||
|
||||
# save here in case some error happens mid-batch
|
||||
if tts.vocos_enabled:
|
||||
if VOCOS_ENABLED:
|
||||
torchaudio.save(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', wav.cpu(), sr)
|
||||
else:
|
||||
write_wav(f'{outdir}/{cleanup_voice_name(voice)}_{name}.wav', sr, wav)
|
||||
|
@ -2005,7 +1992,7 @@ def run_training(config_path, verbose=False, keep_x_past_checkpoints=0, progress
|
|||
training_state = TrainingState(config_path=config_path, keep_x_past_checkpoints=keep_x_past_checkpoints)
|
||||
|
||||
for line in iter(training_state.process.stdout.readline, ""):
|
||||
if training_state is None or training_state.killed:
|
||||
if training_state.killed:
|
||||
return
|
||||
|
||||
result, percent, message = training_state.parse( line=line, verbose=verbose, keep_x_past_checkpoints=keep_x_past_checkpoints, progress=progress )
|
||||
|
@ -2677,8 +2664,8 @@ def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, p
|
|||
|
||||
|
||||
culled = len(text) < text_length
|
||||
if not culled and audio_length > 0:
|
||||
culled = duration < audio_length
|
||||
#if not culled and audio_length > 0:
|
||||
# culled = duration < audio_length
|
||||
|
||||
line = f'audio/{file}|{phonemes if phonemize and phonemes else text}'
|
||||
|
||||
|
@ -2746,14 +2733,6 @@ def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, p
|
|||
phn_file = jobs['phonemize'][0][i]
|
||||
normalized = jobs['phonemize'][1][i]
|
||||
|
||||
if language == "japanese":
|
||||
language = "ja"
|
||||
|
||||
if language == "ja" and PYKAKASI_ENABLED and KKS is not None:
|
||||
normalized = KKS.convert(normalized)
|
||||
normalized = [ n["hira"] for n in normalized ]
|
||||
normalized = "".join(normalized)
|
||||
|
||||
try:
|
||||
phonemized = valle_phonemize( normalized )
|
||||
open(phn_file, 'w', encoding='utf-8').write(" ".join(phonemized))
|
||||
|
@ -3292,7 +3271,6 @@ def setup_args(cli=False):
|
|||
'embed-output-metadata': True,
|
||||
'latents-lean-and-mean': True,
|
||||
'voice-fixer': False, # getting tired of long initialization times in a Colab for downloading a large dataset for it
|
||||
'use-deepspeed': False,
|
||||
'voice-fixer-use-cuda': True,
|
||||
|
||||
|
||||
|
@ -3351,7 +3329,6 @@ def setup_args(cli=False):
|
|||
parser.add_argument("--latents-lean-and-mean", action='store_true', default=default_arguments['latents-lean-and-mean'], help="Exports the bare essentials for latents.")
|
||||
parser.add_argument("--voice-fixer", action='store_true', default=default_arguments['voice-fixer'], help="Uses python module 'voicefixer' to improve audio quality, if available.")
|
||||
parser.add_argument("--voice-fixer-use-cuda", action='store_true', default=default_arguments['voice-fixer-use-cuda'], help="Hints to voicefixer to use CUDA, if available.")
|
||||
parser.add_argument("--use-deepspeed", action='store_true', default=default_arguments['use-deepspeed'], help="Use deepspeed for speed bump.")
|
||||
parser.add_argument("--force-cpu-for-conditioning-latents", default=default_arguments['force-cpu-for-conditioning-latents'], action='store_true', help="Forces computing conditional latents to be done on the CPU (if you constantyl OOM on low chunk counts)")
|
||||
parser.add_argument("--defer-tts-load", default=default_arguments['defer-tts-load'], action='store_true', help="Defers loading TTS model")
|
||||
parser.add_argument("--prune-nonfinal-outputs", default=default_arguments['prune-nonfinal-outputs'], action='store_true', help="Deletes non-final output files on completing a generation")
|
||||
|
@ -3387,6 +3364,7 @@ def setup_args(cli=False):
|
|||
parser.add_argument("--websocket-listen-address", default=default_arguments['websocket-listen-address'], help="Websocket server listen address, default: 127.0.0.1")
|
||||
parser.add_argument("--websocket-enabled", action='store_true', default=default_arguments['websocket-enabled'], help="Websocket API server enabled, default: false")
|
||||
|
||||
parser.add_argument("--os", default="unix", help="Specifies which OS, easily")
|
||||
if cli:
|
||||
args, unknown = parser.parse_known_args()
|
||||
else:
|
||||
|
@ -3436,7 +3414,6 @@ def get_default_settings( hypenated=True ):
|
|||
'embed-output-metadata': args.embed_output_metadata,
|
||||
'latents-lean-and-mean': args.latents_lean_and_mean,
|
||||
'voice-fixer': args.voice_fixer,
|
||||
'use-deepspeed': args.use_deepspeed,
|
||||
'voice-fixer-use-cuda': args.voice_fixer_use_cuda,
|
||||
'concurrency-count': args.concurrency_count,
|
||||
'output-sample-rate': args.output_sample_rate,
|
||||
|
@ -3490,7 +3467,6 @@ def update_args( **kwargs ):
|
|||
args.latents_lean_and_mean = settings['latents_lean_and_mean']
|
||||
args.voice_fixer = settings['voice_fixer']
|
||||
args.voice_fixer_use_cuda = settings['voice_fixer_use_cuda']
|
||||
args.use_deepspeed = settings['use_deepspeed']
|
||||
args.concurrency_count = settings['concurrency_count']
|
||||
args.output_sample_rate = 44000
|
||||
args.autocalculate_voice_chunk_duration_size = settings['autocalculate_voice_chunk_duration_size']
|
||||
|
@ -3663,7 +3639,7 @@ def load_tts( restart=False,
|
|||
print("!!!! WARNING !!!! No GPU available in PyTorch. You may need to reinstall PyTorch.")
|
||||
|
||||
print(f"Loading TorToiSe... (AR: {autoregressive_model}, diffusion: {diffusion_model}, vocoder: {vocoder_model})")
|
||||
tts = TorToise_TTS(minor_optimizations=not args.low_vram, autoregressive_model_path=autoregressive_model, diffusion_model_path=diffusion_model, vocoder_model=vocoder_model, tokenizer_json=tokenizer_json, unsqueeze_sample_batches=args.unsqueeze_sample_batches, use_deepspeed=args.use_deepspeed)
|
||||
tts = TorToise_TTS(minor_optimizations=not args.low_vram, autoregressive_model_path=autoregressive_model, diffusion_model_path=diffusion_model, vocoder_model=vocoder_model, tokenizer_json=tokenizer_json, unsqueeze_sample_batches=args.unsqueeze_sample_batches)
|
||||
elif args.tts_backend == "vall-e":
|
||||
if valle_model:
|
||||
args.valle_model = valle_model
|
||||
|
|
|
@ -643,7 +643,6 @@ def setup_gradio():
|
|||
EXEC_SETTINGS['embed_output_metadata'] = gr.Checkbox(label="Embed Output Metadata", value=args.embed_output_metadata)
|
||||
EXEC_SETTINGS['latents_lean_and_mean'] = gr.Checkbox(label="Slimmer Computed Latents", value=args.latents_lean_and_mean)
|
||||
EXEC_SETTINGS['voice_fixer'] = gr.Checkbox(label="Use Voice Fixer on Generated Output", value=args.voice_fixer)
|
||||
EXEC_SETTINGS['use_deepspeed'] = gr.Checkbox(label="Use DeepSpeed for Speed Bump.", value=args.use_deepspeed)
|
||||
EXEC_SETTINGS['voice_fixer_use_cuda'] = gr.Checkbox(label="Use CUDA for Voice Fixer", value=args.voice_fixer_use_cuda)
|
||||
EXEC_SETTINGS['force_cpu_for_conditioning_latents'] = gr.Checkbox(label="Force CPU for Conditioning Latents", value=args.force_cpu_for_conditioning_latents)
|
||||
EXEC_SETTINGS['defer_tts_load'] = gr.Checkbox(label="Do Not Load TTS On Startup", value=args.defer_tts_load)
|
||||
|
|
Loading…
Reference in New Issue
Block a user