Compare commits
No commits in common. "master" and "master" have entirely different histories.
|
@ -1,4 +0,0 @@
|
|||
/models
|
||||
/training
|
||||
/voices
|
||||
/bin
|
37
Dockerfile
37
Dockerfile
|
@ -1,37 +0,0 @@
|
|||
# Image that runs the AI Voice Cloning web UI on top of an NVIDIA CUDA base.
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04

# Build-time settings only: ARG (not ENV) keeps them out of the runtime environment.
ARG DEBIAN_FRONTEND=noninteractive
ARG TZ=UTC
ARG MINICONDA_VERSION=23.1.0-1
ARG PYTHON_VERSION=3.9.13

RUN ln -snf "/usr/share/zoneinfo/$TZ" /etc/localtime && echo "$TZ" > /etc/timezone

# update + install share one layer so a cached, stale package index is never
# reused by a later install; the index is removed in the same layer so it does
# not end up in the image. ca-certificates is listed explicitly because
# --no-install-recommends would otherwise not guarantee it for the HTTPS
# downloads below.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        ffmpeg \
        git \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Everything from here on runs as an unprivileged user.
RUN adduser --disabled-password --gecos '' --shell /bin/bash user
USER user
ENV HOME=/home/user
WORKDIR $HOME

# NOTE(review): the world-writable home looks deliberate -- presumably so the
# container can be started under an arbitrary host UID (docker run --user);
# confirm before tightening.
RUN mkdir $HOME/.cache $HOME/.config && chmod -R 777 $HOME

# Download, run and delete the installer in a single layer so the installer
# script is not kept in the image.
RUN wget "https://repo.anaconda.com/miniconda/Miniconda3-py39_${MINICONDA_VERSION}-Linux-x86_64.sh" -O miniconda.sh \
    && bash miniconda.sh -b -p "$HOME/miniconda" \
    && rm miniconda.sh
ENV PATH="$HOME/miniconda/bin:$PATH"
RUN conda init
# -y: never wait on a confirmation prompt during a non-interactive build.
RUN conda install -y python=$PYTHON_VERSION && conda clean -afy
RUN python3 -m pip install --no-cache-dir --upgrade pip
RUN pip3 install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Created with RUN (as `user`) instead of relying on WORKDIR to create it, so
# the directory is owned by `user`.
RUN mkdir $HOME/ai-voice-cloning
WORKDIR $HOME/ai-voice-cloning
COPY --chown=user:user modules modules

# Dependencies are installed before the application source is copied so these
# layers stay cached when only the source changes.
RUN python3 -m pip install --no-cache-dir -r ./modules/tortoise-tts/requirements.txt
RUN python3 -m pip install --no-cache-dir -e ./modules/tortoise-tts/
RUN python3 -m pip install --no-cache-dir -r ./modules/dlas/requirements.txt
RUN python3 -m pip install --no-cache-dir -e ./modules/dlas/
# COPY instead of ADD: plain local files need none of ADD's extra behaviour.
COPY requirements.txt requirements.txt
RUN python3 -m pip install --no-cache-dir -r ./requirements.txt
COPY --chown=user:user . $HOME/ai-voice-cloning

CMD ["python", "./src/main.py", "--listen", "0.0.0.0:7680"]
|
|
@ -1,9 +1,9 @@
|
|||
# AI Voice Cloning
|
||||
|
||||
> **Note** This project is effectively abandonware due to requiring a rewrite. Please use [JarodMica/ai-voice-cloning](https://github.com/JarodMica/ai-voice-cloning).
|
||||
|
||||
This [repo](https://git.ecker.tech/mrq/ai-voice-cloning)/[rentry](https://rentry.org/AI-Voice-Cloning/) aims to serve as both a foolproof guide for setting up AI voice cloning tools for legitimate, local use on Windows/Linux, as well as a stepping stone for anons that genuinely want to play around with [TorToiSe](https://github.com/neonbjb/tortoise-tts).
|
||||
|
||||
Similar to my own findings for Stable Diffusion image generation, this rentry may appear a little disheveled as I note my new findings with TorToiSe. Please keep this in mind if the guide seems to shift a bit or sound confusing.
|
||||
|
||||
>\>Ugh... why bother when I can just abuse 11.AI?
|
||||
|
||||
You're more than welcome to, but TorToiSe is shaping up to be a very promising tool, especially with finetuning now on the horizon.
|
||||
|
@ -16,4 +16,4 @@ Please consult [the wiki](https://git.ecker.tech/mrq/ai-voice-cloning/wiki) for
|
|||
|
||||
## Bug Reporting
|
||||
|
||||
If you run into any problems, please refer to the [issues you may encounter](https://git.ecker.tech/mrq/ai-voice-cloning/wiki/Issues) wiki page first.
|
||||
If you run into any problems, please refer to the [issues you may encounter](https://git.ecker.tech/mrq/ai-voice-cloning/wiki/Issues) wiki page first.
|
|
@ -1,106 +1,13 @@
|
|||
dataset:
|
||||
training: [
|
||||
"./training/${voice}/valle/",
|
||||
]
|
||||
noise: [
|
||||
"./training/valle/data/Other/noise/",
|
||||
]
|
||||
|
||||
speaker_name_getter: "lambda p: p.parts[-3]" # "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
|
||||
|
||||
use_hdf5: False
|
||||
hdf5_name: data.h5
|
||||
hdf5_flag: r
|
||||
validate: True
|
||||
data_dirs: [./training/${voice}/valle/]
|
||||
spkr_name_getter: "lambda p: p.parts[-3]" # "lambda p: p.parts[-1].split('-')[0]"
|
||||
|
||||
workers: 4
|
||||
cache: False
|
||||
max_phones: 72
|
||||
|
||||
phones_range: [4, 64]
|
||||
duration_range: [1.0, 8.0]
|
||||
models: '${models}'
|
||||
batch_size: ${batch_size}
|
||||
gradient_accumulation_steps: ${gradient_accumulation_size}
|
||||
eval_batch_size: ${batch_size}
|
||||
|
||||
random_utterance: 1.0
|
||||
max_prompts: 3
|
||||
prompt_duration: 3.0
|
||||
|
||||
sample_type: path
|
||||
|
||||
tasks_list: ["tts"] # ["tts", "ns", "sr", "tse", "cse", "nse", "tts"]
|
||||
|
||||
models:
|
||||
_max_levels: 8
|
||||
_models:
|
||||
- name: "ar"
|
||||
size: "full"
|
||||
resp_levels: 1
|
||||
prom_levels: 2
|
||||
tasks: 8
|
||||
arch_type: "retnet"
|
||||
|
||||
- name: "nar"
|
||||
size: "full"
|
||||
resp_levels: 3
|
||||
prom_levels: 4
|
||||
tasks: 8
|
||||
arch_type: "retnet"
|
||||
|
||||
|
||||
hyperparameters:
|
||||
batch_size: ${batch_size}
|
||||
gradient_accumulation_steps: ${gradient_accumulation_size}
|
||||
gradient_clipping: 100
|
||||
|
||||
optimizer: AdamW
|
||||
learning_rate: 1.0e-4
|
||||
|
||||
scheduler_type: ""
|
||||
|
||||
evaluation:
|
||||
batch_size: ${batch_size}
|
||||
frequency: ${validation_rate}
|
||||
size: 16
|
||||
|
||||
steps: 300
|
||||
ar_temperature: 0.95
|
||||
nar_temperature: 0.25
|
||||
|
||||
trainer:
|
||||
iterations: ${iterations}
|
||||
|
||||
save_tag: step
|
||||
save_on_oom: True
|
||||
save_on_quit: True
|
||||
export_on_save: True
|
||||
export_on_quit: True
|
||||
save_frequency: ${save_rate}
|
||||
|
||||
keep_last_checkpoints: 4
|
||||
|
||||
aggressive_optimizations: False
|
||||
|
||||
load_state_dict: True
|
||||
#strict_loading: False
|
||||
#load_tag: "9500"
|
||||
#load_states: False
|
||||
#restart_step_count: True
|
||||
|
||||
gc_mode: None # "global_step"
|
||||
|
||||
weight_dtype: bfloat16
|
||||
|
||||
backend: deepspeed
|
||||
deepspeed:
|
||||
zero_optimization_level: 2
|
||||
use_compression_training: True
|
||||
|
||||
inference:
|
||||
use_vocos: True
|
||||
normalize: False
|
||||
|
||||
weight_dtype: float32
|
||||
|
||||
bitsandbytes:
|
||||
enabled: False
|
||||
injects: True
|
||||
linear: True
|
||||
embedding: True
|
||||
max_iter: ${iterations}
|
||||
save_ckpt_every: ${save_rate}
|
||||
eval_every: ${validation_rate}
|
|
@ -1 +1 @@
|
|||
Subproject commit bf3b6c87aa825295f64a31d010fd5e896fbcda43
|
||||
Subproject commit 0bcdf81d0444218b4dedaefa5c546d42f36b8130
|
|
@ -38,24 +38,10 @@
|
|||
|
||||
],
|
||||
"source":[
|
||||
"!apt install python3.10-venv\n",
|
||||
"!apt install python3.8-venv\n",
|
||||
"!git clone https://git.ecker.tech/mrq/ai-voice-cloning/\n",
|
||||
"%cd /content/ai-voice-cloning\n",
|
||||
"# get local dependencies\n",
|
||||
"!git submodule init\n",
|
||||
"!git submodule update --remote\n",
|
||||
"# setup venv\n",
|
||||
"!python3 -m venv venv\n",
|
||||
"!source ./venv/bin/activate\n",
|
||||
"!python3 -m pip install --upgrade pip # just to be safe\n",
|
||||
"# CUDA\n",
|
||||
"!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
|
||||
"# install requirements\n",
|
||||
"!python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements\n",
|
||||
"!python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe\n",
|
||||
"!python3 -m pip install -r ./modules/dlas/requirements.txt # install DLAS requirements, last, because whisperx will break a dependency here\n",
|
||||
"!python3 -m pip install -e ./modules/dlas/ # install DLAS\n",
|
||||
"!python3 -m pip install -r ./requirements.txt # install local requirements"
|
||||
"!./setup-cuda.sh"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
@ -129,8 +115,7 @@
|
|||
"cell_type":"code",
|
||||
"source":[
|
||||
"%cd /content/ai-voice-cloning/\n",
|
||||
"!source ./venv/bin/activate\n",
|
||||
"!python3 ./src/main.py --share"
|
||||
"!./start.sh --share"
|
||||
],
|
||||
"metadata":{
|
||||
"id":"QRA8jF3cF-YJ"
|
||||
|
|
|
@ -1,17 +1,9 @@
|
|||
--extra-index-url https://download.pytorch.org/whl/cu118
|
||||
torch>=2.1.0
|
||||
torchvision
|
||||
torchaudio
|
||||
git+https://github.com/openai/whisper.git
|
||||
|
||||
openai-whisper
|
||||
more-itertools
|
||||
ffmpeg-python
|
||||
gradio<=3.23.0
|
||||
gradio
|
||||
music-tag
|
||||
voicefixer
|
||||
psutil
|
||||
phonemizer
|
||||
pydantic==1.10.11
|
||||
websockets
|
||||
beartype==0.15.0
|
||||
pykakasi
|
|
@ -4,7 +4,7 @@ git submodule update --remote
|
|||
python -m venv venv
|
||||
call .\venv\Scripts\activate.bat
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
||||
python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
|
||||
python -m pip install -r .\modules\tortoise-tts\requirements.txt
|
||||
python -m pip install -e .\modules\tortoise-tts\
|
||||
python -m pip install -r .\modules\dlas\requirements.txt
|
||||
|
|
|
@ -7,7 +7,7 @@ python3 -m venv venv
|
|||
source ./venv/bin/activate
|
||||
python3 -m pip install --upgrade pip # just to be safe
|
||||
# CUDA
|
||||
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
||||
pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
|
||||
# install requirements
|
||||
python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements
|
||||
python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe
|
||||
|
|
|
@ -4,7 +4,7 @@ git submodule update --remote
|
|||
python -m venv venv
|
||||
call .\venv\Scripts\activate.bat
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install torch torchvision torchaudio torch-directml
|
||||
python -m pip install torch==1.13.1 torchvision torchaudio torch-directml
|
||||
python -m pip install -r .\modules\tortoise-tts\requirements.txt
|
||||
python -m pip install -e .\modules\tortoise-tts\
|
||||
python -m pip install -r .\modules\dlas\requirements.txt
|
||||
|
|
|
@ -1,4 +0,0 @@
|
|||
#!/bin/bash
# Initialise and update the git submodules, then build the `ai-voice-cloning`
# Docker image from the current directory.
#
# Stop at the first failing command so the image is never built after a failed
# or partial submodule update.
set -euo pipefail

git submodule init
git submodule update --remote
docker build -t ai-voice-cloning .
|
|
@ -7,7 +7,7 @@ python3 -m venv venv
|
|||
source ./venv/bin/activate
|
||||
python3 -m pip install --upgrade pip # just to be safe
|
||||
# ROCM
|
||||
pip3 install torch==1.13.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.2 # 5.4.2 doesn't work for me desu
|
||||
pip3 install torch==1.13.1 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.2 # 5.4.2 doesn't work for me desu
|
||||
# install requirements
|
||||
python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements
|
||||
python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe
|
||||
|
|
|
@ -1,84 +0,0 @@
|
|||
import asyncio
|
||||
import json
|
||||
from threading import Thread
|
||||
|
||||
from websockets.server import serve
|
||||
|
||||
from utils import generate, get_autoregressive_models, get_voice_list, args, update_autoregressive_model, update_diffusion_model, update_tokenizer
|
||||
|
||||
# this is a not so nice workaround to set values to None if their string value is "None"
def replaceNoneStringWithNone(message):
    """Replace the literal string ``"None"`` with ``None`` in a message, in place.

    Only top-level values are inspected. Keys listed in ``ignore_fields`` keep
    their value even when it is the string ``"None"``.

    Args:
        message: The decoded message. Anything that is not a ``dict`` (a JSON
            array, scalar or null) is returned untouched instead of raising.

    Returns:
        The same ``message`` object that was passed in.
    """
    ignore_fields = ['text']  # list of fields which CAN have "None" as literal String value

    # A decoded JSON document is not necessarily an object; indexing a list or
    # scalar by its own members would raise TypeError.
    if not isinstance(message, dict):
        return message

    for member in message:
        if message[member] == 'None' and member not in ignore_fields:
            message[member] = None

    return message
|
||||
|
||||
|
||||
async def _handle_generate(websocket, message):
    """Apply any model settings carried by the request, run ``generate`` and reply.

    The whole ``message`` dict is passed to ``generate`` as keyword arguments;
    its return value is stored under ``'result'`` and the message is sent back
    as JSON over ``websocket``.
    """
    # (request key, updater) pairs, applied in this order; absent or falsy
    # values leave the corresponding setting alone.
    model_updaters = (
        ('autoregressive_model', update_autoregressive_model),
        ('diffusion_model', update_diffusion_model),
        ('tokenizer_json', update_tokenizer),
    )
    for key, apply_update in model_updaters:
        requested = message.get(key)
        if requested:
            apply_update(requested)

    batch_size = message.get('sample_batch_size')
    if batch_size:
        # Sets an attribute on the imported args object; the name itself is
        # never rebound, so no `global` declaration is required.
        args.sample_batch_size = batch_size

    message['result'] = generate(**message)
    reply = json.dumps(replaceNoneStringWithNone(message))
    await websocket.send(reply)
|
||||
|
||||
|
||||
async def _handle_get_autoregressive_models(websocket, message):
    """Store ``get_autoregressive_models()`` under ``'result'`` and send the message back as JSON."""
    message.update(result=get_autoregressive_models())
    reply = json.dumps(replaceNoneStringWithNone(message))
    await websocket.send(reply)
|
||||
|
||||
|
||||
async def _handle_get_voice_list(websocket, message):
    """Store ``get_voice_list()`` under ``'result'`` and send the message back as JSON."""
    message.update(result=get_voice_list())
    reply = json.dumps(replaceNoneStringWithNone(message))
    await websocket.send(reply)
|
||||
|
||||
|
||||
async def _handle_message(websocket, message):
    """Dispatch one decoded message to the handler selected by its ``'action'`` field.

    Recognised actions are ``'generate'``, ``'get_voices'`` and
    ``'get_autoregressive_models'``. Anything else, including payloads that are
    not JSON objects, is logged and ignored.

    Args:
        websocket: Connection the reply is sent on.
        message: The decoded JSON payload.
    """
    # Valid JSON is not always an object; a list or scalar has no .get() and
    # must not take the connection down.
    if not isinstance(message, dict):
        print(f"websocket: unhandled message: {message!r}")
        return

    message = replaceNoneStringWithNone(message)
    action = message.get('action')

    if action == 'generate':
        await _handle_generate(websocket, message)
    elif action == 'get_voices':
        await _handle_get_voice_list(websocket, message)
    elif action == 'get_autoregressive_models':
        await _handle_get_autoregressive_models(websocket, message)
    else:
        # Formatted rather than concatenated: message is a dict, and
        # str + dict raises TypeError.
        print(f"websocket: unhandled message: {message}")
|
||||
|
||||
|
||||
async def _handle_connection(websocket, path):
    """Serve one client: decode each incoming frame as JSON and dispatch it.

    ``path`` is accepted but not used. A ``ValueError`` raised while decoding
    or while handling the message is reported as malformed JSON and the loop
    carries on with the next frame.
    """
    print("websocket: client connected")

    async for raw_message in websocket:
        try:
            payload = json.loads(raw_message)
            await _handle_message(websocket, payload)
        except ValueError:
            print("websocket: malformed json received")
|
||||
|
||||
|
||||
async def _run(host: str, port: int):
    """Serve websocket connections on ``host``:``port`` until the task is cancelled."""
    print(f"websocket: server started on ws://{host}:{port}")

    # ping_interval=None is passed through to serve() unchanged.
    server = serve(_handle_connection, host, port, ping_interval=None)
    async with server:
        # A future that is never resolved keeps the server context open.
        never_resolved = asyncio.Future()
        await never_resolved
|
||||
|
||||
|
||||
def _run_server(listen_address: str, port: int):
    """Blocking entry point: run the websocket server coroutine via ``asyncio.run``."""
    server_coroutine = _run(listen_address, port)
    asyncio.run(server_coroutine)
|
||||
|
||||
|
||||
def start_websocket_server(listen_address: str, port: int):
    """Start the websocket server on a daemon thread and return immediately."""
    server_thread = Thread(
        target=_run_server,
        args=(listen_address, port),
        daemon=True,
    )
    server_thread.start()
|
66
src/cli.py
66
src/cli.py
|
@ -1,66 +0,0 @@
|
|||
import os
|
||||
import argparse
|
||||
|
||||
# Default the model locations to ./models/... under the current working
# directory; values already present in the environment are left untouched.
os.environ.setdefault(
    'TORTOISE_MODELS_DIR',
    os.path.realpath(os.path.join(os.getcwd(), './models/tortoise/')),
)
os.environ.setdefault(
    'TRANSFORMERS_CACHE',
    os.path.realpath(os.path.join(os.getcwd(), './models/transformers/')),
)

# Always overridden, regardless of what the caller exported.
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
|
||||
|
||||
from utils import *
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: one --<name> option per generation setting,
    # defaulting to the stored generate settings, then a single generate() call.
    args = setup_args(cli=True)

    default_arguments = import_generate_settings()

    # Settings exposed on the command line, in the order they are handed to generate().
    option_names = (
        'text',
        'delimiter',
        'emotion',
        'prompt',
        'voice',
        'mic_audio',
        'voice_latents_chunks',
        'candidates',
        'seed',
        'num_autoregressive_samples',
        'diffusion_iterations',
        'temperature',
        'diffusion_sampler',
        'breathing_room',
        'cvvp_weight',
        'top_p',
        'diffusion_temperature',
        'length_penalty',
        'repetition_penalty',
        'cond_free_k',
    )

    parser = argparse.ArgumentParser(allow_abbrev=False)
    for option in option_names:
        # NOTE(review): no type= is given, so a value supplied on the command
        # line arrives as a str while the default keeps its stored type --
        # confirm generate() copes with both.
        parser.add_argument(f"--{option}", default=default_arguments[option])

    # Unrecognised options are tolerated and collected in `unknown`.
    args, unknown = parser.parse_known_args()

    kwargs = {option: getattr(args, option) for option in option_names}
    # Not exposed as an option; always taken from the stored settings.
    kwargs['experimentals'] = default_arguments['experimentals']

    tts = load_tts()
    generate(**kwargs)
|
|
@ -11,9 +11,6 @@ os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
|
|||
from utils import *
|
||||
from webui import *
|
||||
|
||||
from api.websocket_server import start_websocket_server
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = setup_args()
|
||||
|
||||
|
@ -26,9 +23,6 @@ if __name__ == "__main__":
|
|||
if not args.defer_tts_load:
|
||||
tts = load_tts()
|
||||
|
||||
if args.websocket_enabled:
|
||||
start_websocket_server(args.websocket_listen_address, args.websocket_listen_port)
|
||||
|
||||
webui.block_thread()
|
||||
elif __name__ == "main":
|
||||
from fastapi import FastAPI
|
||||
|
@ -43,5 +37,4 @@ elif __name__ == "main":
|
|||
app = gr.mount_gradio_app(app, webui, path=args.listen_path)
|
||||
|
||||
if not args.defer_tts_load:
|
||||
tts = load_tts()
|
||||
|
||||
tts = load_tts()
|
7263
src/utils.py
7263
src/utils.py
File diff suppressed because it is too large
Load Diff
1889
src/webui.py
1889
src/webui.py
File diff suppressed because it is too large
Load Diff
|
@ -1,14 +0,0 @@
|
|||
#!/bin/bash
# Start ./src/main.py inside the `ai-voice-cloning` image. Every argument given
# to this script is forwarded to main.py.

# An array keeps each forwarded argument as a separate word. Building a string
# from $@ and expanding it unquoted would re-split arguments containing spaces
# and glob-expand them.
CMD=(python3 ./src/main.py "$@")
# CMD=(bash)
CPATH="/home/user/ai-voice-cloning"

# The host directories are bind-mounted so their contents live outside the
# container, which is removed on exit (--rm).
docker run --rm --gpus all \
    --mount "type=bind,src=$PWD/models,dst=$CPATH/models" \
    --mount "type=bind,src=$PWD/training,dst=$CPATH/training" \
    --mount "type=bind,src=$PWD/voices,dst=$CPATH/voices" \
    --mount "type=bind,src=$PWD/bin,dst=$CPATH/bin" \
    --workdir "$CPATH" \
    --user "$(id -u):$(id -g)" \
    --net host \
    -it ai-voice-cloning "${CMD[@]}"
|
||||
|
|
@ -1,15 +0,0 @@
|
|||
#!/bin/bash
# Run ./src/train.py inside the `ai-voice-cloning` image.
# Usage: <this script> <training-config.yaml>

# The YAML path is required; fail early with a usage hint instead of starting
# a container that receives `--yaml` without a value.
if [ "$#" -lt 1 ]; then
    echo "usage: $0 <training-config.yaml>" >&2
    exit 1
fi

# An array keeps the YAML path a single argument even if it contains spaces.
CMD=(python3 ./src/train.py --yaml "$1")
# ipc host is one way to increase the shared memory for the container
# more info here https://github.com/pytorch/pytorch#docker-image
CPATH="/home/user/ai-voice-cloning"

docker run --rm --gpus all \
    --mount "type=bind,src=$PWD/models,dst=$CPATH/models" \
    --mount "type=bind,src=$PWD/training,dst=$CPATH/training" \
    --mount "type=bind,src=$PWD/voices,dst=$CPATH/voices" \
    --mount "type=bind,src=$PWD/bin,dst=$CPATH/bin" \
    --mount "type=bind,src=$PWD/src,dst=$CPATH/src" \
    --workdir "$CPATH" \
    --ipc host \
    --user "$(id -u):$(id -g)" \
    -it ai-voice-cloning "${CMD[@]}"
|
Loading…
Reference in New Issue
Block a user