Compare commits

..

No commits in common. "master" and "master" have entirely different histories.

19 changed files with 4217 additions and 5330 deletions

View File

@ -1,4 +0,0 @@
/models
/training
/voices
/bin

View File

@ -1,37 +0,0 @@
# Image for running ai-voice-cloning on an NVIDIA GPU.
# Base: CUDA 11.8 + cuDNN 8 development image; the PyTorch wheels installed
# below come from the matching cu118 index.
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04

# Build-time settings only (ARG values are not kept in the runtime environment).
ARG DEBIAN_FRONTEND=noninteractive
ARG TZ=UTC
ARG MINICONDA_VERSION=23.1.0-1
ARG PYTHON_VERSION=3.9.13

RUN ln -snf "/usr/share/zoneinfo/$TZ" /etc/localtime && echo "$TZ" > /etc/timezone

# Update the index and install in ONE layer so a cached, stale index is never
# reused by a later install; drop the apt lists afterwards to keep the layer small.
# ca-certificates is listed explicitly because recommended packages are skipped
# and the HTTPS downloads below need it.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        ffmpeg \
        git \
        wget \
    && rm -rf /var/lib/apt/lists/*

# Everything from here on runs as an unprivileged user.
RUN adduser --disabled-password --gecos '' --shell /bin/bash user
USER user
ENV HOME=/home/user
WORKDIR $HOME

# NOTE(review): world-writable on purpose, presumably so the container can be
# started under a different UID (docker run --user) and still write to
# $HOME/.cache and $HOME/.config. Tighten this if that is not needed.
RUN mkdir $HOME/.cache $HOME/.config && chmod -R 777 $HOME

# Download, run and delete the Miniconda installer in a single layer so the
# installer script does not remain in the image.
RUN wget "https://repo.anaconda.com/miniconda/Miniconda3-py39_${MINICONDA_VERSION}-Linux-x86_64.sh" -O "$HOME/miniconda-installer.sh" \
    && bash "$HOME/miniconda-installer.sh" -b -p "$HOME/miniconda" \
    && rm "$HOME/miniconda-installer.sh"
ENV PATH="$HOME/miniconda/bin:$PATH"
RUN conda init

# -y: there is no terminal during a build to answer conda's confirmation prompt.
RUN conda install -y "python=${PYTHON_VERSION}" \
    && conda clean --all --yes

RUN python3 -m pip install --no-cache-dir --upgrade pip
RUN pip3 install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Created with an explicit mkdir (rather than relying on WORKDIR) so the
# directory is owned by "user", who needs to write into it below.
RUN mkdir $HOME/ai-voice-cloning
WORKDIR $HOME/ai-voice-cloning

# Dependencies are copied and installed before the application source so that
# source-only changes do not invalidate the (slow) dependency layers.
# Install order is kept as-is: tortoise-tts first, then dlas, then local requirements.
COPY --chown=user:user modules modules
RUN python3 -m pip install --no-cache-dir -r ./modules/tortoise-tts/requirements.txt
RUN python3 -m pip install --no-cache-dir -e ./modules/tortoise-tts/
RUN python3 -m pip install --no-cache-dir -r ./modules/dlas/requirements.txt
RUN python3 -m pip install --no-cache-dir -e ./modules/dlas/
COPY requirements.txt requirements.txt
RUN python3 -m pip install --no-cache-dir -r ./requirements.txt

# COPY instead of ADD: these are plain local files, no archive extraction or URL fetch.
COPY --chown=user:user . $HOME/ai-voice-cloning

# Documentation only; matches the port the default command listens on.
EXPOSE 7680
CMD ["python", "./src/main.py", "--listen", "0.0.0.0:7680"]

View File

@ -1,9 +1,9 @@
# AI Voice Cloning
> **Note** This project is effectively abandonware due to requiring a rewrite. Please use [JarodMica/ai-voice-cloning](https://github.com/JarodMica/ai-voice-cloning).
This [repo](https://git.ecker.tech/mrq/ai-voice-cloning)/[rentry](https://rentry.org/AI-Voice-Cloning/) aims to serve as both a foolproof guide for setting up AI voice cloning tools for legitimate, local use on Windows/Linux, as well as a stepping stone for anons that genuinely want to play around with [TorToiSe](https://github.com/neonbjb/tortoise-tts).
Similar to my own findings for Stable Diffusion image generation, this rentry may appear a little disheveled as I note my new findings with TorToiSe. Please keep this in mind if the guide seems to shift a bit or sound confusing.
>\>Ugh... why bother when I can just abuse 11.AI?
You're more than welcome to, but TorToiSe is shaping up to be a very promising tool, especially with finetuning now on the horizon.
@ -16,4 +16,4 @@ Please consult [the wiki](https://git.ecker.tech/mrq/ai-voice-cloning/wiki) for
## Bug Reporting
If you run into any problems, please refer to the [issues you may encounter](https://git.ecker.tech/mrq/ai-voice-cloning/wiki/Issues) wiki page first.
If you run into any problems, please refer to the [issues you may encounter](https://git.ecker.tech/mrq/ai-voice-cloning/wiki/Issues) wiki page first.

View File

@ -1,106 +1,13 @@
dataset:
training: [
"./training/${voice}/valle/",
]
noise: [
"./training/valle/data/Other/noise/",
]
speaker_name_getter: "lambda p: p.parts[-3]" # "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
use_hdf5: False
hdf5_name: data.h5
hdf5_flag: r
validate: True
data_dirs: [./training/${voice}/valle/]
spkr_name_getter: "lambda p: p.parts[-3]" # "lambda p: p.parts[-1].split('-')[0]"
workers: 4
cache: False
max_phones: 72
phones_range: [4, 64]
duration_range: [1.0, 8.0]
models: '${models}'
batch_size: ${batch_size}
gradient_accumulation_steps: ${gradient_accumulation_size}
eval_batch_size: ${batch_size}
random_utterance: 1.0
max_prompts: 3
prompt_duration: 3.0
sample_type: path
tasks_list: ["tts"] # ["tts", "ns", "sr", "tse", "cse", "nse", "tts"]
models:
_max_levels: 8
_models:
- name: "ar"
size: "full"
resp_levels: 1
prom_levels: 2
tasks: 8
arch_type: "retnet"
- name: "nar"
size: "full"
resp_levels: 3
prom_levels: 4
tasks: 8
arch_type: "retnet"
hyperparameters:
batch_size: ${batch_size}
gradient_accumulation_steps: ${gradient_accumulation_size}
gradient_clipping: 100
optimizer: AdamW
learning_rate: 1.0e-4
scheduler_type: ""
evaluation:
batch_size: ${batch_size}
frequency: ${validation_rate}
size: 16
steps: 300
ar_temperature: 0.95
nar_temperature: 0.25
trainer:
iterations: ${iterations}
save_tag: step
save_on_oom: True
save_on_quit: True
export_on_save: True
export_on_quit: True
save_frequency: ${save_rate}
keep_last_checkpoints: 4
aggressive_optimizations: False
load_state_dict: True
#strict_loading: False
#load_tag: "9500"
#load_states: False
#restart_step_count: True
gc_mode: None # "global_step"
weight_dtype: bfloat16
backend: deepspeed
deepspeed:
zero_optimization_level: 2
use_compression_training: True
inference:
use_vocos: True
normalize: False
weight_dtype: float32
bitsandbytes:
enabled: False
injects: True
linear: True
embedding: True
max_iter: ${iterations}
save_ckpt_every: ${save_rate}
eval_every: ${validation_rate}

@ -1 +1 @@
Subproject commit bf3b6c87aa825295f64a31d010fd5e896fbcda43
Subproject commit 0bcdf81d0444218b4dedaefa5c546d42f36b8130

View File

@ -38,24 +38,10 @@
],
"source":[
"!apt install python3.10-venv\n",
"!apt install python3.8-venv\n",
"!git clone https://git.ecker.tech/mrq/ai-voice-cloning/\n",
"%cd /content/ai-voice-cloning\n",
"# get local dependencies\n",
"!git submodule init\n",
"!git submodule update --remote\n",
"# setup venv\n",
"!python3 -m venv venv\n",
"!source ./venv/bin/activate\n",
"!python3 -m pip install --upgrade pip # just to be safe\n",
"# CUDA\n",
"!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
"# install requirements\n",
"!python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements\n",
"!python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe\n",
"!python3 -m pip install -r ./modules/dlas/requirements.txt # instal DLAS requirements, last, because whisperx will break a dependency here\n",
"!python3 -m pip install -e ./modules/dlas/ # install DLAS\n",
"!python3 -m pip install -r ./requirements.txt # install local requirements"
"!./setup-cuda.sh"
]
},
{
@ -129,8 +115,7 @@
"cell_type":"code",
"source":[
"%cd /content/ai-voice-cloning/\n",
"!source ./venv/bin/activate\n",
"!python3 ./src/main.py --share"
"!./start.sh --share"
],
"metadata":{
"id":"QRA8jF3cF-YJ"

View File

@ -1,17 +1,9 @@
--extra-index-url https://download.pytorch.org/whl/cu118
torch>=2.1.0
torchvision
torchaudio
git+https://github.com/openai/whisper.git
openai-whisper
more-itertools
ffmpeg-python
gradio<=3.23.0
gradio
music-tag
voicefixer
psutil
phonemizer
pydantic==1.10.11
websockets
beartype==0.15.0
pykakasi

View File

@ -4,7 +4,7 @@ git submodule update --remote
python -m venv venv
call .\venv\Scripts\activate.bat
python -m pip install --upgrade pip
python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
python -m pip install -r .\modules\tortoise-tts\requirements.txt
python -m pip install -e .\modules\tortoise-tts\
python -m pip install -r .\modules\dlas\requirements.txt

View File

@ -7,7 +7,7 @@ python3 -m venv venv
source ./venv/bin/activate
python3 -m pip install --upgrade pip # just to be safe
# CUDA
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
# install requirements
python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements
python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe

View File

@ -4,7 +4,7 @@ git submodule update --remote
python -m venv venv
call .\venv\Scripts\activate.bat
python -m pip install --upgrade pip
python -m pip install torch torchvision torchaudio torch-directml
python -m pip install torch==1.13.1 torchvision torchaudio torch-directml
python -m pip install -r .\modules\tortoise-tts\requirements.txt
python -m pip install -e .\modules\tortoise-tts\
python -m pip install -r .\modules\dlas\requirements.txt

View File

@ -1,4 +0,0 @@
#!/bin/bash
# Fetch the git submodules and build the "ai-voice-cloning" Docker image from
# the Dockerfile in the current directory.

# Abort on the first failing command so the image is never built on top of
# missing or half-updated submodules.
set -euo pipefail

git submodule init
git submodule update --remote
docker build -t ai-voice-cloning .

View File

@ -7,7 +7,7 @@ python3 -m venv venv
source ./venv/bin/activate
python3 -m pip install --upgrade pip # just to be safe
# ROCM
pip3 install torch==1.13.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.2 # 5.4.2 doesn't work for me desu
pip3 install torch==1.13.1 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.2 # 5.4.2 doesn't work for me desu
# install requirements
python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements
python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe

View File

@ -1,84 +0,0 @@
import asyncio
import json
from threading import Thread
from websockets.server import serve
from utils import generate, get_autoregressive_models, get_voice_list, args, update_autoregressive_model, update_diffusion_model, update_tokenizer
# this is a not so nice workaround to set values to None if their string value is "None"
def replaceNoneStringWithNone(message):
    """Replace values equal to the string ``'None'`` with ``None``, in place.

    Keys listed in ``literal_allowed`` keep their value even when it is the
    literal string ``'None'``.

    Args:
        message: mapping to normalise; it is mutated.

    Returns:
        The same ``message`` object that was passed in.
    """
    literal_allowed = ('text',)  # fields that may legitimately contain "None" as text
    for key in message:
        if key in literal_allowed:
            continue
        if message[key] == 'None':
            message[key] = None
    return message
async def _handle_generate(websocket, message):
    """Handle a ``generate`` request and send the result back to the client.

    Optional model-selection fields in ``message`` are applied first, then the
    whole message is passed to ``generate`` as keyword arguments. The return
    value is stored under ``message['result']`` and the message is sent back
    as JSON.

    Args:
        websocket: connection the reply is sent on.
        message: decoded request; it is mutated (``result`` is added).
    """
    # Each optional field is forwarded to its updater only when present and truthy.
    model_updaters = (
        ('autoregressive_model', update_autoregressive_model),
        ('diffusion_model', update_diffusion_model),
        ('tokenizer_json', update_tokenizer),
    )
    for field, apply_update in model_updaters:
        requested = message.get(field)
        if requested:
            apply_update(requested)

    batch_size = message.get('sample_batch_size')
    if batch_size:
        # `args` is the settings object imported from utils; only an attribute
        # is assigned, so no `global` declaration is required.
        args.sample_batch_size = batch_size

    message['result'] = generate(**message)
    payload = json.dumps(replaceNoneStringWithNone(message))
    await websocket.send(payload)
async def _handle_get_autoregressive_models(websocket, message):
    """Reply with ``message`` extended by the value of ``get_autoregressive_models()``.

    Args:
        websocket: connection the reply is sent on.
        message: decoded request; it is mutated (``result`` is added).
    """
    message['result'] = get_autoregressive_models()
    reply = replaceNoneStringWithNone(message)
    await websocket.send(json.dumps(reply))
async def _handle_get_voice_list(websocket, message):
    """Reply with ``message`` extended by the value of ``get_voice_list()``.

    Args:
        websocket: connection the reply is sent on.
        message: decoded request; it is mutated (``result`` is added).
    """
    message['result'] = get_voice_list()
    reply = replaceNoneStringWithNone(message)
    await websocket.send(json.dumps(reply))
async def _handle_message(websocket, message):
message = replaceNoneStringWithNone(message)
if message.get('action') and message['action'] == 'generate':
await _handle_generate(websocket, message)
elif message.get('action') and message['action'] == 'get_voices':
await _handle_get_voice_list(websocket, message)
elif message.get('action') and message['action'] == 'get_autoregressive_models':
await _handle_get_autoregressive_models(websocket, message)
else:
print("websocket: undhandled message: " + message)
async def _handle_connection(websocket, path):
print("websocket: client connected")
async for message in websocket:
try:
await _handle_message(websocket, json.loads(message))
except ValueError:
print("websocket: malformed json received")
async def _run(host: str, port: int):
    """Start the websocket server on ``host``:``port`` and keep it running.

    The server is created with ``ping_interval=None``.
    """
    print(f"websocket: server started on ws://{host}:{port}")
    server = serve(_handle_connection, host, port, ping_interval=None)
    async with server:
        # Await a future that is never resolved: keeps the server context open.
        await asyncio.Future()
def _run_server(listen_address: str, port: int):
    """Blocking entry point: run the websocket server via ``asyncio.run``."""
    server_coroutine = _run(host=listen_address, port=port)
    asyncio.run(server_coroutine)
def start_websocket_server(listen_address: str, port: int):
    """Start the websocket server on a daemon thread and return immediately.

    Args:
        listen_address: address passed through to the server as its host.
        port: port passed through to the server.
    """
    server_thread = Thread(
        target=_run_server,
        args=[listen_address, port],
        daemon=True,  # do not keep the process alive for the server thread
    )
    server_thread.start()

View File

@ -1,66 +0,0 @@
import os
import argparse
# Default model directories, resolved against the current working directory.
# A value already present in the environment is left untouched.
# NOTE(review): set before the `utils` import below, presumably so the
# libraries it pulls in see these values — confirm.
_model_dir_defaults = (
    ('TORTOISE_MODELS_DIR', './models/tortoise/'),
    ('TRANSFORMERS_CACHE', './models/transformers/'),
)
for _env_name, _relative_dir in _model_dir_defaults:
    if _env_name not in os.environ:
        os.environ[_env_name] = os.path.realpath(os.path.join(os.getcwd(), _relative_dir))

# Always overridden, regardless of any existing value.
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
from utils import *
if __name__ == "__main__":
    # Command-line entry point: run one generation using the saved generate
    # settings as defaults, each overridable by a same-named --option.
    args = setup_args(cli=True)  # return value is replaced by the argparse namespace below

    default_arguments = import_generate_settings()

    # Settings exposed on the command line, in the order they are registered
    # and passed on to generate().
    cli_fields = (
        'text',
        'delimiter',
        'emotion',
        'prompt',
        'voice',
        'mic_audio',
        'voice_latents_chunks',
        'candidates',
        'seed',
        'num_autoregressive_samples',
        'diffusion_iterations',
        'temperature',
        'diffusion_sampler',
        'breathing_room',
        'cvvp_weight',
        'top_p',
        'diffusion_temperature',
        'length_penalty',
        'repetition_penalty',
        'cond_free_k',
    )

    parser = argparse.ArgumentParser(allow_abbrev=False)
    for field in cli_fields:
        parser.add_argument(f"--{field}", default=default_arguments[field])

    # Unrecognised options are tolerated rather than rejected.
    args, unknown = parser.parse_known_args()

    kwargs = {field: getattr(args, field) for field in cli_fields}
    # Not configurable from the command line; always taken from the saved settings.
    kwargs['experimentals'] = default_arguments['experimentals']

    tts = load_tts()
    generate(**kwargs)

View File

@ -11,9 +11,6 @@ os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
from utils import *
from webui import *
from api.websocket_server import start_websocket_server
if __name__ == "__main__":
args = setup_args()
@ -26,9 +23,6 @@ if __name__ == "__main__":
if not args.defer_tts_load:
tts = load_tts()
if args.websocket_enabled:
start_websocket_server(args.websocket_listen_address, args.websocket_listen_port)
webui.block_thread()
elif __name__ == "main":
from fastapi import FastAPI
@ -43,5 +37,4 @@ elif __name__ == "main":
app = gr.mount_gradio_app(app, webui, path=args.listen_path)
if not args.defer_tts_load:
tts = load_tts()
tts = load_tts()

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,14 +0,0 @@
#!/bin/bash
# Run ./src/main.py inside the "ai-voice-cloning" image, forwarding every
# argument given to this script. The models, training, voices and bin
# directories of the current directory are bind-mounted into the container.

# Kept as an array: building a string from "$@" and expanding it unquoted
# re-splits the arguments, which breaks any argument containing spaces or globs.
CMD=(python3 ./src/main.py "$@")
# CMD=(bash)
CPATH="/home/user/ai-voice-cloning"
docker run --rm --gpus all \
	--mount "type=bind,src=$PWD/models,dst=$CPATH/models" \
	--mount "type=bind,src=$PWD/training,dst=$CPATH/training" \
	--mount "type=bind,src=$PWD/voices,dst=$CPATH/voices" \
	--mount "type=bind,src=$PWD/bin,dst=$CPATH/bin" \
	--workdir "$CPATH" \
	--user "$(id -u):$(id -g)" \
	--net host \
	-it ai-voice-cloning "${CMD[@]}"

View File

@ -1,15 +0,0 @@
#!/bin/bash
# Run ./src/train.py inside the "ai-voice-cloning" image.
# Usage: <this script> <training-config.yaml>
# The models, training, voices, bin and src directories of the current
# directory are bind-mounted into the container.

# Fail early with a usage message when the YAML path is missing, and keep the
# command as an array so a path containing spaces is passed as one argument.
CMD=(python3 ./src/train.py --yaml "${1:?usage: $0 <training-config.yaml>}")
# ipc host is one way to increase the shared memory for the container
# more info here https://github.com/pytorch/pytorch#docker-image
CPATH="/home/user/ai-voice-cloning"
docker run --rm --gpus all \
	--mount "type=bind,src=$PWD/models,dst=$CPATH/models" \
	--mount "type=bind,src=$PWD/training,dst=$CPATH/training" \
	--mount "type=bind,src=$PWD/voices,dst=$CPATH/voices" \
	--mount "type=bind,src=$PWD/bin,dst=$CPATH/bin" \
	--mount "type=bind,src=$PWD/src,dst=$CPATH/src" \
	--workdir "$CPATH" \
	--ipc host \
	--user "$(id -u):$(id -g)" \
	-it ai-voice-cloning "${CMD[@]}"