Compare commits
No commits in common. "master" and "master" have entirely different histories.
2
Dockerfile
Executable file → Normal file
2
Dockerfile
Executable file → Normal file
|
@ -20,7 +20,7 @@ ENV PATH="$HOME/miniconda/bin:$PATH"
|
||||||
RUN conda init
|
RUN conda init
|
||||||
RUN conda install python=$PYTHON_VERSION
|
RUN conda install python=$PYTHON_VERSION
|
||||||
RUN python3 -m pip install --upgrade pip
|
RUN python3 -m pip install --upgrade pip
|
||||||
RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
RUN pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
|
||||||
|
|
||||||
RUN mkdir $HOME/ai-voice-cloning
|
RUN mkdir $HOME/ai-voice-cloning
|
||||||
WORKDIR $HOME/ai-voice-cloning
|
WORKDIR $HOME/ai-voice-cloning
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
# AI Voice Cloning
|
# AI Voice Cloning
|
||||||
|
|
||||||
> **Note** This project is effectively abandonware due to requiring a rewrite. Please use [JarodMica/ai-voice-cloning](https://github.com/JarodMica/ai-voice-cloning).
|
|
||||||
|
|
||||||
This [repo](https://git.ecker.tech/mrq/ai-voice-cloning)/[rentry](https://rentry.org/AI-Voice-Cloning/) aims to serve as both a foolproof guide for setting up AI voice cloning tools for legitimate, local use on Windows/Linux, as well as a stepping stone for anons that genuinely want to play around with [TorToiSe](https://github.com/neonbjb/tortoise-tts).
|
This [repo](https://git.ecker.tech/mrq/ai-voice-cloning)/[rentry](https://rentry.org/AI-Voice-Cloning/) aims to serve as both a foolproof guide for setting up AI voice cloning tools for legitimate, local use on Windows/Linux, as well as a stepping stone for anons that genuinely want to play around with [TorToiSe](https://github.com/neonbjb/tortoise-tts).
|
||||||
|
|
||||||
|
Similar to my own findings for Stable Diffusion image generation, this rentry may appear a little disheveled as I note my new findings with TorToiSe. Please keep this in mind if the guide seems to shift a bit or sound confusing.
|
||||||
|
|
||||||
>\>Ugh... why bother when I can just abuse 11.AI?
|
>\>Ugh... why bother when I can just abuse 11.AI?
|
||||||
|
|
||||||
You're more than welcome to, but TorToiSe is shaping up to be a very promising tool, especially with finetuning now on the horizon.
|
You're more than welcome to, but TorToiSe is shaping up to be a very promising tool, especially with finetuning now on the horizon.
|
||||||
|
@ -16,4 +16,4 @@ Please consult [the wiki](https://git.ecker.tech/mrq/ai-voice-cloning/wiki) for
|
||||||
|
|
||||||
## Bug Reporting
|
## Bug Reporting
|
||||||
|
|
||||||
If you run into any problems, please refer to the [issues you may encounter](https://git.ecker.tech/mrq/ai-voice-cloning/wiki/Issues) wiki page first.
|
If you run into any problems, please refer to the [issues you may encounter](https://git.ecker.tech/mrq/ai-voice-cloning/wiki/Issues) wiki page first.
|
|
@ -1,106 +1,13 @@
|
||||||
dataset:
|
data_dirs: [./training/${voice}/valle/]
|
||||||
training: [
|
spkr_name_getter: "lambda p: p.parts[-3]" # "lambda p: p.parts[-1].split('-')[0]"
|
||||||
"./training/${voice}/valle/",
|
|
||||||
]
|
|
||||||
noise: [
|
|
||||||
"./training/valle/data/Other/noise/",
|
|
||||||
]
|
|
||||||
|
|
||||||
speaker_name_getter: "lambda p: p.parts[-3]" # "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
|
|
||||||
|
|
||||||
use_hdf5: False
|
|
||||||
hdf5_name: data.h5
|
|
||||||
hdf5_flag: r
|
|
||||||
validate: True
|
|
||||||
|
|
||||||
workers: 4
|
max_phones: 72
|
||||||
cache: False
|
|
||||||
|
|
||||||
phones_range: [4, 64]
|
models: '${models}'
|
||||||
duration_range: [1.0, 8.0]
|
batch_size: ${batch_size}
|
||||||
|
gradient_accumulation_steps: ${gradient_accumulation_size}
|
||||||
|
eval_batch_size: ${batch_size}
|
||||||
|
|
||||||
random_utterance: 1.0
|
max_iter: ${iterations}
|
||||||
max_prompts: 3
|
save_ckpt_every: ${save_rate}
|
||||||
prompt_duration: 3.0
|
eval_every: ${validation_rate}
|
||||||
|
|
||||||
sample_type: path
|
|
||||||
|
|
||||||
tasks_list: ["tts"] # ["tts", "ns", "sr", "tse", "cse", "nse", "tts"]
|
|
||||||
|
|
||||||
models:
|
|
||||||
_max_levels: 8
|
|
||||||
_models:
|
|
||||||
- name: "ar"
|
|
||||||
size: "full"
|
|
||||||
resp_levels: 1
|
|
||||||
prom_levels: 2
|
|
||||||
tasks: 8
|
|
||||||
arch_type: "retnet"
|
|
||||||
|
|
||||||
- name: "nar"
|
|
||||||
size: "full"
|
|
||||||
resp_levels: 3
|
|
||||||
prom_levels: 4
|
|
||||||
tasks: 8
|
|
||||||
arch_type: "retnet"
|
|
||||||
|
|
||||||
|
|
||||||
hyperparameters:
|
|
||||||
batch_size: ${batch_size}
|
|
||||||
gradient_accumulation_steps: ${gradient_accumulation_size}
|
|
||||||
gradient_clipping: 100
|
|
||||||
|
|
||||||
optimizer: AdamW
|
|
||||||
learning_rate: 1.0e-4
|
|
||||||
|
|
||||||
scheduler_type: ""
|
|
||||||
|
|
||||||
evaluation:
|
|
||||||
batch_size: ${batch_size}
|
|
||||||
frequency: ${validation_rate}
|
|
||||||
size: 16
|
|
||||||
|
|
||||||
steps: 300
|
|
||||||
ar_temperature: 0.95
|
|
||||||
nar_temperature: 0.25
|
|
||||||
|
|
||||||
trainer:
|
|
||||||
iterations: ${iterations}
|
|
||||||
|
|
||||||
save_tag: step
|
|
||||||
save_on_oom: True
|
|
||||||
save_on_quit: True
|
|
||||||
export_on_save: True
|
|
||||||
export_on_quit: True
|
|
||||||
save_frequency: ${save_rate}
|
|
||||||
|
|
||||||
keep_last_checkpoints: 4
|
|
||||||
|
|
||||||
aggressive_optimizations: False
|
|
||||||
|
|
||||||
load_state_dict: True
|
|
||||||
#strict_loading: False
|
|
||||||
#load_tag: "9500"
|
|
||||||
#load_states: False
|
|
||||||
#restart_step_count: True
|
|
||||||
|
|
||||||
gc_mode: None # "global_step"
|
|
||||||
|
|
||||||
weight_dtype: bfloat16
|
|
||||||
|
|
||||||
backend: deepspeed
|
|
||||||
deepspeed:
|
|
||||||
zero_optimization_level: 2
|
|
||||||
use_compression_training: True
|
|
||||||
|
|
||||||
inference:
|
|
||||||
use_vocos: True
|
|
||||||
normalize: False
|
|
||||||
|
|
||||||
weight_dtype: float32
|
|
||||||
|
|
||||||
bitsandbytes:
|
|
||||||
enabled: False
|
|
||||||
injects: True
|
|
||||||
linear: True
|
|
||||||
embedding: True
|
|
|
@ -1 +1 @@
|
||||||
Subproject commit bf3b6c87aa825295f64a31d010fd5e896fbcda43
|
Subproject commit 5ff00bf3bfa97e2c8e9f166b920273f83ac9d8f0
|
|
@ -38,24 +38,10 @@
|
||||||
|
|
||||||
],
|
],
|
||||||
"source":[
|
"source":[
|
||||||
"!apt install python3.10-venv\n",
|
"!apt install python3.8-venv\n",
|
||||||
"!git clone https://git.ecker.tech/mrq/ai-voice-cloning/\n",
|
"!git clone https://git.ecker.tech/mrq/ai-voice-cloning/\n",
|
||||||
"%cd /content/ai-voice-cloning\n",
|
"%cd /content/ai-voice-cloning\n",
|
||||||
"# get local dependencies\n",
|
"!./setup-cuda.sh"
|
||||||
"!git submodule init\n",
|
|
||||||
"!git submodule update --remote\n",
|
|
||||||
"# setup venv\n",
|
|
||||||
"!python3 -m venv venv\n",
|
|
||||||
"!source ./venv/bin/activate\n",
|
|
||||||
"!python3 -m pip install --upgrade pip # just to be safe\n",
|
|
||||||
"# CUDA\n",
|
|
||||||
"!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
|
|
||||||
"# install requirements\n",
|
|
||||||
"!python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements\n",
|
|
||||||
"!python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe\n",
|
|
||||||
"!python3 -m pip install -r ./modules/dlas/requirements.txt # instal DLAS requirements, last, because whisperx will break a dependency here\n",
|
|
||||||
"!python3 -m pip install -e ./modules/dlas/ # install DLAS\n",
|
|
||||||
"!python3 -m pip install -r ./requirements.txt # install local requirements"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -129,8 +115,7 @@
|
||||||
"cell_type":"code",
|
"cell_type":"code",
|
||||||
"source":[
|
"source":[
|
||||||
"%cd /content/ai-voice-cloning/\n",
|
"%cd /content/ai-voice-cloning/\n",
|
||||||
"!source ./venv/bin/activate\n",
|
"!./start.sh --share"
|
||||||
"!python3 ./src/main.py --share"
|
|
||||||
],
|
],
|
||||||
"metadata":{
|
"metadata":{
|
||||||
"id":"QRA8jF3cF-YJ"
|
"id":"QRA8jF3cF-YJ"
|
||||||
|
|
|
@ -1,9 +1,5 @@
|
||||||
--extra-index-url https://download.pytorch.org/whl/cu118
|
git+https://github.com/openai/whisper.git
|
||||||
torch>=2.1.0
|
|
||||||
torchvision
|
|
||||||
torchaudio
|
|
||||||
|
|
||||||
openai-whisper
|
|
||||||
more-itertools
|
more-itertools
|
||||||
ffmpeg-python
|
ffmpeg-python
|
||||||
gradio<=3.23.0
|
gradio<=3.23.0
|
||||||
|
@ -11,7 +7,4 @@ music-tag
|
||||||
voicefixer
|
voicefixer
|
||||||
psutil
|
psutil
|
||||||
phonemizer
|
phonemizer
|
||||||
pydantic==1.10.11
|
pydantic==1.10.11
|
||||||
websockets
|
|
||||||
beartype==0.15.0
|
|
||||||
pykakasi
|
|
|
@ -4,7 +4,7 @@ git submodule update --remote
|
||||||
python -m venv venv
|
python -m venv venv
|
||||||
call .\venv\Scripts\activate.bat
|
call .\venv\Scripts\activate.bat
|
||||||
python -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
|
||||||
python -m pip install -r .\modules\tortoise-tts\requirements.txt
|
python -m pip install -r .\modules\tortoise-tts\requirements.txt
|
||||||
python -m pip install -e .\modules\tortoise-tts\
|
python -m pip install -e .\modules\tortoise-tts\
|
||||||
python -m pip install -r .\modules\dlas\requirements.txt
|
python -m pip install -r .\modules\dlas\requirements.txt
|
||||||
|
|
|
@ -7,7 +7,7 @@ python3 -m venv venv
|
||||||
source ./venv/bin/activate
|
source ./venv/bin/activate
|
||||||
python3 -m pip install --upgrade pip # just to be safe
|
python3 -m pip install --upgrade pip # just to be safe
|
||||||
# CUDA
|
# CUDA
|
||||||
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
|
||||||
# install requirements
|
# install requirements
|
||||||
python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements
|
python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements
|
||||||
python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe
|
python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe
|
||||||
|
|
|
@ -7,7 +7,7 @@ python3 -m venv venv
|
||||||
source ./venv/bin/activate
|
source ./venv/bin/activate
|
||||||
python3 -m pip install --upgrade pip # just to be safe
|
python3 -m pip install --upgrade pip # just to be safe
|
||||||
# ROCM
|
# ROCM
|
||||||
pip3 install torch==1.13.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.2 # 5.4.2 doesn't work for me desu
|
pip3 install torch==1.13.1 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.2 # 5.4.2 doesn't work for me desu
|
||||||
# install requirements
|
# install requirements
|
||||||
python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements
|
python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements
|
||||||
python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe
|
python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe
|
||||||
|
|
|
@ -1,84 +0,0 @@
|
||||||
import asyncio
|
|
||||||
import json
|
|
||||||
from threading import Thread
|
|
||||||
|
|
||||||
from websockets.server import serve
|
|
||||||
|
|
||||||
from utils import generate, get_autoregressive_models, get_voice_list, args, update_autoregressive_model, update_diffusion_model, update_tokenizer
|
|
||||||
|
|
||||||
# this is a not so nice workaround to set values to None if their string value is "None"
|
|
||||||
def replaceNoneStringWithNone(message):
|
|
||||||
ignore_fields = ['text'] # list of fields which CAN have "None" as literal String value
|
|
||||||
|
|
||||||
for member in message:
|
|
||||||
if message[member] == 'None' and member not in ignore_fields:
|
|
||||||
message[member] = None
|
|
||||||
|
|
||||||
return message
|
|
||||||
|
|
||||||
|
|
||||||
async def _handle_generate(websocket, message):
|
|
||||||
# update args parameters which control the model settings
|
|
||||||
if message.get('autoregressive_model'):
|
|
||||||
update_autoregressive_model(message['autoregressive_model'])
|
|
||||||
|
|
||||||
if message.get('diffusion_model'):
|
|
||||||
update_diffusion_model(message['diffusion_model'])
|
|
||||||
|
|
||||||
if message.get('tokenizer_json'):
|
|
||||||
update_tokenizer(message['tokenizer_json'])
|
|
||||||
|
|
||||||
if message.get('sample_batch_size'):
|
|
||||||
global args
|
|
||||||
args.sample_batch_size = message['sample_batch_size']
|
|
||||||
|
|
||||||
message['result'] = generate(**message)
|
|
||||||
await websocket.send(json.dumps(replaceNoneStringWithNone(message)))
|
|
||||||
|
|
||||||
|
|
||||||
async def _handle_get_autoregressive_models(websocket, message):
|
|
||||||
message['result'] = get_autoregressive_models()
|
|
||||||
await websocket.send(json.dumps(replaceNoneStringWithNone(message)))
|
|
||||||
|
|
||||||
|
|
||||||
async def _handle_get_voice_list(websocket, message):
|
|
||||||
message['result'] = get_voice_list()
|
|
||||||
await websocket.send(json.dumps(replaceNoneStringWithNone(message)))
|
|
||||||
|
|
||||||
|
|
||||||
async def _handle_message(websocket, message):
|
|
||||||
message = replaceNoneStringWithNone(message)
|
|
||||||
|
|
||||||
if message.get('action') and message['action'] == 'generate':
|
|
||||||
await _handle_generate(websocket, message)
|
|
||||||
elif message.get('action') and message['action'] == 'get_voices':
|
|
||||||
await _handle_get_voice_list(websocket, message)
|
|
||||||
elif message.get('action') and message['action'] == 'get_autoregressive_models':
|
|
||||||
await _handle_get_autoregressive_models(websocket, message)
|
|
||||||
else:
|
|
||||||
print("websocket: undhandled message: " + message)
|
|
||||||
|
|
||||||
|
|
||||||
async def _handle_connection(websocket, path):
|
|
||||||
print("websocket: client connected")
|
|
||||||
|
|
||||||
async for message in websocket:
|
|
||||||
try:
|
|
||||||
await _handle_message(websocket, json.loads(message))
|
|
||||||
except ValueError:
|
|
||||||
print("websocket: malformed json received")
|
|
||||||
|
|
||||||
|
|
||||||
async def _run(host: str, port: int):
|
|
||||||
print(f"websocket: server started on ws://{host}:{port}")
|
|
||||||
|
|
||||||
async with serve(_handle_connection, host, port, ping_interval=None):
|
|
||||||
await asyncio.Future() # run forever
|
|
||||||
|
|
||||||
|
|
||||||
def _run_server(listen_address: str, port: int):
|
|
||||||
asyncio.run(_run(host=listen_address, port=port))
|
|
||||||
|
|
||||||
|
|
||||||
def start_websocket_server(listen_address: str, port: int):
|
|
||||||
Thread(target=_run_server, args=[listen_address, port], daemon=True).start()
|
|
|
@ -11,9 +11,6 @@ os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
|
||||||
from utils import *
|
from utils import *
|
||||||
from webui import *
|
from webui import *
|
||||||
|
|
||||||
from api.websocket_server import start_websocket_server
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
args = setup_args()
|
args = setup_args()
|
||||||
|
|
||||||
|
@ -26,9 +23,6 @@ if __name__ == "__main__":
|
||||||
if not args.defer_tts_load:
|
if not args.defer_tts_load:
|
||||||
tts = load_tts()
|
tts = load_tts()
|
||||||
|
|
||||||
if args.websocket_enabled:
|
|
||||||
start_websocket_server(args.websocket_listen_address, args.websocket_listen_port)
|
|
||||||
|
|
||||||
webui.block_thread()
|
webui.block_thread()
|
||||||
elif __name__ == "main":
|
elif __name__ == "main":
|
||||||
from fastapi import FastAPI
|
from fastapi import FastAPI
|
||||||
|
@ -43,5 +37,4 @@ elif __name__ == "main":
|
||||||
app = gr.mount_gradio_app(app, webui, path=args.listen_path)
|
app = gr.mount_gradio_app(app, webui, path=args.listen_path)
|
||||||
|
|
||||||
if not args.defer_tts_load:
|
if not args.defer_tts_load:
|
||||||
tts = load_tts()
|
tts = load_tts()
|
||||||
|
|
7827
src/utils.py
7827
src/utils.py
File diff suppressed because it is too large
Load Diff
1953
src/webui.py
1953
src/webui.py
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user