Compare commits


No commits in common. "5f80ee9b38aec72595c9cda1c4816a70f224943e" and "350d2d5a95dec8633cd11fd605633dddd5de24e1" have entirely different histories.

8 changed files with 4806 additions and 5139 deletions

View File

@@ -1,9 +1,9 @@
# AI Voice Cloning
> **Note** This project has been in dire need of being rewritten from the ground up for some time. Apologies for any crust from my rather spaghetti code.
This [repo](https://git.ecker.tech/mrq/ai-voice-cloning)/[rentry](https://rentry.org/AI-Voice-Cloning/) aims to serve both as a foolproof guide for setting up AI voice cloning tools for legitimate, local use on Windows/Linux, and as a stepping stone for anons that genuinely want to play around with [TorToiSe](https://github.com/neonbjb/tortoise-tts).
As with my own findings for Stable Diffusion image generation, this rentry may appear a little disheveled as I note new findings with TorToiSe. Please keep this in mind if the guide seems to shift a bit or sound confusing.
>\>Ugh... why bother when I can just abuse 11.AI?
You're more than welcome to, but TorToiSe is shaping up to be a very promising tool, especially with finetuning now on the horizon.

View File

@@ -1,106 +1,13 @@
dataset:
  training: [
    "./training/${voice}/valle/",
  ]
  noise: [
    "./training/valle/data/Other/noise/",
  ]
  speaker_name_getter: "lambda p: p.parts[-3]" # "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
  use_hdf5: False
  hdf5_name: data.h5
  hdf5_flag: r
  validate: True
data_dirs: [./training/${voice}/valle/]
spkr_name_getter: "lambda p: p.parts[-3]" # "lambda p: p.parts[-1].split('-')[0]"
  workers: 4
  cache: False
max_phones: 72
  phones_range: [4, 64]
  duration_range: [1.0, 8.0]
models: '${models}'
batch_size: ${batch_size}
gradient_accumulation_steps: ${gradient_accumulation_size}
eval_batch_size: ${batch_size}
  random_utterance: 1.0
  max_prompts: 3
  prompt_duration: 3.0
  sample_type: path
  tasks_list: ["tts"] # ["tts", "ns", "sr", "tse", "cse", "nse", "tts"]

models:
  _max_levels: 8
  _models:
    - name: "ar"
      size: "full"
      resp_levels: 1
      prom_levels: 2
      tasks: 8
      arch_type: "retnet"
    - name: "nar"
      size: "full"
      resp_levels: 3
      prom_levels: 4
      tasks: 8
      arch_type: "retnet"

hyperparameters:
  batch_size: ${batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_size}
  gradient_clipping: 100
  optimizer: AdamW
  learning_rate: 1.0e-4
  scheduler_type: ""

evaluation:
  batch_size: ${batch_size}
  frequency: ${validation_rate}
  size: 16
  steps: 300
  ar_temperature: 0.95
  nar_temperature: 0.25

trainer:
  iterations: ${iterations}
  save_tag: step
  save_on_oom: True
  save_on_quit: True
  export_on_save: True
  export_on_quit: True
  save_frequency: ${save_rate}
  keep_last_checkpoints: 4
  aggressive_optimizations: False
  load_state_dict: True
  #strict_loading: False
  #load_tag: "9500"
  #load_states: False
  #restart_step_count: True
  gc_mode: None # "global_step"
  weight_dtype: bfloat16
  backend: deepspeed
  deepspeed:
    zero_optimization_level: 2
    use_compression_training: True

inference:
  use_vocos: True
  normalize: False
  weight_dtype: float32

bitsandbytes:
  enabled: False
  injects: True
  linear: True
  embedding: True
max_iter: ${iterations}
save_ckpt_every: ${save_rate}
eval_every: ${validation_rate}
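The `${...}` tokens in this template are not native YAML; the web UI substitutes them with the values picked in the training tab before the file is handed to the trainer. A minimal sketch of that kind of substitution, assuming plain `string.Template` replacement (the `render_training_config` helper, template path, and settings values here are illustrative, not the repo's actual API):

```python
# Sketch: fill the ${...} placeholders in the config template.
# The helper name, template path, and settings values are assumptions.
from string import Template

TEMPLATE_PATH = "./models/.template.valle.yaml"  # assumed location

def render_training_config(settings: dict) -> str:
    with open(TEMPLATE_PATH, "r", encoding="utf-8") as f:
        template = Template(f.read())
    # safe_substitute leaves any unknown ${...} token (e.g. '${models}') untouched
    return template.safe_substitute(settings)

config_yaml = render_training_config({
    "voice": "example_voice",
    "batch_size": 8,
    "gradient_accumulation_size": 4,
    "iterations": 5000,
    "save_rate": 500,
    "validation_rate": 500,
})
```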

@@ -1 +1 @@
Subproject commit b10c58436d6871c26485d30b203e6cfdd4167602
Subproject commit 5ff00bf3bfa97e2c8e9f166b920273f83ac9d8f0

View File

@@ -7,5 +7,4 @@ music-tag
voicefixer
psutil
phonemizer
pydantic==1.10.11
websockets
pydantic==1.10.11

View File

@@ -1,84 +0,0 @@
import asyncio
import json
from threading import Thread

from websockets.server import serve

from utils import generate, get_autoregressive_models, get_voice_list, args, update_autoregressive_model, update_diffusion_model, update_tokenizer

# this is a not-so-nice workaround to set values to None if their string value is "None"
def replaceNoneStringWithNone(message):
    ignore_fields = ['text'] # list of fields which CAN have "None" as a literal string value
    for member in message:
        if message[member] == 'None' and member not in ignore_fields:
            message[member] = None

    return message

async def _handle_generate(websocket, message):
    # update args parameters which control the model settings
    if message.get('autoregressive_model'):
        update_autoregressive_model(message['autoregressive_model'])

    if message.get('diffusion_model'):
        update_diffusion_model(message['diffusion_model'])

    if message.get('tokenizer_json'):
        update_tokenizer(message['tokenizer_json'])

    if message.get('sample_batch_size'):
        global args
        args.sample_batch_size = message['sample_batch_size']

    message['result'] = generate(**message)
    await websocket.send(json.dumps(replaceNoneStringWithNone(message)))

async def _handle_get_autoregressive_models(websocket, message):
    message['result'] = get_autoregressive_models()
    await websocket.send(json.dumps(replaceNoneStringWithNone(message)))

async def _handle_get_voice_list(websocket, message):
    message['result'] = get_voice_list()
    await websocket.send(json.dumps(replaceNoneStringWithNone(message)))

async def _handle_message(websocket, message):
    message = replaceNoneStringWithNone(message)

    if message.get('action') == 'generate':
        await _handle_generate(websocket, message)
    elif message.get('action') == 'get_voices':
        await _handle_get_voice_list(websocket, message)
    elif message.get('action') == 'get_autoregressive_models':
        await _handle_get_autoregressive_models(websocket, message)
    else:
        print(f"websocket: unhandled message: {message}")

async def _handle_connection(websocket, path):
    print("websocket: client connected")

    async for message in websocket:
        try:
            await _handle_message(websocket, json.loads(message))
        except ValueError:
            print("websocket: malformed json received")

async def _run(host: str, port: int):
    print(f"websocket: server started on ws://{host}:{port}")

    async with serve(_handle_connection, host, port, ping_interval=None):
        await asyncio.Future() # run forever

def _run_server(listen_address: str, port: int):
    asyncio.run(_run(host=listen_address, port=port))

def start_websocket_server(listen_address: str, port: int):
    Thread(target=_run_server, args=[listen_address, port], daemon=True).start()
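The server above speaks one JSON object per WebSocket message, dispatches on its `action` field, and echoes the message back with a `result` key filled in. A minimal client sketch against that protocol, assuming the server is reachable locally (the address and port are placeholders, not the repo's defaults):

```python
# Sketch: query the websocket server above for its voice list.
# The ws:// address and port are assumptions for illustration.
import asyncio
import json

import websockets

async def main():
    async with websockets.connect("ws://127.0.0.1:8069") as ws:
        await ws.send(json.dumps({"action": "get_voices"}))
        # the server echoes the request back with 'result' set
        reply = json.loads(await ws.recv())
        print("voices:", reply["result"])

asyncio.run(main())
```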

View File

@@ -11,9 +11,6 @@ os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
from utils import *
from webui import *

from api.websocket_server import start_websocket_server

if __name__ == "__main__":
    args = setup_args()
@@ -26,9 +23,6 @@ if __name__ == "__main__":
    if not args.defer_tts_load:
        tts = load_tts()

    if args.websocket_enabled:
        start_websocket_server(args.websocket_listen_address, args.websocket_listen_port)

    webui.block_thread()
elif __name__ == "main":
    from fastapi import FastAPI
@@ -43,5 +37,4 @@ elif __name__ == "main":
    app = gr.mount_gradio_app(app, webui, path=args.listen_path)

    if not args.defer_tts_load:
        tts = load_tts()
        tts = load_tts()
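The `elif __name__ == "main":` branch is what runs when the module is imported by an ASGI server rather than executed directly: uvicorn loads `main:app`, so `__name__` is `"main"` rather than `"__main__"`, and the FastAPI `app` with the mounted Gradio UI gets defined at import time. A sketch of launching it that way (host and port are assumptions; `main:app` matches the module and attribute in the code above):

```python
# Sketch: serve the FastAPI app with the mounted Gradio UI via uvicorn.
# Host and port are assumptions for illustration.
import uvicorn

uvicorn.run("main:app", host="127.0.0.1", port=8000)
```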

File diff suppressed because it is too large

File diff suppressed because it is too large