forked from mrq/ai-voice-cloning
Compare commits
No commits in common. "5f80ee9b38aec72595c9cda1c4816a70f224943e" and "350d2d5a95dec8633cd11fd605633dddd5de24e1" have entirely different histories.
5f80ee9b38
...
350d2d5a95
|
@ -1,9 +1,9 @@
|
|||
# AI Voice Cloning
|
||||
|
||||
> **Note** This project has been in dire need of being rewritten from the ground up for some time. Apologies for any crust from my rather spaghetti code.
|
||||
|
||||
This [repo](https://git.ecker.tech/mrq/ai-voice-cloning)/[rentry](https://rentry.org/AI-Voice-Cloning/) aims to serve as both a foolproof guide for setting up AI voice cloning tools for legitimate, local use on Windows/Linux, as well as a stepping stone for anons that genuinely want to play around with [TorToiSe](https://github.com/neonbjb/tortoise-tts).
|
||||
|
||||
Similar to my own findings for Stable Diffusion image generation, this rentry may appear a little disheveled as I note my new findings with TorToiSe. Please keep this in mind if the guide seems to shift a bit or sound confusing.
|
||||
|
||||
>\>Ugh... why bother when I can just abuse 11.AI?
|
||||
|
||||
You're more than welcome to, but TorToiSe is shaping up to be a very promising tool, especially with finetuning now on the horizon.
|
||||
|
|
|
@ -1,106 +1,13 @@
|
|||
dataset:
|
||||
training: [
|
||||
"./training/${voice}/valle/",
|
||||
]
|
||||
noise: [
|
||||
"./training/valle/data/Other/noise/",
|
||||
]
|
||||
|
||||
speaker_name_getter: "lambda p: p.parts[-3]" # "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
|
||||
|
||||
use_hdf5: False
|
||||
hdf5_name: data.h5
|
||||
hdf5_flag: r
|
||||
validate: True
|
||||
data_dirs: [./training/${voice}/valle/]
|
||||
spkr_name_getter: "lambda p: p.parts[-3]" # "lambda p: p.parts[-1].split('-')[0]"
|
||||
|
||||
workers: 4
|
||||
cache: False
|
||||
max_phones: 72
|
||||
|
||||
phones_range: [4, 64]
|
||||
duration_range: [1.0, 8.0]
|
||||
models: '${models}'
|
||||
batch_size: ${batch_size}
|
||||
gradient_accumulation_steps: ${gradient_accumulation_size}
|
||||
eval_batch_size: ${batch_size}
|
||||
|
||||
random_utterance: 1.0
|
||||
max_prompts: 3
|
||||
prompt_duration: 3.0
|
||||
|
||||
sample_type: path
|
||||
|
||||
tasks_list: ["tts"] # ["tts", "ns", "sr", "tse", "cse", "nse", "tts"]
|
||||
|
||||
models:
|
||||
_max_levels: 8
|
||||
_models:
|
||||
- name: "ar"
|
||||
size: "full"
|
||||
resp_levels: 1
|
||||
prom_levels: 2
|
||||
tasks: 8
|
||||
arch_type: "retnet"
|
||||
|
||||
- name: "nar"
|
||||
size: "full"
|
||||
resp_levels: 3
|
||||
prom_levels: 4
|
||||
tasks: 8
|
||||
arch_type: "retnet"
|
||||
|
||||
|
||||
hyperparameters:
|
||||
batch_size: ${batch_size}
|
||||
gradient_accumulation_steps: ${gradient_accumulation_size}
|
||||
gradient_clipping: 100
|
||||
|
||||
optimizer: AdamW
|
||||
learning_rate: 1.0e-4
|
||||
|
||||
scheduler_type: ""
|
||||
|
||||
evaluation:
|
||||
batch_size: ${batch_size}
|
||||
frequency: ${validation_rate}
|
||||
size: 16
|
||||
|
||||
steps: 300
|
||||
ar_temperature: 0.95
|
||||
nar_temperature: 0.25
|
||||
|
||||
trainer:
|
||||
iterations: ${iterations}
|
||||
|
||||
save_tag: step
|
||||
save_on_oom: True
|
||||
save_on_quit: True
|
||||
export_on_save: True
|
||||
export_on_quit: True
|
||||
save_frequency: ${save_rate}
|
||||
|
||||
keep_last_checkpoints: 4
|
||||
|
||||
aggressive_optimizations: False
|
||||
|
||||
load_state_dict: True
|
||||
#strict_loading: False
|
||||
#load_tag: "9500"
|
||||
#load_states: False
|
||||
#restart_step_count: True
|
||||
|
||||
gc_mode: None # "global_step"
|
||||
|
||||
weight_dtype: bfloat16
|
||||
|
||||
backend: deepspeed
|
||||
deepspeed:
|
||||
zero_optimization_level: 2
|
||||
use_compression_training: True
|
||||
|
||||
inference:
|
||||
use_vocos: True
|
||||
normalize: False
|
||||
|
||||
weight_dtype: float32
|
||||
|
||||
bitsandbytes:
|
||||
enabled: False
|
||||
injects: True
|
||||
linear: True
|
||||
embedding: True
|
||||
max_iter: ${iterations}
|
||||
save_ckpt_every: ${save_rate}
|
||||
eval_every: ${validation_rate}
|
|
@ -1 +1 @@
|
|||
Subproject commit b10c58436d6871c26485d30b203e6cfdd4167602
|
||||
Subproject commit 5ff00bf3bfa97e2c8e9f166b920273f83ac9d8f0
|
|
@ -7,5 +7,4 @@ music-tag
|
|||
voicefixer
|
||||
psutil
|
||||
phonemizer
|
||||
pydantic==1.10.11
|
||||
websockets
|
||||
pydantic==1.10.11
|
|
@ -1,84 +0,0 @@
|
|||
import asyncio
|
||||
import json
|
||||
from threading import Thread
|
||||
|
||||
from websockets.server import serve
|
||||
|
||||
from utils import generate, get_autoregressive_models, get_voice_list, args, update_autoregressive_model, update_diffusion_model, update_tokenizer
|
||||
|
||||
# Workaround: clients encode missing values as the literal string "None";
# convert those back to a real None before use.
def replaceNoneStringWithNone(message):
    """Mutate *message* in place, mapping 'None' strings to None.

    Fields named in ``keep_literal`` may legitimately carry the string
    'None' (e.g. the text to synthesize) and are left untouched.
    Returns the same dict so calls can be chained.
    """
    keep_literal = ['text']  # fields whose "None" is real payload

    for key in message:
        if key not in keep_literal and message[key] == 'None':
            message[key] = None

    return message
|
||||
|
||||
|
||||
async def _handle_generate(websocket, message):
    """Run TTS generation for one client request and reply on the socket.

    The request dict may carry optional model overrides, which are applied
    globally before generating.  The same dict, augmented with a 'result'
    key, is echoed back to the client as JSON.
    """
    # Optional per-request overrides; each setter runs only when the client
    # sent a truthy value for its key.
    overrides = (
        ('autoregressive_model', update_autoregressive_model),
        ('diffusion_model', update_diffusion_model),
        ('tokenizer_json', update_tokenizer),
    )
    for key, setter in overrides:
        value = message.get(key)
        if value:
            setter(value)

    if message.get('sample_batch_size'):
        global args
        args.sample_batch_size = message['sample_batch_size']

    # NOTE(review): generate() receives every key of *message*, including
    # 'action' — presumably it tolerates extra kwargs; verify in utils.
    message['result'] = generate(**message)
    await websocket.send(json.dumps(replaceNoneStringWithNone(message)))
|
||||
|
||||
|
||||
async def _handle_get_autoregressive_models(websocket, message):
    """Reply with the list of available autoregressive models."""
    message['result'] = get_autoregressive_models()
    payload = json.dumps(replaceNoneStringWithNone(message))
    await websocket.send(payload)
|
||||
|
||||
|
||||
async def _handle_get_voice_list(websocket, message):
    """Reply with the list of available voices."""
    message['result'] = get_voice_list()
    payload = json.dumps(replaceNoneStringWithNone(message))
    await websocket.send(payload)
|
||||
|
||||
|
||||
async def _handle_message(websocket, message):
    """Dispatch one decoded client message to its action handler.

    *message* is the dict parsed from a client frame; its 'action' key
    selects the handler.  Unknown actions are logged and ignored.

    Fix: the fallback branch previously did ``"..." + message`` — string
    concatenation with a dict — which raised TypeError the moment an
    unknown action arrived; it also misspelled "unhandled".
    """
    message = replaceNoneStringWithNone(message)

    # message.get('action') is None for absent keys, so a single equality
    # test per branch is equivalent to the old get()-and-compare pairs.
    action = message.get('action')
    if action == 'generate':
        await _handle_generate(websocket, message)
    elif action == 'get_voices':
        await _handle_get_voice_list(websocket, message)
    elif action == 'get_autoregressive_models':
        await _handle_get_autoregressive_models(websocket, message)
    else:
        # str() is required here: *message* is a dict, not a str.
        print("websocket: unhandled message: " + str(message))
|
||||
|
||||
|
||||
async def _handle_connection(websocket, path):
    """Per-client receive loop: parse each frame as JSON and dispatch it.

    Malformed frames are logged and skipped; the connection stays open.
    """
    print("websocket: client connected")

    async for raw in websocket:
        # json.JSONDecodeError subclasses ValueError, so bad JSON lands
        # in the except below without killing the connection.
        try:
            await _handle_message(websocket, json.loads(raw))
        except ValueError:
            print("websocket: malformed json received")
|
||||
|
||||
|
||||
async def _run(host: str, port: int):
    """Serve websocket connections on host:port until cancelled."""
    print(f"websocket: server started on ws://{host}:{port}")

    # ping_interval=None disables keepalive pings so long-running
    # generations aren't cut off by ping timeouts.
    async with serve(_handle_connection, host, port, ping_interval=None):
        # Block forever; only task cancellation ends the server.
        await asyncio.Future()
|
||||
|
||||
|
||||
def _run_server(listen_address: str, port: int):
    """Blocking entry point: run the server's asyncio loop to completion."""
    asyncio.run(_run(host=listen_address, port=port))
|
||||
|
||||
|
||||
def start_websocket_server(listen_address: str, port: int):
    """Launch the websocket server on a daemon background thread.

    Daemonized so the server never blocks interpreter shutdown.
    """
    worker = Thread(
        target=_run_server,
        args=[listen_address, port],
        daemon=True,
    )
    worker.start()
|
|
@ -11,9 +11,6 @@ os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
|
|||
from utils import *
|
||||
from webui import *
|
||||
|
||||
from api.websocket_server import start_websocket_server
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = setup_args()
|
||||
|
||||
|
@ -26,9 +23,6 @@ if __name__ == "__main__":
|
|||
if not args.defer_tts_load:
|
||||
tts = load_tts()
|
||||
|
||||
if args.websocket_enabled:
|
||||
start_websocket_server(args.websocket_listen_address, args.websocket_listen_port)
|
||||
|
||||
webui.block_thread()
|
||||
elif __name__ == "main":
|
||||
from fastapi import FastAPI
|
||||
|
@ -43,5 +37,4 @@ elif __name__ == "main":
|
|||
app = gr.mount_gradio_app(app, webui, path=args.listen_path)
|
||||
|
||||
if not args.defer_tts_load:
|
||||
tts = load_tts()
|
||||
|
||||
tts = load_tts()
|
7777
src/utils.py
7777
src/utils.py
File diff suppressed because it is too large
Load Diff
1953
src/webui.py
1953
src/webui.py
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user