19 changed files with 4217 additions and 5330 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -1,4 +0,0 @@
-/models
-/training
-/voices
-/bin
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,37 +0,0 @@
-FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
-
-ARG DEBIAN_FRONTEND=noninteractive
-ARG TZ=UTC
-ARG MINICONDA_VERSION=23.1.0-1
-ARG PYTHON_VERSION=3.9.13
-
-RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
-RUN apt-get update
-RUN apt install -y curl wget git ffmpeg
-RUN adduser --disabled-password --gecos '' --shell /bin/bash user
-USER user
-ENV HOME=/home/user
-WORKDIR $HOME
-RUN mkdir $HOME/.cache $HOME/.config && chmod -R 777 $HOME
-RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py39_$MINICONDA_VERSION-Linux-x86_64.sh
-RUN chmod +x Miniconda3-py39_$MINICONDA_VERSION-Linux-x86_64.sh
-RUN ./Miniconda3-py39_$MINICONDA_VERSION-Linux-x86_64.sh -b -p /home/user/miniconda
-ENV PATH="$HOME/miniconda/bin:$PATH"
-RUN conda init
-RUN conda install python=$PYTHON_VERSION
-RUN python3 -m pip install --upgrade pip
-RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
-
-RUN mkdir $HOME/ai-voice-cloning
-WORKDIR $HOME/ai-voice-cloning
-COPY --chown=user:user modules modules
-
-RUN python3 -m pip install -r ./modules/tortoise-tts/requirements.txt
-RUN python3 -m pip install -e ./modules/tortoise-tts/
-RUN python3 -m pip install -r ./modules/dlas/requirements.txt
-RUN python3 -m pip install -e ./modules/dlas/
-ADD requirements.txt requirements.txt
-RUN python3 -m pip install -r ./requirements.txt
-ADD --chown=user:user . $HOME/ai-voice-cloning
-
-CMD ["python", "./src/main.py", "--listen", "0.0.0.0:7680"]
--- a/README.md
+++ b/README.md
@@ -1,9 +1,9 @@
 # AI Voice Cloning

-> **Note** This project is effectively abandonware due to requiring a rewrite. Please use [JarodMica/ai-voice-cloning](https://github.com/JarodMica/ai-voice-cloning).
-
 This [repo](https://git.ecker.tech/mrq/ai-voice-cloning)/[rentry](https://rentry.org/AI-Voice-Cloning/) aims to serve as both a foolproof guide for setting up AI voice cloning tools for legitimate, local use on Windows/Linux, as well as a stepping stone for anons that genuinely want to play around with [TorToiSe](https://github.com/neonbjb/tortoise-tts).

+Similar to my own findings for Stable Diffusion image generation, this rentry may appear a little disheveled as I note my new findings with TorToiSe. Please keep this in mind if the guide seems to shift a bit or sound confusing.
+
 >\>Ugh... why bother when I can just abuse 11.AI?

 You're more than welcome to, but TorToiSe is shaping up to be a very promising tool, especially with finetuning now on the horizon.
@@ -16,4 +16,4 @@ Please consult [the wiki](https://git.ecker.tech/mrq/ai-voice-cloning/wiki) for

 ## Bug Reporting

-If you run into any problems, please refer to the [issues you may encounter](https://git.ecker.tech/mrq/ai-voice-cloning/wiki/Issues) wiki page first.
+If you run into any problems, please refer to the [issues you may encounter](https://git.ecker.tech/mrq/ai-voice-cloning/wiki/Issues) wiki page first.
--- a/models/.template.valle.yaml
+++ b/models/.template.valle.yaml
@@ -1,106 +1,13 @@
-dataset:
-  training: [
-    "./training/${voice}/valle/",
-  ]
-  noise: [
-    "./training/valle/data/Other/noise/",
-  ]
-  
-  speaker_name_getter: "lambda p: p.parts[-3]" # "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
-  
-  use_hdf5: False
-  hdf5_name: data.h5
-  hdf5_flag: r
-  validate: True
+data_dirs: [./training/${voice}/valle/]
+spkr_name_getter: "lambda p: p.parts[-3]" # "lambda p: p.parts[-1].split('-')[0]"

-  workers: 4
-  cache: False
+max_phones: 72

-  phones_range: [4, 64]
-  duration_range: [1.0, 8.0]
+models: '${models}'
+batch_size: ${batch_size}
+gradient_accumulation_steps: ${gradient_accumulation_size}
+eval_batch_size: ${batch_size}

-  random_utterance: 1.0
-  max_prompts: 3
-  prompt_duration: 3.0
-
-  sample_type: path
-
-  tasks_list: ["tts"] # ["tts", "ns", "sr", "tse", "cse", "nse", "tts"]
-
-models:
-  _max_levels: 8
-  _models:
-  - name: "ar"
-    size: "full"
-    resp_levels: 1
-    prom_levels: 2
-    tasks: 8
-    arch_type: "retnet"
-
-  - name: "nar"
-    size: "full"
-    resp_levels: 3
-    prom_levels: 4
-    tasks: 8
-    arch_type: "retnet"
-
-
-hyperparameters:
-  batch_size: ${batch_size}
-  gradient_accumulation_steps: ${gradient_accumulation_size}
-  gradient_clipping: 100
-  
-  optimizer: AdamW
-  learning_rate: 1.0e-4
-  
-  scheduler_type: ""
-
-evaluation:
-  batch_size: ${batch_size}
-  frequency: ${validation_rate}
-  size: 16
-  
-  steps: 300
-  ar_temperature: 0.95
-  nar_temperature: 0.25
-
-trainer:
-  iterations: ${iterations}
-  
-  save_tag: step
-  save_on_oom: True
-  save_on_quit: True
-  export_on_save: True
-  export_on_quit: True
-  save_frequency: ${save_rate}
-
-  keep_last_checkpoints: 4
-
-  aggressive_optimizations: False
-
-  load_state_dict: True
-  #strict_loading: False
-  #load_tag: "9500"
-  #load_states: False
-  #restart_step_count: True
-  
-  gc_mode: None # "global_step"
-
-  weight_dtype: bfloat16
-
-  backend: deepspeed
-  deepspeed:
-    zero_optimization_level: 2
-    use_compression_training: True
-
-inference:
-  use_vocos: True
-  normalize: False
-
-  weight_dtype: float32
-
-bitsandbytes:
-  enabled: False
-  injects: True
-  linear: True
-  embedding: True
+max_iter: ${iterations}
+save_ckpt_every: ${save_rate}
+eval_every: ${validation_rate}
--- a/modules/tortoise-tts
+++ b/modules/tortoise-tts
@@ -1 +1 @@
-Subproject commit bf3b6c87aa825295f64a31d010fd5e896fbcda43
+Subproject commit 0bcdf81d0444218b4dedaefa5c546d42f36b8130
--- a/notebook_colab.ipynb
+++ b/notebook_colab.ipynb
@@ -38,24 +38,10 @@
            
         ],
         "source":[
-            "!apt install python3.10-venv\n",
+            "!apt install python3.8-venv\n",
            "!git clone https://git.ecker.tech/mrq/ai-voice-cloning/\n",
            "%cd /content/ai-voice-cloning\n",
-            "# get local dependencies\n",
-            "!git submodule init\n",
-            "!git submodule update --remote\n",
-            "# setup venv\n",
-            "!python3 -m venv venv\n",
-            "!source ./venv/bin/activate\n",
-            "!python3 -m pip install --upgrade pip # just to be safe\n",
-            "# CUDA\n",
-            "!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
-            "# install requirements\n",
-            "!python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements\n",
-            "!python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe\n",
-            "!python3 -m pip install -r ./modules/dlas/requirements.txt # instal DLAS requirements, last, because whisperx will break a dependency here\n",
-            "!python3 -m pip install -e ./modules/dlas/ # install DLAS\n",
-            "!python3 -m pip install -r ./requirements.txt # install local requirements"
+            "!./setup-cuda.sh"
         ]
      },
      {
@@ -129,8 +115,7 @@
         "cell_type":"code",
         "source":[
            "%cd /content/ai-voice-cloning/\n",
-            "!source ./venv/bin/activate\n",
-            "!python3 ./src/main.py --share"
+            "!./start.sh --share"
         ],
         "metadata":{
            "id":"QRA8jF3cF-YJ"
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,17 +1,9 @@
--extra-index-url https://download.pytorch.org/whl/cu118
-torch>=2.1.0
-torchvision
-torchaudio
+git+https://github.com/openai/whisper.git

-openai-whisper
 more-itertools
 ffmpeg-python
-gradio<=3.23.0
+gradio
 music-tag
 voicefixer
 psutil
 phonemizer
-pydantic==1.10.11
-websockets
-beartype==0.15.0
-pykakasi
--- a/setup-cuda.bat
+++ b/setup-cuda.bat
@@ -4,7 +4,7 @@ git submodule update --remote
 python -m venv venv
 call .\venv\Scripts\activate.bat
 python -m pip install --upgrade pip
-python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
 python -m pip install -r .\modules\tortoise-tts\requirements.txt
 python -m pip install -e .\modules\tortoise-tts\
 python -m pip install -r .\modules\dlas\requirements.txt
--- a/setup-cuda.sh
+++ b/setup-cuda.sh
@@ -7,7 +7,7 @@ python3 -m venv venv
 source ./venv/bin/activate
 python3 -m pip install --upgrade pip # just to be safe
 # CUDA
-pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
 # install requirements
 python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements
 python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe
--- a/setup-directml.bat
+++ b/setup-directml.bat
@@ -4,7 +4,7 @@ git submodule update --remote
 python -m venv venv
 call .\venv\Scripts\activate.bat
 python -m pip install --upgrade pip
-python -m pip install torch torchvision torchaudio torch-directml
+python -m pip install torch==1.13.1 torchvision torchaudio torch-directml
 python -m pip install -r .\modules\tortoise-tts\requirements.txt
 python -m pip install -e .\modules\tortoise-tts\
 python -m pip install -r .\modules\dlas\requirements.txt
--- a/setup-docker.sh
+++ b/setup-docker.sh
@@ -1,4 +0,0 @@
-#!/bin/bash
-git submodule init
-git submodule update --remote
-docker build -t ai-voice-cloning .
--- a/setup-rocm.sh
+++ b/setup-rocm.sh
@@ -7,7 +7,7 @@ python3 -m venv venv
 source ./venv/bin/activate
 python3 -m pip install --upgrade pip # just to be safe
 # ROCM
-pip3 install torch==1.13.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.2 # 5.4.2 doesn't work for me desu
+pip3 install torch==1.13.1 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.2 # 5.4.2 doesn't work for me desu
 # install requirements
 python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements
 python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe
--- a/src/api/websocket_server.py
+++ b/src/api/websocket_server.py
@@ -1,84 +0,0 @@
-import asyncio
-import json
-from threading import Thread
-
-from websockets.server import serve
-
-from utils import generate, get_autoregressive_models, get_voice_list, args, update_autoregressive_model, update_diffusion_model, update_tokenizer
-
-# this is a not so nice workaround to set values to None if their string value is "None"
-def replaceNoneStringWithNone(message):
-    ignore_fields = ['text']  # list of fields which CAN have "None" as literal String value
-
-    for member in message:
-        if message[member] == 'None' and member not in ignore_fields:
-            message[member] = None
-
-    return message
-
-
-async def _handle_generate(websocket, message):
-    # update args parameters which control the model settings
-    if message.get('autoregressive_model'):
-        update_autoregressive_model(message['autoregressive_model'])
-
-    if message.get('diffusion_model'):
-        update_diffusion_model(message['diffusion_model'])
-
-    if message.get('tokenizer_json'):
-        update_tokenizer(message['tokenizer_json'])
-
-    if message.get('sample_batch_size'):
-        global args
-        args.sample_batch_size = message['sample_batch_size']
-
-    message['result'] = generate(**message)
-    await websocket.send(json.dumps(replaceNoneStringWithNone(message)))
-
-
-async def _handle_get_autoregressive_models(websocket, message):
-    message['result'] = get_autoregressive_models()
-    await websocket.send(json.dumps(replaceNoneStringWithNone(message)))
-
-
-async def _handle_get_voice_list(websocket, message):
-    message['result'] = get_voice_list()
-    await websocket.send(json.dumps(replaceNoneStringWithNone(message)))
-
-
-async def _handle_message(websocket, message):
-    message = replaceNoneStringWithNone(message)
-
-    if message.get('action') and message['action'] == 'generate':
-        await _handle_generate(websocket, message)
-    elif message.get('action') and message['action'] == 'get_voices':
-        await _handle_get_voice_list(websocket, message)
-    elif message.get('action') and message['action'] == 'get_autoregressive_models':
-        await _handle_get_autoregressive_models(websocket, message)
-    else:
-        print("websocket: undhandled message: " + message)
-
-
-async def _handle_connection(websocket, path):
-    print("websocket: client connected")
-
-    async for message in websocket:
-        try:
-            await _handle_message(websocket, json.loads(message))
-        except ValueError:
-            print("websocket: malformed json received")
-
-
-async def _run(host: str, port: int):
-    print(f"websocket: server started on ws://{host}:{port}")
-
-    async with serve(_handle_connection, host, port, ping_interval=None):
-        await asyncio.Future()  # run forever
-
-
-def _run_server(listen_address: str, port: int):
-    asyncio.run(_run(host=listen_address, port=port))
-
-
-def start_websocket_server(listen_address: str, port: int):
-    Thread(target=_run_server, args=[listen_address, port], daemon=True).start()
--- a/src/cli.py
+++ b/src/cli.py
@@ -1,66 +0,0 @@
-import os
-import argparse
-
-if 'TORTOISE_MODELS_DIR' not in os.environ:
-	os.environ['TORTOISE_MODELS_DIR'] = os.path.realpath(os.path.join(os.getcwd(), './models/tortoise/'))
-
-if 'TRANSFORMERS_CACHE' not in os.environ:
-	os.environ['TRANSFORMERS_CACHE'] = os.path.realpath(os.path.join(os.getcwd(), './models/transformers/'))
-
-os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
-
-from utils import *
-
-if __name__ == "__main__":
-	args = setup_args(cli=True)
-
-	default_arguments = import_generate_settings()
-	parser = argparse.ArgumentParser(allow_abbrev=False)
-	parser.add_argument("--text", default=default_arguments['text'])
-	parser.add_argument("--delimiter", default=default_arguments['delimiter'])
-	parser.add_argument("--emotion", default=default_arguments['emotion'])
-	parser.add_argument("--prompt", default=default_arguments['prompt'])
-	parser.add_argument("--voice", default=default_arguments['voice'])
-	parser.add_argument("--mic_audio", default=default_arguments['mic_audio'])
-	parser.add_argument("--voice_latents_chunks", default=default_arguments['voice_latents_chunks'])
-	parser.add_argument("--candidates", default=default_arguments['candidates'])
-	parser.add_argument("--seed", default=default_arguments['seed'])
-	parser.add_argument("--num_autoregressive_samples", default=default_arguments['num_autoregressive_samples'])
-	parser.add_argument("--diffusion_iterations", default=default_arguments['diffusion_iterations'])
-	parser.add_argument("--temperature", default=default_arguments['temperature'])
-	parser.add_argument("--diffusion_sampler", default=default_arguments['diffusion_sampler'])
-	parser.add_argument("--breathing_room", default=default_arguments['breathing_room'])
-	parser.add_argument("--cvvp_weight", default=default_arguments['cvvp_weight'])
-	parser.add_argument("--top_p", default=default_arguments['top_p'])
-	parser.add_argument("--diffusion_temperature", default=default_arguments['diffusion_temperature'])
-	parser.add_argument("--length_penalty", default=default_arguments['length_penalty'])
-	parser.add_argument("--repetition_penalty", default=default_arguments['repetition_penalty'])
-	parser.add_argument("--cond_free_k", default=default_arguments['cond_free_k'])
-
-	args, unknown = parser.parse_known_args()
-	kwargs = {
-		'text': args.text,
-		'delimiter': args.delimiter,
-		'emotion': args.emotion,
-		'prompt': args.prompt,
-		'voice': args.voice,
-		'mic_audio': args.mic_audio,
-		'voice_latents_chunks': args.voice_latents_chunks,
-		'candidates': args.candidates,
-		'seed': args.seed,
-		'num_autoregressive_samples': args.num_autoregressive_samples,
-		'diffusion_iterations': args.diffusion_iterations,
-		'temperature': args.temperature,
-		'diffusion_sampler': args.diffusion_sampler,
-		'breathing_room': args.breathing_room,
-		'cvvp_weight': args.cvvp_weight,
-		'top_p': args.top_p,
-		'diffusion_temperature': args.diffusion_temperature,
-		'length_penalty': args.length_penalty,
-		'repetition_penalty': args.repetition_penalty,
-		'cond_free_k': args.cond_free_k,
-		'experimentals': default_arguments['experimentals'],
-	}
-
-	tts = load_tts()
-	generate(**kwargs)
--- a/src/main.py
+++ b/src/main.py
@@ -11,9 +11,6 @@ os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
 from utils import *
 from webui import *

-from api.websocket_server import start_websocket_server
-
-
 if __name__ == "__main__":
 	args = setup_args()

@@ -26,9 +23,6 @@ if __name__ == "__main__":
 		if not args.defer_tts_load:
 			tts = load_tts()

-		if args.websocket_enabled:
-			start_websocket_server(args.websocket_listen_address, args.websocket_listen_port)
-
 		webui.block_thread()
 elif __name__ == "main":
 	from fastapi import FastAPI
@@ -43,5 +37,4 @@ elif __name__ == "main":
 	app = gr.mount_gradio_app(app, webui, path=args.listen_path)

 	if not args.defer_tts_load:
-		tts = load_tts()
-
+		tts = load_tts()
--- a/src/utils.py
+++ b/src/utils.py
--- a/src/webui.py
+++ b/src/webui.py
--- a/start-docker.sh
+++ b/start-docker.sh
@@ -1,14 +0,0 @@
-#!/bin/bash
-CMD="python3 ./src/main.py $@"
-# CMD="bash"
-CPATH="/home/user/ai-voice-cloning"
-docker run --rm --gpus all \
-    --mount "type=bind,src=$PWD/models,dst=$CPATH/models" \
-    --mount "type=bind,src=$PWD/training,dst=$CPATH/training" \
-    --mount "type=bind,src=$PWD/voices,dst=$CPATH/voices" \
-    --mount "type=bind,src=$PWD/bin,dst=$CPATH/bin" \
-    --workdir $CPATH \
-    --user "$(id -u):$(id -g)" \
-    --net host \
-    -it ai-voice-cloning $CMD
-
--- a/train-docker.sh
+++ b/train-docker.sh
@@ -1,15 +0,0 @@
-#!/bin/bash
-CMD="python3 ./src/train.py --yaml $1"
-# ipc host is one way to increase the shared memory for the container
-# more info here https://github.com/pytorch/pytorch#docker-image
-CPATH="/home/user/ai-voice-cloning"
-docker run --rm --gpus all \
-    --mount "type=bind,src=$PWD/models,dst=$CPATH/models" \
-    --mount "type=bind,src=$PWD/training,dst=$CPATH/training" \
-    --mount "type=bind,src=$PWD/voices,dst=$CPATH/voices" \
-    --mount "type=bind,src=$PWD/bin,dst=$CPATH/bin" \
-    --mount "type=bind,src=$PWD/src,dst=$CPATH/src" \
-    --workdir $CPATH \
-    --ipc host \
-    --user "$(id -u):$(id -g)" \
-    -it ai-voice-cloning $CMD