13 changed files with 4845 additions and 5189 deletions
--- a/2
+++ b/2
@ -20,7 +20,7 @@ ENV PATH="$HOME/miniconda/bin:$PATH"
 RUN conda init
 RUN conda install python=$PYTHON_VERSION
 RUN python3 -m pip install --upgrade pip
-RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+RUN pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
 RUN mkdir $HOME/ai-voice-cloning
 WORKDIR $HOME/ai-voice-cloning
--- a/README.md
+++ b/README.md
@ -1,9 +1,9 @@
 # AI Voice Cloning
 > **Note** This project is effectively abandonware due to requiring a rewrite. Please use [JarodMica/ai-voice-cloning](https://github.com/JarodMica/ai-voice-cloning).
 This [repo](https://git.ecker.tech/mrq/ai-voice-cloning)/[rentry](https://rentry.org/AI-Voice-Cloning/) aims to serve as both a foolproof guide for setting up AI voice cloning tools for legitimate, local use on Windows/Linux, as well as a stepping stone for anons that genuinely want to play around with [TorToiSe](https://github.com/neonbjb/tortoise-tts).
 Similar to my own findings for Stable Diffusion image generation, this rentry may appear a little disheveled as I note my new findings with TorToiSe. Please keep this in mind if the guide seems to shift a bit or sound confusing.
 >\>Ugh... why bother when I can just abuse 11.AI?
 You're more than welcome to, but TorToiSe is shaping up to be a very promising tool, especially with finetuning now on the horizon.
@ -16,4 +16,4 @@ Please consult [the wiki](https://git.ecker.tech/mrq/ai-voice-cloning/wiki) for
 ## Bug Reporting
-If you run into any problems, please refer to the [issues you may encounter](https://git.ecker.tech/mrq/ai-voice-cloning/wiki/Issues) wiki page first.
+If you run into any problems, please refer to the [issues you may encounter](https://git.ecker.tech/mrq/ai-voice-cloning/wiki/Issues) wiki page first.
--- a/models/.template.valle.yaml
+++ b/models/.template.valle.yaml
@ -1,106 +1,13 @@
-dataset:
+data_dirs: [./training/${voice}/valle/]
-  training: [
+spkr_name_getter: "lambda p: p.parts[-3]" # "lambda p: p.parts[-1].split('-')[0]"
    "./training/${voice}/valle/",
  ]
  noise: [
    "./training/valle/data/Other/noise/",
  ]
  speaker_name_getter: "lambda p: p.parts[-3]" # "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
  use_hdf5: False
  hdf5_name: data.h5
  hdf5_flag: r
  validate: True
-  workers: 4
+max_phones: 72
  cache: False
-  phones_range: [4, 64]
+models: '${models}'
-  duration_range: [1.0, 8.0]
+batch_size: ${batch_size}
 gradient_accumulation_steps: ${gradient_accumulation_size}
 eval_batch_size: ${batch_size}
-  random_utterance: 1.0
+max_iter: ${iterations}
-  max_prompts: 3
+save_ckpt_every: ${save_rate}
-  prompt_duration: 3.0
+eval_every: ${validation_rate}
  sample_type: path
  tasks_list: ["tts"] # ["tts", "ns", "sr", "tse", "cse", "nse", "tts"]
 models:
  _max_levels: 8
  _models:
  - name: "ar"
    size: "full"
    resp_levels: 1
    prom_levels: 2
    tasks: 8
    arch_type: "retnet"
  - name: "nar"
    size: "full"
    resp_levels: 3
    prom_levels: 4
    tasks: 8
    arch_type: "retnet"
 hyperparameters:
  batch_size: ${batch_size}
  gradient_accumulation_steps: ${gradient_accumulation_size}
  gradient_clipping: 100
  optimizer: AdamW
  learning_rate: 1.0e-4
  scheduler_type: ""
 evaluation:
  batch_size: ${batch_size}
  frequency: ${validation_rate}
  size: 16
  steps: 300
  ar_temperature: 0.95
  nar_temperature: 0.25
 trainer:
  iterations: ${iterations}
  save_tag: step
  save_on_oom: True
  save_on_quit: True
  export_on_save: True
  export_on_quit: True
  save_frequency: ${save_rate}
  keep_last_checkpoints: 4
  aggressive_optimizations: False
  load_state_dict: True
  #strict_loading: False
  #load_tag: "9500"
  #load_states: False
  #restart_step_count: True
  gc_mode: None # "global_step"
  weight_dtype: bfloat16
  backend: deepspeed
  deepspeed:
    zero_optimization_level: 2
    use_compression_training: True
 inference:
  use_vocos: True
  normalize: False
  weight_dtype: float32
 bitsandbytes:
  enabled: False
  injects: True
  linear: True
  embedding: True
--- a/modules/tortoise-tts
+++ b/modules/tortoise-tts
@ -1 +1 @@
-Subproject commit bf3b6c87aa825295f64a31d010fd5e896fbcda43
+Subproject commit 5ff00bf3bfa97e2c8e9f166b920273f83ac9d8f0
--- a/notebook_colab.ipynb
+++ b/notebook_colab.ipynb
@ -38,24 +38,10 @@
         ],
         "source":[
-            "!apt install python3.10-venv\n",
+            "!apt install python3.8-venv\n",
            "!git clone https://git.ecker.tech/mrq/ai-voice-cloning/\n",
            "%cd /content/ai-voice-cloning\n",
-            "# get local dependencies\n",
+            "!./setup-cuda.sh"
            "!git submodule init\n",
            "!git submodule update --remote\n",
            "# setup venv\n",
            "!python3 -m venv venv\n",
            "!source ./venv/bin/activate\n",
            "!python3 -m pip install --upgrade pip # just to be safe\n",
            "# CUDA\n",
            "!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
            "# install requirements\n",
            "!python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements\n",
            "!python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe\n",
            "!python3 -m pip install -r ./modules/dlas/requirements.txt # instal DLAS requirements, last, because whisperx will break a dependency here\n",
            "!python3 -m pip install -e ./modules/dlas/ # install DLAS\n",
            "!python3 -m pip install -r ./requirements.txt # install local requirements"
         ]
      },
      {
@ -129,8 +115,7 @@
         "cell_type":"code",
         "source":[
            "%cd /content/ai-voice-cloning/\n",
-            "!source ./venv/bin/activate\n",
+            "!./start.sh --share"
            "!python3 ./src/main.py --share"
         ],
         "metadata":{
            "id":"QRA8jF3cF-YJ"
--- a/requirements.txt
+++ b/requirements.txt
@ -1,9 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/cu118
+git+https://github.com/openai/whisper.git
 torch>=2.1.0
 torchvision
 torchaudio
 openai-whisper
 more-itertools
 ffmpeg-python
 gradio<=3.23.0
@ -11,7 +7,4 @@ music-tag
 voicefixer
 psutil
 phonemizer
-pydantic==1.10.11
+pydantic==1.10.11
 websockets
 beartype==0.15.0
 pykakasi
--- a/setup-cuda.bat
+++ b/setup-cuda.bat
@ -4,7 +4,7 @@ git submodule update --remote
 python -m venv venv
 call .\venv\Scripts\activate.bat
 python -m pip install --upgrade pip
-python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
 python -m pip install -r .\modules\tortoise-tts\requirements.txt
 python -m pip install -e .\modules\tortoise-tts\
 python -m pip install -r .\modules\dlas\requirements.txt
--- a/setup-cuda.sh
+++ b/setup-cuda.sh
@ -7,7 +7,7 @@ python3 -m venv venv
 source ./venv/bin/activate
 python3 -m pip install --upgrade pip # just to be safe
 # CUDA
-pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
 # install requirements
 python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements
 python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe
--- a/setup-rocm.sh
+++ b/setup-rocm.sh
@ -7,7 +7,7 @@ python3 -m venv venv
 source ./venv/bin/activate
 python3 -m pip install --upgrade pip # just to be safe
 # ROCM
-pip3 install torch==1.13.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.2 # 5.4.2 doesn't work for me desu
+pip3 install torch==1.13.1 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.2 # 5.4.2 doesn't work for me desu
 # install requirements
 python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements
 python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe
--- a/src/api/websocket_server.py
+++ b/src/api/websocket_server.py
@ -1,84 +0,0 @@
 import asyncio
 import json
 from threading import Thread
 from websockets.server import serve
 from utils import generate, get_autoregressive_models, get_voice_list, args, update_autoregressive_model, update_diffusion_model, update_tokenizer
 # Workaround: turn values whose string value is "None" into a real None.
 def replaceNoneStringWithNone(message):
    """Replace every "None" string value in ``message`` with ``None``, in place.
    Keys listed in ``literal_ok`` keep their value even when it is "None".
    Returns the same ``message`` object that was passed in.
    """
    literal_ok = ['text']  # fields which CAN hold "None" as a literal string value
    for key in message:
        is_none_string = message[key] == 'None'
        if is_none_string and key not in literal_ok:
            message[key] = None
    return message
 async def _handle_generate(websocket, message):
    """Apply the model settings carried by ``message``, run generate(), and reply.
    The reply is ``message`` itself with the generate() output stored under
    'result', passed through replaceNoneStringWithNone and serialized as JSON.
    """
    # each field, when present and truthy, is handed to the matching updater
    setting_updaters = (
        ('autoregressive_model', update_autoregressive_model),
        ('diffusion_model', update_diffusion_model),
        ('tokenizer_json', update_tokenizer),
    )
    for field, apply_setting in setting_updaters:
        if message.get(field):
            apply_setting(message[field])
    if message.get('sample_batch_size'):
        # attribute assignment on the imported args object; the name is not rebound
        args.sample_batch_size = message['sample_batch_size']
    # the whole message (including the fields above) is forwarded as keyword arguments
    message['result'] = generate(**message)
    reply = json.dumps(replaceNoneStringWithNone(message))
    await websocket.send(reply)
 async def _handle_get_autoregressive_models(websocket, message):
    """Reply with ``message`` extended by 'result' = get_autoregressive_models()."""
    message['result'] = get_autoregressive_models()
    reply = json.dumps(replaceNoneStringWithNone(message))
    await websocket.send(reply)
 async def _handle_get_voice_list(websocket, message):
    """Reply with ``message`` extended by 'result' = get_voice_list()."""
    message['result'] = get_voice_list()
    reply = json.dumps(replaceNoneStringWithNone(message))
    await websocket.send(reply)
 async def _handle_message(websocket, message):
    """Dispatch one decoded client message to the handler named by its 'action'.
    ``message`` is the already JSON-decoded payload. Recognised actions are
    'generate', 'get_voices' and 'get_autoregressive_models'; anything else is
    logged and ignored so that the connection stays open.
    """
    if not isinstance(message, dict):
        # valid JSON that is not an object (list, number, string) carries no 'action'
        print(f"websocket: unhandled message: {message}")
        return
    message = replaceNoneStringWithNone(message)
    action = message.get('action')
    if action == 'generate':
        await _handle_generate(websocket, message)
    elif action == 'get_voices':
        await _handle_get_voice_list(websocket, message)
    elif action == 'get_autoregressive_models':
        await _handle_get_autoregressive_models(websocket, message)
    else:
        # format instead of concatenating: str + dict raises TypeError
        print(f"websocket: unhandled message: {message}")
 async def _handle_connection(websocket, path):
    """Serve one client: decode each incoming frame as JSON and dispatch it.
    ``path`` is accepted but not used. A ValueError raised while decoding or
    handling a frame is logged as malformed JSON and the loop continues.
    """
    print("websocket: client connected")
    async for raw_frame in websocket:
        try:
            payload = json.loads(raw_frame)
            await _handle_message(websocket, payload)
        except ValueError:
            print("websocket: malformed json received")
 async def _run(host: str, port: int):
    """Serve websocket connections on ``host``:``port`` until cancelled."""
    print(f"websocket: server started on ws://{host}:{port}")
    server = serve(_handle_connection, host, port, ping_interval=None)
    async with server:
        # a future that is never resolved keeps the server context open
        never_done = asyncio.Future()
        await never_done
 def _run_server(listen_address: str, port: int):
    """Blocking entry point: run the websocket server in a new asyncio event loop."""
    server_coroutine = _run(host=listen_address, port=port)
    asyncio.run(server_coroutine)
 def start_websocket_server(listen_address: str, port: int):
    """Start the websocket server on a daemon thread and return immediately."""
    server_thread = Thread(
        target=_run_server,
        args=[listen_address, port],
        daemon=True,
    )
    server_thread.start()
--- a/src/main.py
+++ b/src/main.py
@ -11,9 +11,6 @@ os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
 from utils import *
 from webui import *
 from api.websocket_server import start_websocket_server
 if __name__ == "__main__":
 	args = setup_args()
@ -26,9 +23,6 @@ if __name__ == "__main__":
 		if not args.defer_tts_load:
 			tts = load_tts()
 		if args.websocket_enabled:
 			start_websocket_server(args.websocket_listen_address, args.websocket_listen_port)
 		webui.block_thread()
 elif __name__ == "main":
 	from fastapi import FastAPI
@ -43,5 +37,4 @@ elif __name__ == "main":
 	app = gr.mount_gradio_app(app, webui, path=args.listen_path)
 	if not args.defer_tts_load:
-		tts = load_tts()
+		tts = load_tts()
--- a/src/utils.py
+++ b/src/utils.py
--- a/src/webui.py
+++ b/src/webui.py
		`@ -1 +1 @@`
			`Subproject commit bf3b6c87aa825295f64a31d010fd5e896fbcda43`				`Subproject commit 5ff00bf3bfa97e2c8e9f166b920273f83ac9d8f0`