huge success

2023-02-23 06:24:54 +07:00 · 2023-02-23 06:24:54 +07:00 · 225dee22d4
parent aa96edde2f
commit 225dee22d4
9 changed files with 134 additions and 67 deletions
--- a/README.md
+++ b/README.md
@ -16,4 +16,19 @@ Please consult [the wiki](https://git.ecker.tech/mrq/ai-voice-cloning/wiki) for
 ## Bug Reporting
-If you run into any problems, please refer to the [issues you may encounter](https://git.ecker.tech/mrq/ai-voice-cloning/wiki/Issues) wiki page first. Please don't hesitate to submit an issue.
+If you run into any problems, please refer to the [issues you may encounter](https://git.ecker.tech/mrq/ai-voice-cloning/wiki/Issues) wiki page first. Please don't hesitate to submit an issue.
 ## Changelogs
 Below is a rather loose changelog, as I don't think I have a way to chronicle changes outside of commit messages:
 ### `2023.02.22`
 * greatly reduced VRAM consumption through the use of [TimDettmers/bitsandbytes](https://github.com/TimDettmers/bitsandbytes)
 * cleaned up section of code that handled parsing output from training script
 * added button to reconnect to the training script's output (sometimes skips a line to update, but it's better than nothing)
 * actually update submodules from the update script (somehow forgot to pass `--remote`)
 ### `Before 2023.02.22`
 Refer to commit logs.
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 6c284ef8ec4c4769de3181d90ac96ff63581ef55
+Subproject commit 0ef8ab6872813d1021d4d75e82b63377d28f5a06
--- a/models/.template.yaml
+++ b/models/.template.yaml
@ -2,7 +2,7 @@ name: ${name}
 model: extensibletrainer
 scale: 1
 gpu_ids: [0] # <-- unless you have multiple gpus, use this
-start_step: -1
+start_step: 0
 checkpointing_enabled: true  # <-- Gradient checkpointing. Enable for huge GPU memory savings. Disable for distributed training.
 fp16: ${float16} # might want to check this out
 wandb: false  # <-- enable to log to wandb. tensorboard logging is always enabled.
--- a/setup-cuda.bat
+++ b/setup-cuda.bat
@ -9,5 +9,8 @@ python -m pip install -r .\dlas\requirements.txt
 python -m pip install -r .\tortoise-tts\requirements.txt
 python -m pip install -r .\requirements.txt
 python -m pip install -e .\tortoise-tts\
 copy .\dlas\bitsandbytes_windows\* .\venv\Lib\site-packages\bitsandbytes\. /Y
 deactivate
 pause
--- a/src/train.py
+++ b/src/train.py
@ -1,8 +1,8 @@
 import torch
 import argparse
 import os
 import sys
 import argparse
 # this is some massive kludge that only works if it's called from a shell and not an import/PIP package
 # its smart-yet-irritating module-model loader breaks when trying to load something specifically when not from a shell
@ -19,6 +19,17 @@ sys.path.insert(0, './dlas/')
 # don't even really bother trying to get DLAS PIP'd
 # without kludge, it'll have to be accessible as `codes` and not `dlas`
 import torch_intermediary
 # could just move this auto-toggle into the MITM script
 try:
    import bitsandbytes as bnb
    torch_intermediary.OVERRIDE_ADAM = True
    torch_intermediary.OVERRIDE_ADAMW = True
 except Exception as e:
    torch_intermediary.OVERRIDE_ADAM = False
    torch_intermediary.OVERRIDE_ADAMW = False
 import torch
 from codes import train as tr
 from utils import util, options as option
--- a/src/utils.py
+++ b/src/utils.py
@ -17,6 +17,7 @@ import urllib.request
 import signal
 import gc
 import subprocess
 import yaml
 import tqdm
 import torch
@ -26,6 +27,7 @@ import gradio as gr
 import gradio.utils
 from datetime import datetime
 from datetime import timedelta
 from tortoise.api import TextToSpeech, MODELS, get_model_path
 from tortoise.utils.audio import load_audio, load_voice, load_voices, get_voice_dir
@ -42,7 +44,7 @@ tts_loading = False
 webui = None
 voicefixer = None
 whisper_model = None
-training_process = None
+training_state = None
 def generate(
@ -434,100 +436,128 @@ def compute_latents(voice, voice_latents_chunks, progress=gr.Progress(track_tqdm
 	return voice
-def run_training(config_path, verbose=False, buffer_size=8, progress=gr.Progress(track_tqdm=True)):
+# superfluous, but it cleans up some things
-	global training_process
+class TrainingState():
-	
+	def __init__(self, config_path, buffer_size=8):
-	# I don't know if this is still necessary, as it was bitching at me for not doing this, despite it being in a separate process
+		self.cmd = ['train.bat', config_path] if os.name == "nt" else ['bash', './train.sh', config_path]
 	torch.multiprocessing.freeze_support()
-	unload_tts()
+		# parse config to get its iteration
-	unload_whisper()
+		with open(config_path, 'r') as file:
-	unload_voicefixer()
+			self.config = yaml.safe_load(file)
-	cmd = ['train.bat', config_path] if os.name == "nt" else ['bash', './train.sh', config_path]
+		self.it = 0
-	print("Spawning process: ", " ".join(cmd))
+		self.its = self.config['train']['niter']
 	training_process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
-	# parse config to get its iteration
+		self.checkpoint = 0
-	import yaml
+		self.checkpoints = int(self.its / self.config['logger']['save_checkpoint_freq'])
 	with open(config_path, 'r') as file:
 		config = yaml.safe_load(file)
-	it = 0
+		self.buffer = []
 	its = config['train']['niter']
-	checkpoint = 0
+		self.open_state = False
-	checkpoints = its / config['logger']['save_checkpoint_freq']
+		self.training_started = False
-	buffer_size = 8
+		self.info = {}
-	open_state = False
+		self.status = ""
 	training_started = False
-	yield " ".join(cmd)
+		self.it_rate = ""
-	
+		self.it_time_start = 0
-	info = {}
+		self.it_time_end = 0
-	buffer = []
+		self.eta = "?"
 	infos = []
 	yields = True
 	status = ""
-	it_rate = ""
+		print("Spawning process: ", " ".join(self.cmd))
-	it_time_start = 0
+		self.process = subprocess.Popen(self.cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
 	it_time_end = 0
-	for line in iter(training_process.stdout.readline, ""):
+	def parse(self, line, verbose=False, buffer_size=8, progress=None):
-		buffer.append(f'{line}')
+		self.buffer.append(f'{line}')
 		# rip out iteration info
-		if not training_started:
+		if not self.training_started:
 			if line.find('Start training from epoch') >= 0:
-				training_started = True
+				self.it_time_start = time.time()
 				self.training_started = True # could just leverage the above variable, but this is python, and there's no point in these aggressive microoptimizations
 				match = re.findall(r'iter: ([\d,]+)', line)
 				if match and len(match) > 0:
-					it = int(match[0].replace(",", ""))
+					self.it = int(match[0].replace(",", ""))
 		elif progress is not None:
 			if line.find('  0%|') == 0:
-				open_state = True
+				self.open_state = True
-			elif line.find('100%|') == 0 and open_state:
+			elif line.find('100%|') == 0 and self.open_state:
-				open_state = False
+				self.open_state = False
-				it = it + 1
+				self.it = self.it + 1
-				it_time_end = time.time()
+				self.it_time_end = time.time()
-				it_time_delta = it_time_end-it_time_start
+				self.it_time_delta = self.it_time_end-self.it_time_start
-				it_time_start = time.time()
+				self.it_time_start = time.time()
-				it_rate = f'[{"{:.3f}".format(it_time_delta)}s/it]' if it_time_delta >= 1 else f'[{"{:.3f}".format(1/it_time_delta)}it/s]' # I doubt anyone will have it/s rates, but its here
+				self.it_rate = f'[{"{:.3f}".format(self.it_time_delta)}s/it]' if self.it_time_delta >= 1 else f'[{"{:.3f}".format(1/self.it_time_delta)}it/s]' # I doubt anyone will have it/s rates, but its here
 				self.eta = (self.its - self.it) * self.it_time_delta
 				self.eta_hhmmss = str(timedelta(seconds=int(self.eta)))
-				progress(it / float(its), f'[{it}/{its}] {it_rate} Training... {status}')
+				progress(self.it / float(self.its), f'[{self.it}/{self.its}] [ETA: {self.eta_hhmmss}] {self.it_rate} Training... {self.status}')
 			if line.find('INFO: [epoch:') >= 0:
 				# easily rip out our stats...
 				match = re.findall(r'\b([a-z_0-9]+?)\b: ([0-9]\.[0-9]+?e[+-]\d+)\b', line)
 				if match and len(match) > 0:
 					for k, v in match:
-						info[k] = float(v)
+						self.info[k] = float(v)
 				# ...and returns our loss rate
 				# it would be nice for losses to be shown at every step
-				if 'loss_gpt_total' in info:
+				if 'loss_gpt_total' in self.info:
-					status = f"Total loss at step {int(info['step'])}: {info['loss_gpt_total']}"
+					# self.info['step'] returns the steps, not iterations, so we won't even bother ripping the reported step count, as iteration count won't get ripped from the regex
 					self.status = f"Total loss at iteration {self.it}: {self.info['loss_gpt_total']}"
 			elif line.find('Saving models and training states') >= 0:
-				checkpoint = checkpoint + 1
+				self.checkpoint = self.checkpoint + 1
-				progress(checkpoint / float(checkpoints), f'[{checkpoint}/{checkpoints}] Saving checkpoint...')
+				progress(self.checkpoint / float(self.checkpoints), f'[{self.checkpoint}/{self.checkpoints}] Saving checkpoint...')
-		print(f"[Training] [{datetime.now().isoformat()}] {line[:-1]}")
+		if verbose or not self.training_started:
 			return "".join(self.buffer[-buffer_size:])
-		if verbose or not training_started:
+def run_training(config_path, verbose=False, buffer_size=8, progress=gr.Progress(track_tqdm=True)):
-			yield "".join(buffer[-buffer_size:])
+	global training_state
 	if training_state and training_state.process:
 		return "Training already in progress"
 	# I don't know if this is still necessary, as it was bitching at me for not doing this, despite it being in a separate process
 	torch.multiprocessing.freeze_support()
 	unload_tts()
 	unload_whisper()
 	unload_voicefixer()
 	training_state = TrainingState(config_path=config_path, buffer_size=buffer_size)
-	training_process.stdout.close()
+	for line in iter(training_state.process.stdout.readline, ""):
-	return_code = training_process.wait()
+		print(f"[Training] [{datetime.now().isoformat()}] {line[:-1]}")
-	training_process = None
+		
 		res = training_state.parse( line=line, verbose=verbose, buffer_size=buffer_size, progress=progress )
 		if res:
 			yield res
 	training_state.process.stdout.close()
 	return_code = training_state.process.wait()
 	output = "".join(training_state.buffer[-buffer_size:])
 	training_state = None
 	#if return_code:
 	#	raise subprocess.CalledProcessError(return_code, cmd)
-	return "".join(buffer[-buffer_size:])
+	return output
 def reconnect_training(config_path, verbose=False, buffer_size=8, progress=gr.Progress(track_tqdm=True)):
 	"""Re-attach to the running training process and resume streaming its output.

 	Reads lines from the existing training process' stdout, feeds each one to
 	`training_state.parse(...)`, and yields whatever non-empty text it returns.

 	Parameters:
 		config_path: unused by this function; kept so the signature stays compatible.
 		verbose: forwarded to `parse`.
 		buffer_size: forwarded to `parse`, and the number of trailing buffered lines
 			joined into the final output.
 		progress: forwarded to `parse`.

 	Yields:
 		"Training not in progress" when there is nothing to attach to; otherwise each
 		non-empty result of `parse`, then the last `buffer_size` buffered lines.

 	Returns:
 		The same final output string (as the generator's return value).

 	NOTE(review): the click handler wired to this function in webui.py passes only
 	[verbose_training, training_buffer_size] as inputs, so `config_path` would receive
 	the verbose flag -- confirm the intended wiring against the caller.
 	"""
 	global training_state
 	# Bind the state once: run_training() resets the global to None when the process
 	# ends, which would otherwise raise AttributeError mid-loop or when reading the buffer.
 	state = training_state
 	if not state or not state.process:
 		# this is a generator, so the message has to be yielded for a consumer to see it
 		yield "Training not in progress"
 		return

 	for line in iter(state.process.stdout.readline, ""):
 		res = state.parse( line=line, verbose=verbose, buffer_size=buffer_size, progress=progress )
 		if res:
 			yield res

 	output = "".join(state.buffer[-buffer_size:])
 	# yield the tail of the buffer as well; a generator's return value is not iterated
 	yield output
 	return output
 def stop_training():
 	global training_process
--- a/src/webui.py
+++ b/src/webui.py
@ -410,6 +410,7 @@ def setup_gradio():
 						refresh_configs = gr.Button(value="Refresh Configurations")
 						start_training_button = gr.Button(value="Train")
 						stop_training_button = gr.Button(value="Stop")
 						reconnect_training_button = gr.Button(value="Reconnect")
 					with gr.Column():
 						training_output = gr.TextArea(label="Console Output", interactive=False, max_lines=8)
 						verbose_training = gr.Checkbox(label="Verbose Console Output")
@ -614,6 +615,13 @@ def setup_gradio():
 			inputs=None,
 			outputs=training_output #console_output
 		)
 		reconnect_training_button.click(reconnect_training,
 			inputs=[
 				verbose_training,
 				training_buffer_size,
 			],
 			outputs=training_output #console_output
 		)
 		prepare_dataset_button.click(
 			prepare_dataset_proxy,
 			inputs=dataset_settings,
--- a/update.bat
+++ b/update.bat
@ -1,5 +1,5 @@
 git pull
-git submodule update
+git submodule update --remote
 python -m venv venv
 call .\venv\Scripts\activate.bat
--- a/update.sh
+++ b/update.sh
@ -1,6 +1,6 @@
 #!/bin/bash
 git pull
-git submodule update
+git submodule update --remote
 python3 -m venv venv
 source ./venv/bin/activate
		`@ -1 +1 @@`
			`Subproject commit 6c284ef8ec4c4769de3181d90ac96ff63581ef55`				`Subproject commit 0ef8ab6872813d1021d4d75e82b63377d28f5a06`