From f657f30e2bb43edbd81979b774f54c85e7574cf4 Mon Sep 17 00:00:00 2001
From: yqxtqymn
Date: Mon, 6 Mar 2023 01:59:58 +0000
Subject: [PATCH] Update 'src/utils.py' whisper->whisperx

---
 src/utils.py | 333 ++++++++++++++++-----------------------------------
 1 file changed, 100 insertions(+), 233 deletions(-)

diff --git a/src/utils.py b/src/utils.py
index 9e6c807..1d611c1 100755
--- a/src/utils.py
+++ b/src/utils.py
@@ -28,6 +28,7 @@ import music_tag
 import gradio as gr
 import gradio.utils
 import pandas as pd
+import whisperx

 from datetime import datetime
 from datetime import timedelta
@@ -40,7 +41,6 @@ from tortoise.utils.device import get_device_name, set_device_name
 MODELS[
 	'dvae.pth'] = "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/3704aea61678e7e468a06d8eea121dba368a798e/.models/dvae.pth"
 WHISPER_MODELS = ["tiny", "base", "small", "medium", "large", "large-v2"]
-WHISPER_SPECIALIZED_MODELS = ["tiny.en", "base.en", "small.en", "medium.en"]
 EPOCH_SCHEDULE = [9, 18, 25, 33]

 args = None
@@ -332,17 +332,17 @@ def generate(
 	}
 	"""
-	# kludgy yucky codesmells
-	for name in audio_cache:
-		if 'output' not in audio_cache[name]:
-			continue
-
-		#output_voices.append(f'{outdir}/{voice}_{name}.wav')
-		output_voices.append(name)
-		if not args.embed_output_metadata:
-			with open(f'{outdir}/{voice}_{name}.json', 'w', encoding="utf-8") as f:
-				f.write(json.dumps(info, indent='\t') )
-	"""
+    # kludgy yucky codesmells
+    for name in audio_cache:
+        if 'output' not in audio_cache[name]:
+            continue
+
+        #output_voices.append(f'{outdir}/{voice}_{name}.wav')
+        output_voices.append(name)
+        if not args.embed_output_metadata:
+            with open(f'{outdir}/{voice}_{name}.json', 'w', encoding="utf-8") as f:
+                f.write(json.dumps(info, indent='\t') )
+    """

 	if args.voice_fixer:
 		if not voicefixer:
@@ -943,13 +943,6 @@ def run_training(config_path, verbose=False, gpus=1, keep_x_past_datasets=0, pro

 	training_state = None

-def get_training_losses():
-	global training_state
-	if not training_state or not training_state.statistics:
-		return
-	return pd.DataFrame(training_state.statistics)
-
-
 def update_training_dataplot(config_path=None):
 	global training_state
 	update = None
@@ -958,12 +951,17 @@ def update_training_dataplot(config_path=None):
 	if config_path:
 		training_state = TrainingState(config_path=config_path, start=False)
 		if training_state.statistics:
-			update = gr.LinePlot.update(value=pd.DataFrame(training_state.statistics))
+			update = gr.LinePlot.update(value=pd.DataFrame(training_state.statistics),
+				x_lim=[0, training_state.its], x="step", y="value",
+				title="Training Metrics", color="type", tooltip=['step', 'value', 'type'],
+				width=600, height=350)
 		del training_state
 		training_state = None
 	elif training_state.statistics:
 		training_state.load_losses()
-			update = gr.LinePlot.update(value=pd.DataFrame(training_state.statistics))
+		update = gr.LinePlot.update(value=pd.DataFrame(training_state.statistics), x_lim=[0, training_state.its],
+			x="step", y="value", title="Training Metrics", color="type",
+			tooltip=['step', 'value', 'type'], width=600, height=350)

 	return update
@@ -1030,231 +1028,108 @@ def convert_to_halfp():

 def prepare_dataset(files, outdir, language=None, progress=None):
-	unload_tts()
-
-	global whisper_model
-	import whisperx
-
-	device = "cuda" # add cpu option?
-
-	# original whisper https://github.com/openai/whisper
-	# whisperx fork https://github.com/m-bain/whisperX
-	# supports en, fr, de, es, it, ja, zh, nl, uk, pt
-
-	# tiny, base, small, medium, large, large-v2
-
-	whisper_model = whisperx.load_model("medium", device)
-	# some additional model features require huggingface token
-
-	os.makedirs(outdir, exist_ok=True)
-
-	idx = 0
-	results = {}
-	transcription = []
-
-	for file in enumerate_progress(files, desc="Iterating through voice files", progress=progress):
-		print(f"Transcribing file: {file}")
-
-		result = whisper_model.transcribe(file)
-
-		print(result["segments"]) # before alignment
-
-		# load alignment model and metadata
-		model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-
-		# align whisper output
-		result_aligned = whisperx.align(result["segments"], model_a, metadata, file, device)
-
-		print(result_aligned["segments"]) # after alignment
-		print(result_aligned["word_segments"]) # after alignment
-
-		results[os.path.basename(file)] = result
-
-		print(f"Transcribed file: {file}, {len(result['segments'])} found.")
-
-		waveform, sampling_rate = torchaudio.load(file)
-		num_channels, num_frames = waveform.shape
-
-		for segment in result['segments']: # enumerate_progress(result['segments'], desc="Segmenting voice file", progress=progress):
-			start = int(segment['start'] * sampling_rate)
-			end = int(segment['end'] * sampling_rate)
-
-			sliced_waveform = waveform[:, start:end]
-			sliced_name = f"{pad(idx, 4)}.wav"
-
-			torchaudio.save(f"{outdir}/{sliced_name}", sliced_waveform, sampling_rate)
-
-			idx = idx + 1
-			line = f"{sliced_name}|{segment['text'].strip()}"
-			transcription.append(line)
-			with open(f'{outdir}/train.txt', 'a', encoding="utf-8") as f:
-				f.write(f'{line}\n')
-
-	with open(f'{outdir}/whisper.json', 'w', encoding="utf-8") as f:
-		f.write(json.dumps(results, indent='\t'))
-
-	with open(f'{outdir}/train.txt', 'w', encoding="utf-8") as f:
-		f.write("\n".join(transcription))
-
-	unload_whisper()
-
-	return f"Processed dataset to: {outdir}"
+    unload_tts()
+
+    global whisper_model
+    if whisper_model is None:
+        load_whisper_model()
+
+    os.makedirs(outdir, exist_ok=True)
+
+    idx = 0
+    results = {}
+    transcription = []
+
+    if torch.cuda.is_available():
+        device = "cuda"
+    else:
+        device = "cpu"
+
+    for file in enumerate_progress(files, desc="Iterating through voice files", progress=progress):
+        print(f"Transcribing file: {file}")
+
+        result = whisper_model.transcribe(file)
+
+        print(result["segments"]) # before alignment
+
+        # load alignment model and metadata
+        model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+
+        # align whisper output
+        result_aligned = whisperx.align(result["segments"], model_a, metadata, file, device)
+
+        print(result_aligned["segments"]) # after alignment
+        print(result_aligned["word_segments"]) # after alignment
+
+        results[os.path.basename(file)] = result
+
+        print(f"Transcribed file: {file}, {len(result['segments'])} found.")
+
+        waveform, sampling_rate = torchaudio.load(file)
+        num_channels, num_frames = waveform.shape
+
+        for segment in result['segments']: # enumerate_progress(result['segments'], desc="Segmenting voice file", progress=progress):
+            start = int(segment['start'] * sampling_rate)
+            end = int(segment['end'] * sampling_rate)
+
+            sliced_waveform = waveform[:, start:end]
+            sliced_name = f"{pad(idx, 4)}.wav"
+
+            torchaudio.save(f"{outdir}/{sliced_name}", sliced_waveform, sampling_rate)
+
+            idx = idx + 1
+            line = f"{sliced_name}|{segment['text'].strip()}"
+            transcription.append(line)
+            with open(f'{outdir}/train.txt', 'a', encoding="utf-8") as f:
+                f.write(f'{line}\n')
+
+    '''for file in enumerate_progress(files, desc="Iterating through voice files", progress=progress):
+        basename = os.path.basename(file)
+        result = whisper_transcribe(file, language=language)
+        results[basename] = result
+        print(f"Transcribed file: {file}, {len(result['segments'])} found.")
+
+        waveform, sampling_rate = torchaudio.load(file)
+        num_channels, num_frames = waveform.shape
+
+        idx = 0
+        for segment in result['segments']: # enumerate_progress(result['segments'], desc="Segmenting voice file", progress=progress):
+            start = int(segment['start'] * sampling_rate)
+            end = int(segment['end'] * sampling_rate)
+
+            sliced_waveform = waveform[:, start:end]
+            sliced_name = basename.replace(".wav", f"_{pad(idx, 4)}.wav")
+
+            if not torch.any(sliced_waveform < 0):
+                print(f"Error with {sliced_name}, skipping...")
+                continue
+
+            torchaudio.save(f"{outdir}/{sliced_name}", sliced_waveform, sampling_rate)
+
+            idx = idx + 1
+            line = f"{sliced_name}|{segment['text'].strip()}"
+            transcription.append(line)
+            with open(f'{outdir}/train.txt', 'a', encoding="utf-8") as f:
+                f.write(f'{line}\n')
+    '''
+
+    with open(f'{outdir}/whisper.json', 'w', encoding="utf-8") as f:
+        f.write(json.dumps(results, indent='\t'))
+
+    joined = '\n'.join(transcription)
+    with open(f'{outdir}/train.txt', 'w', encoding="utf-8") as f:
+        f.write(joined)
+
+    unload_whisper()
+
+    return f"Processed dataset to: {outdir}\n{joined}"

-def calc_iterations(epochs, lines, batch_size):
-	iterations = int(epochs * lines / float(batch_size))
-	return iterations
-
-def schedule_learning_rate(iterations, schedule=EPOCH_SCHEDULE):
-	return [int(iterations * d) for d in schedule]
-
-def optimize_training_settings(epochs, learning_rate, text_ce_lr_weight, learning_rate_schedule, batch_size,
-		gradient_accumulation_size, print_rate, save_rate, resume_path, half_p, bnb, workers,
-		source_model, voice):
-	name = f"{voice}-finetune"
-	dataset_name = f"{voice}-train"
-	dataset_path = f"./training/{voice}/train.txt"
-	validation_name = f"{voice}-val"
-	validation_path = f"./training/{voice}/train.txt"
-
-	with open(dataset_path, 'r', encoding="utf-8") as f:
-		lines = len(f.readlines())
-
-	messages = []
-
-	if batch_size > lines:
-		batch_size = lines
-		messages.append(f"Batch size is larger than your dataset, clamping batch size to: {batch_size}")
-
-	if batch_size % lines != 0:
-		nearest_slice = int(lines / batch_size) + 1
-		batch_size = int(lines / nearest_slice)
-		messages.append(f"Batch size not neatly divisible by dataset size, adjusting batch size to: {batch_size} ({nearest_slice} steps per epoch)")
-
-	if gradient_accumulation_size == 0:
-		gradient_accumulation_size = 1
-
-	if batch_size / gradient_accumulation_size < 2:
-		gradient_accumulation_size = int(batch_size / 2)
-		if gradient_accumulation_size == 0:
-			gradient_accumulation_size = 1
-
-		messages.append(f"Gradient accumulation size is too large for a given batch size, clamping gradient accumulation size to: {gradient_accumulation_size}")
-	elif batch_size % gradient_accumulation_size != 0:
-		gradient_accumulation_size = int(batch_size / gradient_accumulation_size)
-		if gradient_accumulation_size == 0:
-			gradient_accumulation_size = 1
-
-		messages.append(f"Batch size is not evenly divisible by the gradient accumulation size, adjusting gradient accumulation size to: {gradient_accumulation_size}")
-
-	iterations = calc_iterations(epochs=epochs, lines=lines, batch_size=batch_size)
-
-	if epochs < print_rate:
-		print_rate = epochs
messages.append(f"Print rate is too small for the given iteration step, clamping print rate to: {print_rate}") - - if epochs < save_rate: - save_rate = epochs - messages.append(f"Save rate is too small for the given iteration step, clamping save rate to: {save_rate}") - - if resume_path and not os.path.exists(resume_path): - resume_path = None - messages.append("Resume path specified, but does not exist. Disabling...") - - if bnb: - messages.append("BitsAndBytes requested. Please note this is ! EXPERIMENTAL !") - - if half_p: - if bnb: - half_p = False - messages.append( - "Half Precision requested, but BitsAndBytes is also requested. Due to redundancies, disabling half precision...") - else: - messages.append("Half Precision requested. Please note this is ! EXPERIMENTAL !") - if not os.path.exists(get_halfp_model_path()): - convert_to_halfp() - - messages.append( - f"For {epochs} epochs with {lines} lines in batches of {batch_size}, iterating for {iterations} steps ({int(iterations / epochs)} steps per epoch)") - - return ( - learning_rate, - text_ce_lr_weight, - learning_rate_schedule, - batch_size, - gradient_accumulation_size, - print_rate, - save_rate, - resume_path, - messages - ) - - -def save_training_settings(iterations=None, learning_rate=None, text_ce_lr_weight=None, learning_rate_schedule=None, - batch_size=None, gradient_accumulation_size=None, print_rate=None, save_rate=None, name=None, - dataset_name=None, dataset_path=None, validation_name=None, validation_path=None, - output_name=None, resume_path=None, half_p=None, bnb=None, workers=None, source_model=None): - if not source_model: - source_model = f"./models/tortoise/autoregressive{'_half' if half_p else ''}.pth" - - settings = { - "iterations": iterations if iterations else 500, - "batch_size": batch_size if batch_size else 64, - "learning_rate": learning_rate if learning_rate else 1e-5, - "gen_lr_steps": learning_rate_schedule if learning_rate_schedule else EPOCH_SCHEDULE, - "gradient_accumulation_size": gradient_accumulation_size if gradient_accumulation_size else 4, - "print_rate": print_rate if print_rate else 1, - "save_rate": save_rate if save_rate else 50, - "name": name if name else "finetune", - "dataset_name": dataset_name if dataset_name else "finetune", - "dataset_path": dataset_path if dataset_path else "./training/finetune/train.txt", - "validation_name": validation_name if validation_name else "finetune", - "validation_path": validation_path if validation_path else "./training/finetune/train.txt", - - "text_ce_lr_weight": text_ce_lr_weight if text_ce_lr_weight else 0.01, - - 'resume_state': f"resume_state: '{resume_path}'", - 'pretrain_model_gpt': f"pretrain_model_gpt: '{source_model}'", - - 'float16': 'true' if half_p else 'false', - 'bitsandbytes': 'true' if bnb else 'false', - - 'workers': workers if workers else 2, - } - - if resume_path: - settings['pretrain_model_gpt'] = f"# {settings['pretrain_model_gpt']}" - else: - settings['resume_state'] = f"# resume_state: './training/{name if name else 'finetune'}/training_state/#.state'" - - if half_p: - if not os.path.exists(get_halfp_model_path()): - convert_to_halfp() - - if not output_name: - output_name = f'{settings["name"]}.yaml' - - with open(f'./models/.template.yaml', 'r', encoding="utf-8") as f: - yaml = f.read() - - # i could just load and edit the YAML directly, but this is easier, as I don't need to bother with path traversals - for k in settings: - if settings[k] is None: - continue - yaml = yaml.replace(f"${{{k}}}", str(settings[k])) - - 
-	outfile = f'./training/{output_name}'
-	with open(outfile, 'w', encoding="utf-8") as f:
-		f.write(yaml)
-
-	return f"Training settings saved to: {outfile}"
-

 def calc_iterations(epochs, lines, batch_size):
 	iterations = int(epochs * lines / float(batch_size))
 	return iterations
@@ -2007,7 +1882,7 @@ def unload_voicefixer():
 	do_gc()


-def load_whisper_model(language=None, model_name=None, progress=None):
+def load_whisper_model(model_name=None, progress=None):
 	global whisper_model

 	if not model_name:
@@ -2016,24 +1891,16 @@
 		args.whisper_model = model_name
 		save_args_settings()

-	if language and f'{model_name}.{language}' in WHISPER_SPECIALIZED_MODELS:
-		model_name = f'{model_name}.{language}'
-		print(f"Loading specialized model for language: {language}")
-
-	notify_progress(f"Loading Whisper model: {model_name}", progress)
-
-	if args.whisper_cpp:
-		from whispercpp import Whisper
-		if not language:
-			language = 'auto'
-
-		b_lang = language.encode('ascii')
-		whisper_model = Whisper(model_name, models_dir='./models/', language=b_lang)
+	if torch.cuda.is_available():
+		device = "cuda"
 	else:
-		import whisper
-		whisper_model = whisper.load_model(model_name)
+		device = "cpu"

-	print("Loaded Whisper model")
+	notify_progress(f"Loading WhisperX model: {model_name} using {device}", progress)
+
+	whisper_model = whisperx.load_model(model_name, device)
+
+	print("Loaded WhisperX model")


 def unload_whisper():
@@ -2042,6 +1909,6 @@ def unload_whisper():

 	if whisper_model:
 		del whisper_model
 		whisper_model = None
-		print("Unloaded Whisper")
+		print("Unloaded WhisperX")

 	do_gc()
\ No newline at end of file
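
Notes:

The expanded gr.LinePlot.update() calls in update_training_dataplot() expect
each statistics row to carry the columns the plot maps: "step" on x, "value"
on y, and "type" to split the series by color. A minimal standalone sketch of
the same call, using hypothetical loss rows in place of
training_state.statistics (the metric names here are placeholders, not the
repo's actual keys):

    import pandas as pd
    import gradio as gr

    # hypothetical rows; the real ones come from training_state.statistics
    statistics = [
        {"step": 10, "value": 2.31, "type": "loss_text_ce"},
        {"step": 10, "value": 4.89, "type": "loss_mel_ce"},
        {"step": 20, "value": 2.07, "type": "loss_text_ce"},
        {"step": 20, "value": 4.15, "type": "loss_mel_ce"},
    ]

    update = gr.LinePlot.update(
        value=pd.DataFrame(statistics),
        x="step", y="value", color="type",  # one line per metric type
        title="Training Metrics",
        tooltip=["step", "value", "type"],
        width=600, height=350,
    )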
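
The rewritten prepare_dataset() reduces to a transcribe -> align -> slice
pipeline. A sketch of that flow outside the UI code, assuming the whisperx
calls used in the patch (load_model, load_align_model, align) and a
placeholder input path; note that the patch prints the aligned segments but
still slices on the unaligned result["segments"] timestamps:

    import os
    import torch
    import torchaudio
    import whisperx

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = whisperx.load_model("base", device)  # any entry of WHISPER_MODELS

    path = "./voices/sample.wav"  # placeholder input file
    result = model.transcribe(path)

    # refine per-segment timestamps with the language-specific alignment model
    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
    aligned = whisperx.align(result["segments"], model_a, metadata, path, device)

    # cut the source audio on segment boundaries, as prepare_dataset() does
    waveform, sampling_rate = torchaudio.load(path)
    os.makedirs("./training/sample", exist_ok=True)
    for idx, segment in enumerate(result["segments"]):
        start = int(segment["start"] * sampling_rate)
        end = int(segment["end"] * sampling_rate)
        torchaudio.save(f"./training/sample/{idx:04d}.wav", waveform[:, start:end], sampling_rate)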
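
load_whisper_model() and unload_whisper() now pair a device-aware load with an
explicit teardown. A rough equivalent of that lifecycle; do_gc() is
approximated here with gc.collect() plus a CUDA cache flush, which is an
assumption about what the repo's helper does:

    import gc
    import torch
    import whisperx

    whisper_model = None

    def load_whisper(model_name="base"):
        global whisper_model
        device = "cuda" if torch.cuda.is_available() else "cpu"
        whisper_model = whisperx.load_model(model_name, device)

    def unload_whisper():
        global whisper_model
        whisper_model = None  # drop the reference so the weights can be collected
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()  # also release cached GPU memory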