added option to not slice audio when transcribing, added option to prepare validation dataset on audio duration, added a warning if youre using whisperx and you're slicing audio

2023-03-11 16:32:35 +00:00 · 2023-03-11 16:32:35 +00:00 · 2424c455cb
commit 2424c455cb
parent dcdcf8516c
2 changed files with 69 additions and 37 deletions
--- a/src/utils.py
+++ b/src/utils.py
@ -667,7 +667,7 @@ class TrainingState():
 			self.steps = int(self.info['steps'])
 		if 'iteration_rate' in self.info:
-			it_rate = self.info['iteration_rate']
+			it_rate = self.info['iteration_rate'] / self.batch_size # why
 			self.it_rate = f'{"{:.3f}".format(1/it_rate)}it/s' if 0 < it_rate and it_rate < 1 else f'{"{:.3f}".format(it_rate)}s/it'
 			self.it_rates += it_rate
@ -676,6 +676,7 @@ class TrainingState():
 				eta = str(timedelta(seconds=int(self.eta)))
 				self.eta_hhmmss = eta
 			except Exception as e:
 				self.eta_hhmmss = "?"
 				pass
 		self.metrics['step'] = [f"{self.epoch}/{self.epochs}"]
@ -1064,13 +1065,16 @@ def whisper_transcribe( file, language=None ):
 		return result
-def prepare_dataset( files, outdir, language=None, skip_existings=False, progress=None ):
+def prepare_dataset( files, outdir, language=None, skip_existings=False, slice_audio=False, progress=None ):
 	unload_tts()
 	global whisper_model
 	if whisper_model is None:
 		load_whisper_model(language=language)
 	if args.whisper_backend == "m-bain/whisperx" and slice_audio:
 		print("! CAUTION ! Slicing audio with whisperx is terrible. Please consider using a different whisper backend if you want to slice audio.")
 	os.makedirs(f'{outdir}/audio/', exist_ok=True)
 	results = {}
@ -1092,6 +1096,14 @@ def prepare_dataset( files, outdir, language=None, skip_existings=False, progres
 			if match[0] not in previous_list:
 				previous_list.append(f'{match[0].split("/")[-1]}.wav')
 	def validate_waveform( waveform, sample_rate, name ):
 		if not torch.any(waveform < 0):
 			return False
 		if waveform.shape[-1] < (.6 * sampling_rate):
 			return False
 		return True
 	for file in enumerate_progress(files, desc="Iterating through voice files", progress=progress):
 		basename = os.path.basename(file)
@ -1106,29 +1118,36 @@ def prepare_dataset( files, outdir, language=None, skip_existings=False, progres
 		waveform, sampling_rate = torchaudio.load(file)
 		num_channels, num_frames = waveform.shape
-		idx = 0
+		if not slice_audio:
-		for segment in result['segments']: # enumerate_progress(result['segments'], desc="Segmenting voice file", progress=progress):
+			if not validate_waveform( waveform, sampling_rate, name ):
-			start = int(segment['start'] * sampling_rate)
+				print(f"Segment invalid: {name}, skipping...")
 			end = int(segment['end'] * sampling_rate)
 			sliced_waveform = waveform[:, start:end]
 			sliced_name = basename.replace(".wav", f"_{pad(idx, 4)}.wav")
 			if not torch.any(sliced_waveform < 0):
 				print(f"Sound file is silent: {sliced_name}, skipping...")
 				continue
-			if sliced_waveform.shape[-1] < (.6 * sampling_rate):
+			torchaudio.save(f"{outdir}/audio/{basename}", waveform, sampling_rate)
-				print(f"Sound file is too short: {sliced_name}, skipping...")
+			line = f"audio/{basename}|{result['text'].strip()}"
 				continue
 			torchaudio.save(f"{outdir}/audio/{sliced_name}", sliced_waveform, sampling_rate)
 			idx = idx + 1
 			line = f"audio/{sliced_name}|{segment['text'].strip()}"
 			transcription.append(line)
 			with open(f'{outdir}/train.txt', 'a', encoding="utf-8") as f:
 				f.write(f'\n{line}')
 		else:
 			idx = 0
 			for segment in result['segments']: # enumerate_progress(result['segments'], desc="Segmenting voice file", progress=progress):
 				start = int(segment['start'] * sampling_rate)
 				end = int(segment['end'] * sampling_rate)
 				sliced_waveform = waveform[:, start:end]
 				sliced_name = basename.replace(".wav", f"_{pad(idx, 4)}.wav")
 				if not validate_waveform( sliced_waveform, sampling_rate, sliced_name ):
 					print(f"Trimmed segment invalid: {sliced_name}, skipping...")
 					continue
 				torchaudio.save(f"{outdir}/audio/{sliced_name}", sliced_waveform, sampling_rate)
 				idx = idx + 1
 				line = f"audio/{sliced_name}|{segment['text'].strip()}"
 				transcription.append(line)
 				with open(f'{outdir}/train.txt', 'a', encoding="utf-8") as f:
 					f.write(f'\n{line}')
 		do_gc()
@ -1144,7 +1163,7 @@ def prepare_dataset( files, outdir, language=None, skip_existings=False, progres
 	return f"Processed dataset to: {outdir}\n{joined}"
-def prepare_validation_dataset( voice, text_length ):
+def prepare_validation_dataset( voice, text_length, audio_length ):
 	indir = f'./training/{voice}/'
 	infile = f'{indir}/dataset.txt'
 	if not os.path.exists(infile):
@ -1166,8 +1185,14 @@ def prepare_validation_dataset( voice, text_length ):
 		split = line.split("|")
 		filename = split[0]
 		text = split[1]
 		culled = len(text) < text_length
-		if len(text) < text_length:
+		if not culled and audio_length > 0:
 			metadata = torchaudio.info(f'{indir}/{filename}')
 			duration = metadata.num_channels * metadata.num_frames / metadata.sample_rate
 			culled = duration < audio_length
 		if culled:
 			validation.append(line.strip())
 		else:
 			training.append(line.strip())
@ -1178,7 +1203,7 @@ def prepare_validation_dataset( voice, text_length ):
 	with open(f'{indir}/validation.txt', 'w', encoding="utf-8") as f:
 		f.write("\n".join(validation))
-	msg = f"Culled {len(validation)} lines"
+	msg = f"Culled {len(validation)}/{len(lines)} lines."
 	print(msg)
 	return msg
@ -1896,6 +1921,9 @@ def load_tts( restart=False, autoregressive_model=None ):
 	print(f"Loading TorToiSe... (AR: {autoregressive_model}, vocoder: {args.vocoder_model})")
 	if get_device_name() == "cpu":
 		print("!!!! WARNING !!!! No GPU available in PyTorch. You may need to reinstall PyTorch.")
 	tts_loading = True
 	try:
 		tts = TextToSpeech(minor_optimizations=not args.low_vram, autoregressive_model_path=autoregressive_model, vocoder_model=args.vocoder_model)
--- a/src/webui.py
+++ b/src/webui.py
@ -152,9 +152,7 @@ def import_generate_settings_proxy( file=None ):
 	res = []
 	for k in GENERATE_SETTINGS_ARGS:
 		res.append(settings[k] if k in settings else None)
-	print(GENERATE_SETTINGS_ARGS)
+
 	print(settings)
 	print(res)
 	return tuple(res)
 def compute_latents_proxy(voice, voice_latents_chunks, progress=gr.Progress(track_tqdm=True)):
@ -184,12 +182,12 @@ def read_generate_settings_proxy(file, saveAs='.temp'):
 		gr.update(visible=j is not None),
 	)
-def prepare_dataset_proxy( voice, language, validation_size, skip_existings, progress=gr.Progress(track_tqdm=True) ):
+def prepare_dataset_proxy( voice, language, validation_text_length, validation_audio_length, skip_existings, slice_audio, progress=gr.Progress(track_tqdm=True) ):
 	messages = []
-	message = prepare_dataset( get_voices(load_latents=False)[voice], outdir=f"./training/{voice}/", language=language, skip_existings=skip_existings, progress=progress )
+	message = prepare_dataset( get_voices(load_latents=False)[voice], outdir=f"./training/{voice}/", language=language, skip_existings=skip_existings, slice_audio=slice_audio, progress=progress )
 	messages.append(message)
-	if validation_size > 0:
+	if validation_text_length > 0 or validation_audio_length > 0:
-		message = prepare_validation_dataset( voice, text_length=validation_size )
+		message = prepare_validation_dataset( voice, text_length=validation_text_length, audio_length=validation_audio_length )
 		messages.append(message)
 	return "\n".join(messages)
@ -246,8 +244,7 @@ def import_training_settings_proxy( voice ):
 		output[k] = settings[k]
 	output = list(output.values())
-	print(list(TRAINING_SETTINGS.keys()))
+
 	print(output)
 	messages.append(f"Imported training settings: {injson}")
 	return output[:-1] + ["\n".join(messages)]
@ -413,13 +410,20 @@ def setup_gradio():
 						DATASET_SETTINGS['voice'] = gr.Dropdown( choices=voice_list, label="Dataset Source", type="value", value=voice_list[0] if len(voice_list) > 0 else "" )
 						with gr.Row():
 							DATASET_SETTINGS['language'] = gr.Textbox(label="Language", value="en")
-							DATASET_SETTINGS['validation_size'] = gr.Number(label="Validation Text Length Cull Size", value=12, precision=0)
+							DATASET_SETTINGS['validation_text_length'] = gr.Number(label="Validation Text Length Threshold", value=12, precision=0)
-						DATASET_SETTINGS['skip'] = gr.Checkbox(label="Skip Already Transcribed", value=False)
+							DATASET_SETTINGS['validation_audio_length'] = gr.Number(label="Validation Audio Length Threshold", value=1 )
 						with gr.Row():
 							DATASET_SETTINGS['skip'] = gr.Checkbox(label="Skip Already Transcribed", value=False)
 							DATASET_SETTINGS['slice'] = gr.Checkbox(label="Slice Segments", value=False)
 						with gr.Row():
 							transcribe_button = gr.Button(value="Transcribe")
 							prepare_validation_button = gr.Button(value="Prepare Validation")
 						with gr.Row():
 							EXEC_SETTINGS['whisper_backend'] = gr.Dropdown(WHISPER_BACKENDS, label="Whisper Backends", value=args.whisper_backend)
 							EXEC_SETTINGS['whisper_model'] = gr.Dropdown(WHISPER_MODELS, label="Whisper Model", value=args.whisper_model)
 						dataset_settings = list(DATASET_SETTINGS.values())
 					with gr.Column():
 						prepare_dataset_output = gr.TextArea(label="Console Output", interactive=False, max_lines=8)
@ -533,8 +537,7 @@ def setup_gradio():
 					EXEC_SETTINGS['autoregressive_model'] = gr.Dropdown(choices=autoregressive_models, label="Autoregressive Model", value=args.autoregressive_model if args.autoregressive_model else autoregressive_models[0])
 					EXEC_SETTINGS['vocoder_model'] = gr.Dropdown(VOCODERS, label="Vocoder", value=args.vocoder_model if args.vocoder_model else VOCODERS[-1])
-					EXEC_SETTINGS['whisper_backend'] = gr.Dropdown(WHISPER_BACKENDS, label="Whisper Backends", value=args.whisper_backend)
+					
 					EXEC_SETTINGS['whisper_model'] = gr.Dropdown(WHISPER_MODELS, label="Whisper Model", value=args.whisper_model)
 					EXEC_SETTINGS['training_default_halfp'] = TRAINING_SETTINGS['half_p']
 					EXEC_SETTINGS['training_default_bnb'] = TRAINING_SETTINGS['bitsandbytes']
@ -739,7 +742,8 @@ def setup_gradio():
 			prepare_validation_dataset,
 			inputs=[
 				dataset_settings[0],
-				DATASET_SETTINGS['validation_size'],
+				DATASET_SETTINGS['validation_text_length'],
 				DATASET_SETTINGS['validation_audio_length'],
 			],
 			outputs=prepare_dataset_output #console_output
 		)