forked from mrq/ai-voice-cloning
My sanitizer actually did work; it was just batch sizes leading to problems when transcribing.
This commit is contained in:
parent
a6daf289bc
commit
444bcdaf62
38
src/utils.py
38
src/utils.py
|
@@ -1206,10 +1206,13 @@ def whisper_transcribe( file, language=None ):
|
|||
|
||||
device = "cuda" if get_device_name() == "cuda" else "cpu"
|
||||
if whisper_vad:
|
||||
"""
|
||||
if args.whisper_batchsize > 1:
|
||||
result = whisperx.transcribe_with_vad_parallel(whisper_model, file, whisper_vad, batch_size=args.whisper_batchsize, language=language, task="transcribe")
|
||||
else:
|
||||
result = whisperx.transcribe_with_vad(whisper_model, file, whisper_vad)
|
||||
"""
|
||||
result = whisperx.transcribe_with_vad(whisper_model, file, whisper_vad)
|
||||
else:
|
||||
result = whisper_model.transcribe(file)
|
||||
|
||||
|
@@ -1282,19 +1285,32 @@ def transcribe_dataset( voice, language=None, skip_existings=False, progress=Non
|
|||
for file in enumerate_progress(files, desc="Iterating through voice files", progress=progress):
|
||||
basename = os.path.basename(file)
|
||||
|
||||
modified = False
|
||||
if basename in results and skip_existings:
|
||||
print(f"Skipping already parsed file: {basename}")
|
||||
else:
|
||||
continue
|
||||
|
||||
try:
|
||||
result = whisper_transcribe(file, language=language)
|
||||
modified = True
|
||||
except Exception as e:
|
||||
print("Failed to transcribe:", file)
|
||||
continue
|
||||
results[basename] = result
|
||||
|
||||
"""
|
||||
results[basename] = result
|
||||
waveform, sample_rate = torchaudio.load(file)
|
||||
# resample to the input rate, since it'll get resampled for training anyways
|
||||
# this should also "help" increase throughput a bit when filling the dataloaders
|
||||
waveform, sample_rate = resample(waveform, sample_rate, TARGET_SAMPLE_RATE)
|
||||
if waveform.shape[0] == 2:
|
||||
waveform = waveform[:1]
|
||||
torchaudio.save(f"{indir}/audio/{basename}", waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)
|
||||
|
||||
with open(infile, 'w', encoding="utf-8") as f:
|
||||
f.write(json.dumps(results, indent='\t'))
|
||||
|
||||
do_gc()
|
||||
|
||||
modified = False
|
||||
for basename in results:
|
||||
try:
|
||||
sanitized = whisper_sanitize(results[basename])
|
||||
if len(sanitized['segments']) > 0 and len(sanitized['segments']) != len(results[basename]['segments']):
|
||||
|
@@ -1304,22 +1320,12 @@ def transcribe_dataset( voice, language=None, skip_existings=False, progress=Non
|
|||
except Exception as e:
|
||||
print("Failed to sanitize:", basename, e)
|
||||
pass
|
||||
"""
|
||||
|
||||
waveform, sample_rate = torchaudio.load(file)
|
||||
# resample to the input rate, since it'll get resampled for training anyways
|
||||
# this should also "help" increase throughput a bit when filling the dataloaders
|
||||
waveform, sample_rate = resample(waveform, sample_rate, TARGET_SAMPLE_RATE)
|
||||
if waveform.shape[0] == 2:
|
||||
waveform = waveform[:1]
|
||||
torchaudio.save(f"{indir}/audio/{basename}", waveform, sample_rate, encoding="PCM_S", bits_per_sample=16)
|
||||
|
||||
if modified:
|
||||
os.rename(infile, infile.replace(".json", ".unsanitized.json"))
|
||||
with open(infile, 'w', encoding="utf-8") as f:
|
||||
f.write(json.dumps(results, indent='\t'))
|
||||
|
||||
do_gc()
|
||||
|
||||
return f"Processed dataset to: {indir}"
|
||||
|
||||
def slice_waveform( waveform, sample_rate, start, end, trim ):
|
||||
|
|
Loading…
Reference in New Issue
Block a user