Added whisper transcription 'sanitizing' (collapses very short transcriptions into the previous segment). (I really have to stop keeping several copies of AIVC spread across several machines; I keep reverting shit.)

2023-03-22 22:10:01 +00:00 · 2023-03-22 22:10:01 +00:00 · 932eaccdf5
commit 932eaccdf5
parent 736cdc8926
1 changed files with 13 additions and 3 deletions
--- a/src/utils.py
+++ b/src/utils.py
@ -1134,7 +1134,7 @@ def convert_to_halfp():

 # collapses short segments into the previous segment
 def whisper_sanitize( results ):
-	sanitized = results
+	sanitized = json.loads(json.dumps(results))
 	sanitized['segments'] = []

 	for segment in results['segments']:
@ -1144,11 +1144,19 @@ def whisper_sanitize( results ):
 			continue

 		last_segment = sanitized['segments'][-1]
+		# the previous segment already assimilated this one, somehow (its end is not earlier than this segment's end)
+		if last_segment['end'] >= segment['end']:
+			continue
+		"""
+		# segment already asimilitated it, somehow
+		if last_segment['text'].endswith(segment['text']):
+			continue
+		"""
 		last_segment['text'] += segment['text']
 		last_segment['end'] = segment['end']

 	for i in range(len(sanitized['segments'])):
-		sanitized['segments']['id'] = i
+		sanitized['segments'][i]['id'] = i

 	return sanitized

@ -1275,7 +1283,9 @@ def transcribe_dataset( voice, language=None, skip_existings=False, progress=Non

 		try:
 			sanitized = whisper_sanitize(results[basename])
+			if len(sanitized['segments']) > 0 and len(sanitized['segments'] != results[basename]['segments']):
 				results[basename] = sanitized
+				print("Segments sanizited: ", basename)
 		except Exception as e:
 			print("Failed to sanitize:", basename, e)
 			pass