Added whisper transcription 'sanitizing' (collapses very short transcriptions into the previous segment). (I really have to stop keeping several copies of AIVC spread across several machines; I keep reverting shit.)

This commit is contained in:
mrq 2023-03-22 22:10:01 +00:00
parent 736cdc8926
commit 932eaccdf5

View File

@ -1134,7 +1134,7 @@ def convert_to_halfp():
# collapses short segments into the previous segment # collapses short segments into the previous segment
def whisper_sanitize( results ): def whisper_sanitize( results ):
sanitized = results sanitized = json.loads(json.dumps(results))
sanitized['segments'] = [] sanitized['segments'] = []
for segment in results['segments']: for segment in results['segments']:
@ -1144,11 +1144,19 @@ def whisper_sanitize( results ):
continue continue
last_segment = sanitized['segments'][-1] last_segment = sanitized['segments'][-1]
# the previous segment has already assimilated this one, somehow
if last_segment['end'] >= segment['end']:
continue
"""
# segment already asimilitated it, somehow
if last_segment['text'].endswith(segment['text']):
continue
"""
last_segment['text'] += segment['text'] last_segment['text'] += segment['text']
last_segment['end'] = segment['end'] last_segment['end'] = segment['end']
for i in range(len(sanitized['segments'])): for i in range(len(sanitized['segments'])):
sanitized['segments']['id'] = i sanitized['segments'][i]['id'] = i
return sanitized return sanitized
@ -1275,7 +1283,9 @@ def transcribe_dataset( voice, language=None, skip_existings=False, progress=Non
try: try:
sanitized = whisper_sanitize(results[basename]) sanitized = whisper_sanitize(results[basename])
results[basename] = sanitized if len(sanitized['segments']) > 0 and len(sanitized['segments'] != results[basename]['segments']):
results[basename] = sanitized
print("Segments sanizited: ", basename)
except Exception as e: except Exception as e:
print("Failed to sanitize:", basename, e) print("Failed to sanitize:", basename, e)
pass pass