forked from mrq/ai-voice-cloning
Added whisper transcription 'sanitizing' (collapses very short transcriptions into the previous segment). (I really have to stop keeping several copies of AIVC spread across several machines; I keep reverting shit.)
This commit is contained in:
parent
736cdc8926
commit
932eaccdf5
16
src/utils.py
16
src/utils.py
|
@ -1134,7 +1134,7 @@ def convert_to_halfp():
|
||||||
|
|
||||||
# collapses short segments into the previous segment
|
# collapses short segments into the previous segment
|
||||||
def whisper_sanitize( results ):
|
def whisper_sanitize( results ):
|
||||||
sanitized = results
|
sanitized = json.loads(json.dumps(results))
|
||||||
sanitized['segments'] = []
|
sanitized['segments'] = []
|
||||||
|
|
||||||
for segment in results['segments']:
|
for segment in results['segments']:
|
||||||
|
@ -1144,11 +1144,19 @@ def whisper_sanitize( results ):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
last_segment = sanitized['segments'][-1]
|
last_segment = sanitized['segments'][-1]
|
||||||
|
# previous segment already assimilated this one, somehow
|
||||||
|
if last_segment['end'] >= segment['end']:
|
||||||
|
continue
|
||||||
|
"""
|
||||||
|
# previous segment already assimilated this one, somehow
|
||||||
|
if last_segment['text'].endswith(segment['text']):
|
||||||
|
continue
|
||||||
|
"""
|
||||||
last_segment['text'] += segment['text']
|
last_segment['text'] += segment['text']
|
||||||
last_segment['end'] = segment['end']
|
last_segment['end'] = segment['end']
|
||||||
|
|
||||||
for i in range(len(sanitized['segments'])):
|
for i in range(len(sanitized['segments'])):
|
||||||
sanitized['segments']['id'] = i
|
sanitized['segments'][i]['id'] = i
|
||||||
|
|
||||||
return sanitized
|
return sanitized
|
||||||
|
|
||||||
|
@ -1275,7 +1283,9 @@ def transcribe_dataset( voice, language=None, skip_existings=False, progress=Non
|
||||||
|
|
||||||
try:
|
try:
|
||||||
sanitized = whisper_sanitize(results[basename])
|
sanitized = whisper_sanitize(results[basename])
|
||||||
results[basename] = sanitized
|
if len(sanitized['segments']) > 0 and len(sanitized['segments'] != results[basename]['segments']):
|
||||||
|
results[basename] = sanitized
|
||||||
|
print("Segments sanizited: ", basename)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print("Failed to sanitize:", basename, e)
|
print("Failed to sanitize:", basename, e)
|
||||||
pass
|
pass
|
||||||
|
|
Loading…
Reference in New Issue
Block a user