Forgot to re-append the existing transcriptions when skipping existing entries (so I have to go back and redo the first 10% of my giant dataset).

This commit is contained in:
mrq 2023-03-06 16:50:55 +00:00
parent da0af4c498
commit 119ac50c58

View File

@ -1057,8 +1057,8 @@ def prepare_dataset( files, outdir, language=None, skip_existings=False, progres
files = sorted(files) files = sorted(files)
previous_list = [] previous_list = []
parsed_list = []
if skip_existings and os.path.exists(f'{outdir}/train.txt'): if skip_existings and os.path.exists(f'{outdir}/train.txt'):
parsed_list = []
with open(f'{outdir}/train.txt', 'r', encoding="utf-8") as f: with open(f'{outdir}/train.txt', 'r', encoding="utf-8") as f:
parsed_list = f.readlines() parsed_list = f.readlines()
@ -1110,6 +1110,9 @@ def prepare_dataset( files, outdir, language=None, skip_existings=False, progres
with open(f'{outdir}/whisper.json', 'w', encoding="utf-8") as f: with open(f'{outdir}/whisper.json', 'w', encoding="utf-8") as f:
f.write(json.dumps(results, indent='\t')) f.write(json.dumps(results, indent='\t'))
if len(parsed_list) > 0:
transcription = parsed_list + transcription
joined = '\n'.join(transcription) joined = '\n'.join(transcription)
with open(f'{outdir}/train.txt', 'w', encoding="utf-8") as f: with open(f'{outdir}/train.txt', 'w', encoding="utf-8") as f:
f.write(joined) f.write(joined)