From 119ac50c583850ce0690c5bec746cbc169667726 Mon Sep 17 00:00:00 2001 From: mrq Date: Mon, 6 Mar 2023 16:50:55 +0000 Subject: [PATCH] forgot to re-append the existing transcription when skipping existing (have to go back again and do the first 10% of my giant dataset) --- src/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/utils.py b/src/utils.py index cee74e8..c4eea7c 100755 --- a/src/utils.py +++ b/src/utils.py @@ -1057,8 +1057,8 @@ def prepare_dataset( files, outdir, language=None, skip_existings=False, progres files = sorted(files) previous_list = [] + parsed_list = [] if skip_existings and os.path.exists(f'{outdir}/train.txt'): - parsed_list = [] with open(f'{outdir}/train.txt', 'r', encoding="utf-8") as f: parsed_list = f.readlines() @@ -1109,6 +1109,9 @@ def prepare_dataset( files, outdir, language=None, skip_existings=False, progres with open(f'{outdir}/whisper.json', 'w', encoding="utf-8") as f: f.write(json.dumps(results, indent='\t')) + + if len(parsed_list) > 0: + transcription = parsed_list + transcription joined = '\n'.join(transcription) with open(f'{outdir}/train.txt', 'w', encoding="utf-8") as f: