Forgot to re-append the existing transcriptions when skipping existing files (so I have to go back and redo the first 10% of my giant dataset).

2023-03-06 16:50:55 +00:00 · 2023-03-06 16:50:55 +00:00 · 119ac50c58
commit 119ac50c58
parent da0af4c498
1 changed files with 4 additions and 1 deletions
--- a/src/utils.py
+++ b/src/utils.py
@@ -1057,8 +1057,8 @@ def prepare_dataset( files, outdir, language=None, skip_existings=False, progres
 	files = sorted(files)
 	previous_list = []
 	parsed_list = []
 	if skip_existings and os.path.exists(f'{outdir}/train.txt'):
 		parsed_list = []
 		with open(f'{outdir}/train.txt', 'r', encoding="utf-8") as f:
 			parsed_list = f.readlines()
@@ -1109,6 +1109,9 @@ def prepare_dataset( files, outdir, language=None, skip_existings=False, progres
 	with open(f'{outdir}/whisper.json', 'w', encoding="utf-8") as f:
 		f.write(json.dumps(results, indent='\t'))
 	if len(parsed_list) > 0:
 		transcription = parsed_list + transcription
 	joined = '\n'.join(transcription)
 	with open(f'{outdir}/train.txt', 'w', encoding="utf-8") as f: