Forgot to re-append the existing transcriptions when `skip_existings` is enabled (so I have to go back and redo the first 10% of my giant dataset).

2023-03-06 16:50:55 +00:00 · 2023-03-06 16:50:55 +00:00 · 119ac50c58
commit 119ac50c58
parent da0af4c498
1 changed file with 4 additions and 1 deletion
--- a/src/utils.py
+++ b/src/utils.py
@@ -1057,8 +1057,8 @@ def prepare_dataset( files, outdir, language=None, skip_existings=False, progres
 	files = sorted(files)

 	previous_list = []
+	parsed_list = []
 	if skip_existings and os.path.exists(f'{outdir}/train.txt'):
-		parsed_list = []
 		with open(f'{outdir}/train.txt', 'r', encoding="utf-8") as f:
 			parsed_list = f.readlines()

@@ -1109,6 +1109,9 @@ def prepare_dataset( files, outdir, language=None, skip_existings=False, progres
 	
 	with open(f'{outdir}/whisper.json', 'w', encoding="utf-8") as f:
 		f.write(json.dumps(results, indent='\t'))
+
+	if len(parsed_list) > 0:
+		transcription = parsed_list + transcription
 	
 	joined = '\n'.join(transcription)
 	with open(f'{outdir}/train.txt', 'w', encoding="utf-8") as f: