From 12c51b6057fa0a9dc575a94b7e45f8dfa62d4743 Mon Sep 17 00:00:00 2001 From: mrq Date: Mon, 6 Mar 2023 16:39:37 +0000 Subject: [PATCH] I'm not too sure if manually invoking gc actually closes all the open files from whisperx (or ROCm), but it seems to have gone away alongside setting 'ulimit -Sn' to half the output of 'ulimit -Hn' --- src/utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/utils.py b/src/utils.py index dcf3bc0..cee74e8 100755 --- a/src/utils.py +++ b/src/utils.py @@ -1054,6 +1054,7 @@ def prepare_dataset( files, outdir, language=None, skip_existings=False, progres results = {} transcription = [] + files = sorted(files) previous_list = [] if skip_existings and os.path.exists(f'{outdir}/train.txt'): @@ -1103,6 +1104,8 @@ def prepare_dataset( files, outdir, language=None, skip_existings=False, progres transcription.append(line) with open(f'{outdir}/train.txt', 'a', encoding="utf-8") as f: f.write(f'{line}\n') + + do_gc() with open(f'{outdir}/whisper.json', 'w', encoding="utf-8") as f: f.write(json.dumps(results, indent='\t'))