additional cruft, added a note in documentation to be aware of NUMA node topology when running vall_e.emb.process with more than one process

This commit is contained in:
mrq 2025-02-18 19:56:30 -06:00
parent 596c2df11c
commit 92139b6da9
2 changed files with 9 additions and 3 deletions
docs
vall_e/emb

View File

@ -98,6 +98,9 @@ This process can utilize sliced segments within the transcription metadata, or u
Refer to the `__main__`'s arguments for usage details.
> [!NOTE]
> If you're using this to try and split your workload over multiple processes / GPUs, it is *imperative* to keep each process within its own NUMA node by prefixing with `numactl -N0 -m0`, or you'll experience bottlenecks that make processing worse off compared to just doing it with one GPU.
## `similar.py`
This script handles taking either raw input audio, or processed encoded audio, and determines the top-K similar utterances for each sample for a given speaker (or dataset).

View File

@ -171,6 +171,7 @@ def process(
input_voice=None,
input_metadata="metadata",
output_dataset="training",
transcription_filename="whisper.json",
raise_exceptions=False,
stride=0,
stride_offset=0,
@ -266,17 +267,17 @@ def process(
continue
metadata_path = Path(f'./{input_metadata}/{group_name}/{speaker_id}/whisper.json')
metadata_path = Path(f'./{input_metadata}/{group_name}/{speaker_id}/{transcription_filename}')
if not metadata_path.exists():
missing["transcription"].append(str(metadata_path))
_logger.warning(f'Missing transcription metadata: ./{input_audio}/{group_name}/{speaker_id}/whisper.json')
_logger.warning(f'Missing transcription metadata: ./{input_audio}/{group_name}/{speaker_id}/{transcription_filename}')
continue
try:
metadata = json.loads(open(metadata_path, "r", encoding="utf-8").read())
except Exception as e:
missing["transcription"].append(str(metadata_path))
_logger.warning(f'Failed to open transcription metadata: ./{input_audio}/{group_name}/{speaker_id}/whisper.json: {e}')
_logger.warning(f'Failed to open transcription metadata: ./{input_audio}/{group_name}/{speaker_id}/{transcription_filename}: {e}')
continue
if f'{group_name}/{speaker_id}' not in dataset:
@ -404,6 +405,7 @@ def main():
parser.add_argument("--input-voice", type=str, default=None)
parser.add_argument("--input-metadata", type=str, default="training/metadata")
parser.add_argument("--output-dataset", type=str, default="training/dataset")
parser.add_argument("--transcription-filename", type=str, default="whisper.json")
parser.add_argument("--raise-exceptions", action="store_true")
parser.add_argument("--low-memory", action="store_true")
parser.add_argument("--skip-existing-folders", action="store_true")
@ -440,6 +442,7 @@ def main():
input_voice=args.input_voice,
input_metadata=args.input_metadata,
output_dataset=args.output_dataset,
transcription_filename=args.transcription_filename,
raise_exceptions=args.raise_exceptions,
stride=args.stride,
stride_offset=args.stride_offset,