Added an arg to skip processing speakers without enough utterances, for whenever I get around to processing my subset of Emilia for nvidia/audio-codec-44khz (because Emilia has a ton of speakers with low utterance counts, and right now my focus with the nemo model is on getting it to actually speak without many problems rather than feeding it a gorillion speakers)

2025-02-18 10:49:21 -06:00 · 2025-02-18 10:49:21 -06:00 · 596c2df11c
commit 596c2df11c
parent 8331eee6fa
1 changed files with 6 additions and 0 deletions
--- a/vall_e/emb/process.py
+++ b/vall_e/emb/process.py
@@ -178,6 +178,7 @@ def process(
 	batch_size=1,
 	max_duration=None,
 	max_samples=None,
+	min_utterances=None,
 	skip_existing_folders=False,
 	low_memory=False,
 	strict_languages=False,
@@ -334,6 +335,9 @@ def process(
 					i = 0
 					presliced = not inpath.exists()
 					
+					if min_utterances and len(metadata[filename]["segments"]) < min_utterances:
+						continue
+
 					for segment in metadata[filename]["segments"]:
 						id = pad(i, 4)
 						i = i + 1
@@ -410,6 +414,7 @@ def main():
 	parser.add_argument("--batch-size", type=int, default=0)
 	parser.add_argument("--max-duration", type=int, default=0)
 	parser.add_argument("--max-samples", type=int, default=0)
+	parser.add_argument("--min-utterances", type=int, default=0)
 	
 	parser.add_argument("--device", type=str, default="cuda")
 	parser.add_argument("--dtype", type=str, default="bfloat16")
@@ -442,6 +447,7 @@ def main():
 		batch_size=args.batch_size,
 		max_duration=args.max_duration,
 		max_samples=args.max_samples,
+		min_utterances=args.min_utterances,
 		skip_existing_folders=args.skip_existing_folders,
 		strict_languages=args.strict_languages,