From fe241f6a9936e2d77cc4a41a3fd3ed81b388308a Mon Sep 17 00:00:00 2001
From: mrq
Date: Wed, 18 Sep 2024 21:34:43 -0500
Subject: [PATCH] support for wildcard in training/validation/noise dataset array (to-do: a better way to query between metadata folder and data folder)

---
 scripts/process_emilia.py |  1 -
 vall_e/config.py          | 33 ++++++++++++++++++++++++++++++---
 vall_e/data.py            |  2 +-
 3 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/scripts/process_emilia.py b/scripts/process_emilia.py
index 7a9bbe7..75ca908 100644
--- a/scripts/process_emilia.py
+++ b/scripts/process_emilia.py
@@ -138,7 +138,6 @@ def process(
 		speaker_id = metadata["speaker"]
 
 		outpath = Path(f'./{output_dataset}/{group_name}/{speaker_id}/{fname}.{extension}')
-
 		if _replace_file_extension(outpath, audio_extension).exists():
 			continue
 
diff --git a/vall_e/config.py b/vall_e/config.py
index 8383454..9657f80 100755
--- a/vall_e/config.py
+++ b/vall_e/config.py
@@ -10,6 +10,7 @@ import argparse
 import yaml
 import random
 import logging
+import itertools
 
 import torch
 import numpy as np
@@ -802,6 +803,27 @@ class Config(BaseConfig):
 			_logger.warning(f"Error while opening HDF5 file: {self.rel_path}/{self.dataset.hdf5_name}: {str(e)}")
 			self.dataset.use_hdf5 = False
 
+	# a very icky way to handle wildcard expansions
+	def expand( self, path ):
+		if not isinstance( path, Path ):
+			path = Path(path)
+
+		# do not glob
+		if "*" not in str(path):
+			return [ path ]
+
+		metadata_parent = cfg.metadata_dir / path.parent
+		data_parent = cfg.data_dir / path.parent
+
+		if metadata_parent.exists():
+			return [ path.parent / child.stem for child in Path(metadata_parent).glob(path.name) ]
+
+		if data_parent.exists():
+			return [ path.parent / child.name for child in Path(data_parent).glob(path.name) ]
+
+		return path
+
+
 	# to-do: prune unused keys
 	def format( self, training=True ):
 		if isinstance(self.dataset, type):
@@ -829,9 +851,14 @@ class Config(BaseConfig):
 			self.optimizations = dict()
 
 		self.dataset = Dataset(**self.dataset)
-		self.dataset.training = [ Path(dir) for dir in self.dataset.training ]
-		self.dataset.validation = [ Path(dir) for dir in self.dataset.validation ]
-		self.dataset.noise = [ Path(dir) for dir in self.dataset.noise ]
+		# convert to expanded paths
+		self.dataset.training = [ self.expand(dir) for dir in self.dataset.training ]
+		self.dataset.validation = [ self.expand(dir) for dir in self.dataset.validation ]
+		self.dataset.noise = [ self.expand(dir) for dir in self.dataset.noise ]
+		# flatten
+		self.dataset.training = list(itertools.chain.from_iterable(self.dataset.training))
+		self.dataset.validation = list(itertools.chain.from_iterable(self.dataset.validation))
+		self.dataset.noise = list(itertools.chain.from_iterable(self.dataset.noise))
 
 		# do cleanup
 		for model in self.models:
diff --git a/vall_e/data.py b/vall_e/data.py
index f0b0904..6249cb6 100755
--- a/vall_e/data.py
+++ b/vall_e/data.py
@@ -1413,7 +1413,7 @@ def create_dataset_metadata( skip_existing=True ):
 
 		wrote = False
 
-		for id in tqdm(ids, desc=f"Processing {name}"):
+		for id in tqdm(ids, desc=f"Processing {name}", disable=True):
 			try:
 				quant_path = Path(f'{root}/{name}/{id}{_get_quant_extension()}')
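
Note (not part of the commit above): a minimal, self-contained sketch of how the new wildcard expansion behaves, using standalone stand-ins for cfg.metadata_dir, cfg.data_dir, and Config.expand; the directory layout and dataset names below are hypothetical and only for illustration.

    from pathlib import Path
    import itertools

    # hypothetical stand-ins for cfg.metadata_dir / cfg.data_dir in the patch
    metadata_dir = Path("./training/metadata")
    data_dir = Path("./training/data")

    def expand( path ):
        # mirrors Config.expand: pass non-wildcard entries through untouched,
        # otherwise glob against the metadata folder first, then the data folder
        path = Path(path)
        if "*" not in str(path):
            return [ path ]

        metadata_parent = metadata_dir / path.parent
        data_parent = data_dir / path.parent

        if metadata_parent.exists():
            # metadata children are matched by stem, i.e. their file extension is dropped
            return [ path.parent / child.stem for child in metadata_parent.glob(path.name) ]

        if data_parent.exists():
            # data children keep their full name
            return [ path.parent / child.name for child in data_parent.glob(path.name) ]

        # the patch returns the bare path here; a list keeps the return type
        # uniform for the flattening step below (see note underneath)
        return [ path ]

    # a dataset.training list as it might appear in the YAML config, one entry wildcarded
    training = [ "emilia/EN_B00001_*", "librilight/small" ]
    # the same two-step conversion format() performs: expand each entry, then flatten
    expanded = [ expand(dir) for dir in training ]
    training = list(itertools.chain.from_iterable(expanded))
    print(training)

One caveat: when neither the metadata nor the data parent of a wildcard entry exists, Config.expand in the patch falls through to `return path` rather than `return [ path ]`, and the later itertools.chain.from_iterable call cannot iterate a bare Path; the sketch wraps that fallback in a list, but whether that matches the intended behaviour is an assumption.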