converting over to a different intermediary dataset format

2024-04-18 21:24:06 -05:00 · 2024-04-18 21:24:06 -05:00 · 8214aa23d7
commit 8214aa23d7
parent 4f5c9e518a
2 changed files with 60 additions and 33 deletions
--- a/vall_e/data.py
+++ b/vall_e/data.py
@@ -63,11 +63,17 @@ def get_task_symmap():
 def _replace_file_extension(path, suffix):
 	return (path.parent / path.name.split(".")[0]).with_suffix(suffix)
 def _get_quant_extension():
 	return ".dac"
 def _get_phone_extension():
 	return ".json"
 def _get_quant_path(path):
-	return _replace_file_extension(path, ".qnt.pt")
+	return _replace_file_extension(path, _get_quant_extension())
 def _get_phone_path(path):
-	return _replace_file_extension(path, ".phn.txt")
+	return _replace_file_extension(path, _get_phone_extension())
 _total_durations = {}
@@ -101,7 +107,7 @@ def _load_paths_from_metadata(data_dir, type="training", validate=False):
 		metadata = json.loads(open( metadata_path, "r", encoding="utf-8" ).read())
 	if len(metadata) == 0:
-		return _fn( data_dir, type if cfg.dataset.use_hdf5 else ".qnt.pt", validate )
+		return _fn( data_dir, type if cfg.dataset.use_hdf5 else _get_quant_extension(), validate )
 	def key( dir, id ):
 		if not cfg.dataset.use_hdf5:
@@ -134,7 +140,7 @@ def _get_hdf5_paths( data_dir, type="training", validate=False ):
 	key = f"/{type}{_get_hdf5_path(data_dir)}"
 	return [ Path(f"{key}/{child.attrs['id']}") for child in cfg.hdf5[key].values() if not validate or _validate(child) ] if key in cfg.hdf5 else []
-def _get_paths_of_extensions( path, extensions=".qnt.pt", validate=False ):
+def _get_paths_of_extensions( path, extensions=_get_quant_extension(), validate=False ):
 	if isinstance(path, str):
 		path = Path(path)
@@ -154,6 +160,10 @@ def _get_paths_of_extensions( path, extensions=".qnt.pt", validate=False ):
 	return [ p for p in list(path.iterdir()) if _validate(p) ] if path.exists() and path.is_dir() else []
 def _load_quants(path) -> Tensor:
 	if _get_quant_extension() == ".dac":
 		qnt = np.load(_get_quant_path(path), allow_pickle=True)[()]
 		return torch.from_numpy(qnt["codes"].astype(int))[0][:, :].t().to(torch.int16)
 	return torch.load(_get_quant_path(path))[0][:, :].t().to(torch.int16)
 # prune consecutive spaces
@@ -162,8 +172,12 @@ def _cleanup_phones( phones, targets=[" "]):
 @cache
 def _get_phones(path, language="en"):
-	content = open(_get_phone_path(path), "r", encoding="utf-8").read().split(" ")
+	if _get_quant_extension() == ".json":
-	content = _cleanup_phones( content )
+		metadata = json.loads(open(_get_phone_path(path), "r", encoding="utf-8").read())
 		content = metadata["phonemes"]
 	else:
 		content = open(_get_phone_path(path), "r", encoding="utf-8").read().split(" ")
 		content = _cleanup_phones( content )
 	return ["<s>"] + [ " " if not p else p for p in content ] + ["</s>"]
 def _interleaved_reorder(l, fn):
@@ -807,11 +821,12 @@ def create_dataset_hdf5( skip_existing=True ):
 		files = os.listdir(f'{root}/{name}/')
 		# grab IDs for every file
-		ids = { ".".join(file.split(".")[:-2]) for file in files }
+		ids = { file.replace(_get_quant_extension(), "").replace(_get_phone_extension(), "") for file in files }
 		for id in tqdm(ids, desc=f"Processing {name}"):
 			try:
-				audio_exists = os.path.exists(f'{root}/{name}/{id}.qnt.pt') if audios else True
+				audio_exists = os.path.exists(f'{root}/{name}/{id}{_get_quant_extension()}') if audios else True
-				text_exists = os.path.exists(f'{root}/{name}/{id}.phn.txt') if texts else True
+				text_exists = os.path.exists(f'{root}/{name}/{id}{_get_phone_extension()}') if texts else True
 				if not audio_exists or not text_exists:
 					continue
@@ -831,21 +846,34 @@ def create_dataset_hdf5( skip_existing=True ):
 				# audio
 				if audios:
-					qnt = torch.load(f'{root}/{name}/{id}.qnt.pt')[0].t()
+					qnt = np.load(f'{root}/{name}/{id}{_get_quant_extension()}', allow_pickle=True)[()]
 					codes = torch.from_numpy(qnt["codes"].astype(int))[0].t()
 					if _get_quant_extension() == ".dac":
 						if "audio" in group:
 							del group["audio"]
 						duration = qnt["metadata"]["original_length"] / qnt["metadata"]["sample_rate"]
 						metadata[id]["metadata"] = qnt["metadata"]
 					else:
 						qnt = torch.load(f'{root}/{name}/{id}{_get_quant_extension()}')[0].t()
 						duration = qnt.shape[0] / 75
 					if "audio" in group:
 						del group["audio"]
 					group.create_dataset('audio', data=qnt.numpy(), compression='lzf')
-					group.attrs['duration'] = qnt.shape[0] # / 75
+
-					metadata[id]["duration"] = qnt.shape[0] # / 75
+					group.attrs['duration'] = duration
 					metadata[id]["duration"] = duration
 				else:
 					group.attrs['duration'] = 0
 					metadata[id]["duration"] = 0
 				# text
 				if texts:
-					"""
+					if _get_quant_extension() == ".json":
-					content = open(f'{root}/{name}/{id}.phn.txt', "r", encoding="utf-8") .read().split(" ")
+						j_son = json.loads(open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read())
 						content = j_son["phonemes"]
 					else:
 						content = open(f'{root}/{name}/{id}{_get_phone_extension()}', "r", encoding="utf-8").read().split(" ")
 					phones = [f"<s>"] + [ " " if not p else p for p in content ] + [f"</s>"]
 					for s in set(phones):
 						if s not in symmap:
@@ -858,7 +886,6 @@ def create_dataset_hdf5( skip_existing=True ):
 					group.create_dataset('text', data=phn, compression='lzf', chunks=True)
 					group.create_dataset('transcription', data=txt, compression='lzf', chunks=True)
 					"""
 					group.attrs['phonemes'] = len(phn)
 					metadata[id]["phones"] = len(phn)
--- a/vall_e/emb/qnt.py
+++ b/vall_e/emb/qnt.py
@@ -66,22 +66,22 @@ try:
 		# to-do, original implementation
 		"""
 		resample_fn = recons.resample
        loudness_fn = recons.loudness
        # If audio is > 10 minutes long, use the ffmpeg versions
        if recons.signal_duration >= 10 * 60 * 60:
            resample_fn = recons.ffmpeg_resample
            loudness_fn = recons.ffmpeg_loudness
        recons.normalize(obj.input_db)
        resample_fn(obj.sample_rate)
        recons = recons[..., : obj.original_length]
        loudness_fn()
        recons.audio_data = recons.audio_data.reshape(
            -1, obj.channels, obj.original_length
        )
 		"""
 		resample_fn = recons.resample
 		loudness_fn = recons.loudness
 		# If audio is > 10 minutes long, use the ffmpeg versions
 		if recons.signal_duration >= 10 * 60 * 60:
 			resample_fn = recons.ffmpeg_resample
 			loudness_fn = recons.ffmpeg_loudness
 		recons.normalize(obj.input_db)
 		resample_fn(obj.sample_rate)
 		recons = recons[..., : obj.original_length]
 		loudness_fn()
 		recons.audio_data = recons.audio_data.reshape(
 			-1, obj.channels, obj.original_length
 		)
 		self.padding = original_padding
 		return recons
@@ -228,7 +228,7 @@ def decode(codes: Tensor, device="cuda", levels=cfg.model.max_levels, metadata=N
 			dac_version = metadata["dac_version"] if isinstance(metadata, dict) else metadata.dac_version,
 		)
-		return model.decompress(artifact, verbose=False).audio_data[0], model.sample_rate
+		return model.decompress(artifact, verbose=False).audio_data[0], artifact.sample_rate
 	kwargs = {}