From b4405c98ea16920f5f514a7981d0ee6ae25fc02e Mon Sep 17 00:00:00 2001
From: mrq <mrq@ecker.tech>
Date: Tue, 10 Oct 2023 19:18:24 -0500
Subject: [PATCH] remove double spaces in the text phonemes (might have caused
 problems.........)

---
 vall_e/data.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/vall_e/data.py b/vall_e/data.py
index 5c1c6ee..08244c9 100755
--- a/vall_e/data.py
+++ b/vall_e/data.py
@@ -143,9 +143,14 @@ def _get_paths_of_extensions( path, extensions=".qnt.pt", validate=False ):
 def _load_quants(path) -> Tensor:
 	return torch.load(_get_quant_path(path))[0][:, :].t().to(torch.int16)
 
+# prune consecutive duplicates of any symbol in `targets` (default: the " " phoneme), keeping the first of each run; returns a new list and never compares index 0 against the last element
+def _cleanup_phones( phones, targets=(" ",) ):
+	return [ p for i, p in enumerate(phones) if p not in targets or i == 0 or p != phones[i-1] ]
+
 @cache
 def _get_phones(path, language="en"):
 	content = open(_get_phone_path(path), "r", encoding="utf-8").read().split(" ")
+	content = _cleanup_phones( [ " " if not p else p for p in content ] ) # split(" ") yields "" (never " ") for repeated spaces, so map "" -> " " before pruning runs
 	return ["<s>"] + [ " " if not p else p for p in content ] + ["</s>"]
 
 def _interleaved_reorder(l, fn):
@@ -333,8 +338,13 @@ class Dataset(_Dataset):
 
 		if cfg.dataset.use_hdf5:
 			key = _get_hdf5_path(path)
-			text = torch.from_numpy(cfg.hdf5[key]["text"][:]).to(self.text_dtype)
-			resps = torch.from_numpy(cfg.hdf5[key]["audio"][:, :]).to(torch.int16)
+			text = cfg.hdf5[key]["text"][:]
+			resps = cfg.hdf5[key]["audio"][:, :]
+
+			text = np.array( _cleanup_phones( text, targets=[ self.phone_symmap[" "] ] ) )
+			
+			text = torch.from_numpy(text).to(self.text_dtype)
+			resps = torch.from_numpy(resps).to(torch.int16)
 		else:
 			text = torch.tensor([*map(self.phone_symmap.get, _get_phones(path))]).to(self.text_dtype)
 			resps = _load_quants(path)