From fb4e81682329f11b77fb44d27bb3dc3d21e12387 Mon Sep 17 00:00:00 2001
From: mrq
Date: Fri, 18 Aug 2023 21:11:19 -0500
Subject: [PATCH] oops

---
 README.md         | 14 ++++----------
 data/config.yaml  |  5 ++++-
 vall_e/config.py  |  2 +-
 vall_e/emb/qnt.py | 14 ++++++--------
 vall_e/train.py   |  2 +-
 5 files changed, 16 insertions(+), 21 deletions(-)

diff --git a/README.md b/README.md
index b283908..d46d4d8 100755
--- a/README.md
+++ b/README.md
@@ -64,20 +64,18 @@ python -m vall_e.emb.g2p ./data/custom
 ```
 
-4. Customize your configuration and define the dataset by modifying `./data/config.yml`. Refer to `./vall_e/config.py` for details. If you want to choose between different model presets, check `./vall_e/models/__init__.py`.
-
-> **Note** Be sure to set `distributd: True` to ensure the `DistributedSampler` is used. In the future, I'll have it automagically detect this.
+4. Customize your configuration and define the dataset by modifying `./data/config.yaml`. Refer to `./vall_e/config.py` for details. If you want to choose between different model presets, check `./vall_e/models/__init__.py`.
 
 If you're interested in creating an HDF5 copy of your dataset, simply invoke:
 
 ```
-python -m vall_e.data yaml='./data/config.yaml'
+python -m vall_e.data --create-hdf5 yaml='./data/config.yaml'
 ```
 
 5. Train the AR and NAR models using the following scripts:
 
 ```
-python -m vall_e.train yaml=./data/config.yml
+python -m vall_e.train yaml=./data/config.yaml
 ```
 
 You may quit your training any time by just typing `quit` in your CLI. The latest checkpoint will be automatically saved.
 
@@ -92,16 +90,12 @@ Two dataset formats are supported:
   - this will shove everything into a single HDF5 file and store some metadata alongside (for now, the symbol map generated, and text/audio lengths)
   - be sure to also define `use_hdf5` in your config YAML.
 
-### Training Tip
-
-Training a VALL-E model is very, very meticulous. I've fiddled with a lot of """clever""" tricks, but it seems the best is just to pick the highest LR you can get (this heavily depends on your batch size, but hyperparameters of bs=64 * ga=16 on the quarter sized model has an LR of 1.0e-3 stable, while the full size model with hyperparameters of bs=16 * ga=64 needed smaller). Like typical training, it entirely depends on your tradeoff betweeen stability and time.
-
 ### Export
 
 Both trained models *can* be exported, but is only required if loading them on systems without DeepSpeed for inferencing (Windows systems).
 
 To export the models, run:
 ```
-python -m vall_e.export yaml=./config/custom.yml
+python -m vall_e.export yaml=./data/config.yaml
 ```
 This will export the latest checkpoints.
diff --git a/data/config.yaml b/data/config.yaml
index 2fd964a..152ae61 100755
--- a/data/config.yaml
+++ b/data/config.yaml
@@ -26,13 +26,16 @@ models:
     size: "full"
     resp_levels: 1
     arch_type: "retnet"
+    prom_levels: 2
+    tasks: 8
 
   - name: "nar"
     size: "full"
     resp_levels: 1
     arch_type: "retnet"
+    prom_levels: 2
+    tasks: 8
 
-  prom_levels: 2
 
 hyperparameters:
   batch_size: 8
diff --git a/vall_e/config.py b/vall_e/config.py
index 9318393..640859d 100755
--- a/vall_e/config.py
+++ b/vall_e/config.py
@@ -138,7 +138,7 @@ class Model:
     size: str = "full"
     resp_levels: int = 1
     prom_levels: int = 8
-    tasks: int = 8 # ["tts", "ns", "sr", "tse", "cse", "nse"] and leaves two more for anything else I want (like "svc")
+    tasks: int = 1 # 8 # ["tts", "ns", "sr", "tse", "cse", "nse"] and leaves two more for anything else I want (like "svc")
     arch_type: str = "transformer"
 
     @property
diff --git a/vall_e/emb/qnt.py b/vall_e/emb/qnt.py
index dba7938..4b425a5 100755
--- a/vall_e/emb/qnt.py
+++ b/vall_e/emb/qnt.py
@@ -26,11 +26,11 @@ def _load_encodec_model(device="cuda"):
     assert cfg.sample_rate == 24_000
 
     # too lazy to un-if ladder this shit
-    if cfg.models.levels == 2:
+    if cfg.models.prom_levels == 2:
         bandwidth_id = 1.5
-    elif cfg.models.levels == 4:
+    elif cfg.models.prom_levels == 4:
         bandwidth_id = 3.0
-    elif cfg.models.levels == 8:
+    elif cfg.models.prom_levels == 8:
         bandwidth_id = 6.0
 
     model = EncodecModel.encodec_model_24khz().to(device)
@@ -49,11 +49,11 @@ def _load_vocos_model(device="cuda"):
     model = model.to(device)
 
     # too lazy to un-if ladder this shit
-    if cfg.models.levels == 2:
+    if cfg.models.prom_levels == 2:
         bandwidth_id = 0
-    elif cfg.models.levels == 4:
+    elif cfg.models.prom_levels == 4:
         bandwidth_id = 1
-    elif cfg.models.levels == 8:
+    elif cfg.models.prom_levels == 8:
         bandwidth_id = 2
 
     model.bandwidth_id = torch.tensor([bandwidth_id], device=device)
@@ -142,8 +142,6 @@ def encode(wav: Tensor, sr: int, device="cuda"):
 
     encoded_frames = model.encode(wav)
     qnt = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1) # (b q t)
-
-    # duration = qnt.shape[-1] / 75
 
     return qnt
 
diff --git a/vall_e/train.py b/vall_e/train.py
index 3a67ea3..2e984c7 100755
--- a/vall_e/train.py
+++ b/vall_e/train.py
@@ -93,7 +93,7 @@ def run_eval(engines, eval_name, dl):
             stats['loss'].append(mel_stft_loss(hyp_audio, ref_audio).item())
         except Exception as e:
             stats['loss'].append(0)
-            print(str(e))
+            print(traceback.format_exc())
 
     processed = 0
     for batch in tqdm(dl):
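
A side note on the `vall_e/emb/qnt.py` hunks above: both loaders repeat the same `prom_levels` -> bandwidth if-ladder (the in-file "too lazy to un-if ladder this shit" comment acknowledges it). Below is a minimal sketch of a table-lookup alternative, not part of the patch; the helper name and the error handling are hypothetical, and only the 2/4/8 mappings are taken from the hunks.

# Sketch only -- not part of this patch. Collapses the repeated
# prom_levels -> bandwidth if-ladders from vall_e/emb/qnt.py into a
# single dict lookup. The helper name and the ValueError are
# hypothetical; the mapped values come straight from the hunks above.

ENCODEC_BANDWIDTH = {2: 1.5, 4: 3.0, 8: 6.0}  # bandwidth (kbps) used by the EnCodec loader
VOCOS_BANDWIDTH_ID = {2: 0, 4: 1, 8: 2}       # bandwidth_id index used by the Vocos loader

def bandwidth_for(prom_levels: int, backend: str = "encodec"):
    """Map a prom_levels setting to the matching bandwidth value."""
    table = ENCODEC_BANDWIDTH if backend == "encodec" else VOCOS_BANDWIDTH_ID
    if prom_levels not in table:
        raise ValueError(f"unsupported prom_levels: {prom_levels}")
    return table[prom_levels]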