From 230da8b5597932292f7b90fb5b87c543c5fe5079 Mon Sep 17 00:00:00 2001
From: mrq
Date: Sun, 12 May 2024 13:22:08 -0500
Subject: [PATCH] should be the final things to scramble around for, DAC's
 24KHz model is unusable for this, but both encodec's 24KHz and DAC's 44KHz
 work

---
 README.md         | 2 +-
 vall_e/emb/qnt.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 987ced4..0ddd612 100755
--- a/README.md
+++ b/README.md
@@ -143,7 +143,7 @@ For audio backends:
 * [`vocos`](https://huggingface.co/charactr/vocos-encodec-24khz): a higher quality EnCodec decoder.
   - encoding audio will use the `encodec` backend automagically, as there's no EnCodec encoder under `vocos`
 * [`descript-audio-codec`](https://github.com/descriptinc/descript-audio-codec): boasts better compression and quality
-  - **Note** models using `descript-audio-codec` at 24KHz + 6kbps will NOT converge. Unknown if 44KHz fares any better.
+  - **Note** models using `descript-audio-codec` at 24KHz + 8kbps will NOT converge. Audio encoded through the 44KHz seems to work.
 
 `llama`-based models also support different attention backends:
 * `math`: torch's SDPA's `math` implementation
diff --git a/vall_e/emb/qnt.py b/vall_e/emb/qnt.py
index caa53b6..604d3eb 100755
--- a/vall_e/emb/qnt.py
+++ b/vall_e/emb/qnt.py
@@ -276,7 +276,7 @@ def encode(wav: Tensor, sr: int = cfg.sample_rate, device="cuda", levels=cfg.mod
 	if not isinstance(levels, int):
 		levels = 8 if model.model_type == "24khz" else None
 
-	with torch.autocast("cuda", dtype=torch.bfloat16, enabled=False): # or True for about 2x speed, not enabling by default for systems that do not have bfloat16
+	with torch.autocast("cuda", dtype=cfg.inference.dtype, enabled=cfg.inference.amp):
 		artifact = model.compress(signal, win_duration=None, verbose=False, n_quantizers=levels)
 
 	# trim to 8 codebooks if 24Khz