From 230da8b5597932292f7b90fb5b87c543c5fe5079 Mon Sep 17 00:00:00 2001
From: mrq
Date: Sun, 12 May 2024 13:22:08 -0500
Subject: [PATCH] should be the final things to scramble around for, DAC's
 24KHz model is unusable for this, but both encodec's 24KHz and DAC's 44KHz
 work

---
 README.md         | 2 +-
 vall_e/emb/qnt.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 987ced4..0ddd612 100755
--- a/README.md
+++ b/README.md
@@ -143,7 +143,7 @@ For audio backends:
 * [`vocos`](https://huggingface.co/charactr/vocos-encodec-24khz): a higher quality EnCodec decoder.
   - encoding audio will use the `encodec` backend automagically, as there's no EnCodec encoder under `vocos`
 * [`descript-audio-codec`](https://github.com/descriptinc/descript-audio-codec): boasts better compression and quality
-  - **Note** models using `descript-audio-codec` at 24KHz + 6kbps will NOT converge. Unknown if 44KHz fares any better.
+  - **Note** models using `descript-audio-codec` at 24KHz + 8kbps will NOT converge. Audio encoded through the 44KHz seems to work.
 
 `llama`-based models also support different attention backends:
 * `math`: torch's SDPA's `math` implementation
diff --git a/vall_e/emb/qnt.py b/vall_e/emb/qnt.py
index caa53b6..604d3eb 100755
--- a/vall_e/emb/qnt.py
+++ b/vall_e/emb/qnt.py
@@ -276,7 +276,7 @@ def encode(wav: Tensor, sr: int = cfg.sample_rate, device="cuda", levels=cfg.mod
 	if not isinstance(levels, int):
 		levels = 8 if model.model_type == "24khz" else None
 
-	with torch.autocast("cuda", dtype=torch.bfloat16, enabled=False): # or True for about 2x speed, not enabling by default for systems that do not have bfloat16
+	with torch.autocast("cuda", dtype=cfg.inference.dtype, enabled=cfg.inference.amp):
 		artifact = model.compress(signal, win_duration=None, verbose=False, n_quantizers=levels)
 
 	# trim to 8 codebooks if 24Khz