From 9f738fbd5b93cf5760436b8d3b6ae4f64b47f04d Mon Sep 17 00:00:00 2001 From: mrq Date: Sat, 4 May 2024 23:09:18 -0500 Subject: [PATCH] seems I actually don't need RVQ bins 9-32 with the 24Khz DAC model........ (time to requantize my audio...) --- vall_e/emb/qnt.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vall_e/emb/qnt.py b/vall_e/emb/qnt.py index 30b8bdd..2983879 100755 --- a/vall_e/emb/qnt.py +++ b/vall_e/emb/qnt.py @@ -145,8 +145,6 @@ def _load_vocos_model(device="cuda", levels=cfg.model.max_levels): def _load_dac_model(device="cuda", levels=cfg.model.max_levels): kwargs = dict(model_type="44khz",model_bitrate="8kbps",tag="latest") - # just use the 44K model, the 24K model has 32 codebooks... - """ # yes there's a better way, something like f'{cfg.sample.rate//1000}hz' if cfg.sample_rate == 44_000: kwargs["model_type"] = "44kz" @@ -156,7 +154,6 @@ def _load_dac_model(device="cuda", levels=cfg.model.max_levels): kwargs["model_type"] = "16khz" else: raise Exception(f'unsupported sample rate: {cfg.sample_rate}') - """ model = __load_dac_model(**kwargs) model = model.to(device) @@ -168,7 +165,9 @@ def _load_dac_model(device="cuda", levels=cfg.model.max_levels): # updating it here will affect the sample rate the waveform is resampled to on encoding if cfg.variable_sample_rate: model.sample_rate = cfg.sample_rate + model.backend = "dac" + model.model_type = kwargs["model_type"] return model @@ -274,6 +273,11 @@ def encode(wav: Tensor, sr: int = cfg.sample_rate, device="cuda", levels=cfg.mod model = _load_dac_model(device, levels=levels) signal = AudioSignal(wav, sample_rate=sr) artifact = model.compress(signal, 5.0, verbose=False, n_quantizers=levels if isinstance(levels, int) else None) + + # trim to 8 codebooks if 24Khz + if model.model_type == "24khz": + artifact.codes = artifact.codes[:, :8, :] + return artifact.codes if not return_metadata else artifact # vocos does not encode wavs to encodecs, so just use normal encodec