diff --git a/data/qnt.dac b/data/qnt.dac new file mode 100644 index 0000000..bb6a3e9 Binary files /dev/null and b/data/qnt.dac differ diff --git a/vall_e/config.py b/vall_e/config.py index 363e003..e9a1dfa 100755 --- a/vall_e/config.py +++ b/vall_e/config.py @@ -318,7 +318,6 @@ class Model: name: str = "ar+nar" # vanity name for the model version: int = 5 # 1 = old with MultiEmbedding, 2 = new with AudioEmbedding, 3+ = additional embeddings size: str | dict = "full" # preset string or explicitly defined dimensionality - resp_levels: int = 8 # RVQ-bin levels this model supports tasks: int = 8 # ["tts", "ns", "sr", "tse", "cse", "nse"] and leaves two more for anything else I want (like "svc") (unused) langs: int = 1 # defined languages (semi-unused) tones: int = 1 # defined tones (unsued) @@ -382,6 +381,16 @@ class Model: def tokens(self): return self.audio_tokens + @property + def resp_levels(self): + if isinstance(self.size, dict) and "resp_levels" in self.size: + return self.size['resp_levels'] + + if cfg.audio_backend == "dac": + return 9 + + return 8 + @property def audio_tokens(self): if isinstance(self.size, dict) and "audio_tokens" in self.size: @@ -1043,6 +1052,10 @@ class Config(BaseConfig): if not isinstance( model, dict ): continue + # was made an inherent property tied to audio_backend + if "resp_levels" in model: + del model["resp_levels"] + # to-do: prune unused keys in here too automatically if "experimental" not in model or not model["experimental"]: model["experimental"] = {} diff --git a/vall_e/emb/codecs/dac.py b/vall_e/emb/codecs/dac.py index ff6f660..0eddcd2 100644 --- a/vall_e/emb/codecs/dac.py +++ b/vall_e/emb/codecs/dac.py @@ -2,7 +2,7 @@ import torch from dac import DACFile from audiotools import AudioSignal -from dac.utils import load_model as __load_dac_model +from dac.utils import load_model as load_dac_model from typing import Union from pathlib import Path diff --git a/vall_e/emb/qnt.py b/vall_e/emb/qnt.py index 458d332..7242a8c 100755 --- a/vall_e/emb/qnt.py +++ b/vall_e/emb/qnt.py @@ -117,7 +117,7 @@ def _load_dac_model(device="cuda", dtype=None): else: raise Exception(f'unsupported sample rate: {cfg.sample_rate}') - model = __load_dac_model(**kwargs) + model = load_dac_model(**kwargs) model = model.to(device) model = model.eval() diff --git a/vall_e/models/ar_nar_v2.py b/vall_e/models/ar_nar_v2.py index a93e3c1..369ee1f 100644 --- a/vall_e/models/ar_nar_v2.py +++ b/vall_e/models/ar_nar_v2.py @@ -828,7 +828,7 @@ def example_usage(): available_tasks = ["tts-nar"] model = AR_NAR_V2(**kwargs).to(cfg.device) - steps = 500 // batch_size + steps = 250 # // batch_size optimizer = cfg.hyperparameters.optimizer.lower() if cfg.yaml_path is not None else "prodigy" scheduler = cfg.hyperparameters.scheduler.lower() if cfg.yaml_path is not None else ""