diff --git a/data/qnt.dac b/data/qnt.dac index f4594c1..20ba61a 100644 Binary files a/data/qnt.dac and b/data/qnt.dac differ diff --git a/vall_e/config.py b/vall_e/config.py index 3a06730..38f775e 100755 --- a/vall_e/config.py +++ b/vall_e/config.py @@ -157,13 +157,24 @@ class Dataset: tasks_list: list[str] = field(default_factory=lambda: ["tts"]) - _frames_per_second: int = 0 # in encodec, each frame is 75 codes, in dac, each frame is 41 + _frames_per_second: int = 0 # allows setting your own hint @cached_property def frames_per_second(self): if self._frames_per_second > 0: return self._frames_per_second - return 41 if cfg.inference.audio_backend == "dac" else 75 + + if cfg.inference.audio_backend == "dac": + # using the 44KHz model with 24KHz sources has a frame rate of 41Hz + if cfg.variable_sample_rate and cfg.sample_rate == 24_000: + return 41 + if cfg.sample_rate == 44_000: + return 86 + if cfg.sample_rate == 16_000: + return 50 + + # 24Khz Encodec / Vocos and incidentally DAC are all at 75Hz + return 75 @property def min_phones(self): @@ -562,7 +573,7 @@ class Config(_Config): tokenizer: str = "./tokenizer.json" sample_rate: int = 24_000 - variable_sample_rate: bool = True # for DAC, this will override the model automatically resampling to 44KHz. 
+ variable_sample_rate: bool = False # NOT recommended: feeding 24kHz audio directly into the 44kHz DAC model causes a detrimental loss of quality @property def distributed(self): diff --git a/vall_e/emb/qnt.py b/vall_e/emb/qnt.py index 2983879..5b43f6c 100755 --- a/vall_e/emb/qnt.py +++ b/vall_e/emb/qnt.py @@ -143,17 +143,19 @@ def _load_vocos_model(device="cuda", levels=cfg.model.max_levels): @cache def _load_dac_model(device="cuda", levels=cfg.model.max_levels): - kwargs = dict(model_type="44khz",model_bitrate="8kbps",tag="latest") - - # yes there's a better way, something like f'{cfg.sample.rate//1000}hz' - if cfg.sample_rate == 44_000: - kwargs["model_type"] = "44kz" - elif cfg.sample_rate == 24_000: - kwargs["model_type"] = "24khz" - elif cfg.sample_rate == 16_000: - kwargs["model_type"] = "16khz" - else: - raise Exception(f'unsupported sample rate: {cfg.sample_rate}') + kwargs = dict(model_type="24khz",model_bitrate="8kbps",tag="latest") + """ + if not cfg.variable_sample_rate: + # yes there's a better way, something like f'{cfg.sample.rate//1000}hz' + if cfg.sample_rate == 44_000: + kwargs["model_type"] = "44kz" + elif cfg.sample_rate == 24_000: + kwargs["model_type"] = "24khz" + elif cfg.sample_rate == 16_000: + kwargs["model_type"] = "16khz" + else: + raise Exception(f'unsupported sample rate: {cfg.sample_rate}') + """ model = __load_dac_model(**kwargs) model = model.to(device)