Correcting my mistaken assumption that I could just feed raw 24KHz audio into the 44KHz DAC without too much of an issue (there are issues)

This commit is contained in:
mrq 2024-05-04 23:49:15 -05:00
parent 9f738fbd5b
commit 215800484d
3 changed files with 27 additions and 14 deletions

Binary file not shown.

View File

@ -157,13 +157,24 @@ class Dataset:
tasks_list: list[str] = field(default_factory=lambda: ["tts"])
_frames_per_second: int = 0 # in encodec, each frame is 75 codes, in dac, each frame is 41
_frames_per_second: int = 0 # allows setting your own hint
@cached_property
def frames_per_second(self):
if self._frames_per_second > 0:
return self._frames_per_second
return 41 if cfg.inference.audio_backend == "dac" else 75
if cfg.inference.audio_backend == "dac":
# using the 44KHz model with 24KHz sources has a frame rate of 41Hz
if cfg.variable_sample_rate and cfg.sample_rate == 24_000:
return 41
if cfg.sample_rate == 44_000:
return 86
if cfg.sample_rate == 16_000:
return 50
# 24Khz Encodec / Vocos and incidentally DAC are all at 75Hz
return 75
@property
def min_phones(self):
@ -562,7 +573,7 @@ class Config(_Config):
tokenizer: str = "./tokenizer.json"
sample_rate: int = 24_000
variable_sample_rate: bool = True # for DAC, this will override the model automatically resampling to 44KHz.
variable_sample_rate: bool = False # NOT recommended, as running directly 24Khz audio in the 44Khz DAC model will have detrimental quality loss
@property
def distributed(self):

View File

@ -143,17 +143,19 @@ def _load_vocos_model(device="cuda", levels=cfg.model.max_levels):
@cache
def _load_dac_model(device="cuda", levels=cfg.model.max_levels):
kwargs = dict(model_type="44khz",model_bitrate="8kbps",tag="latest")
# yes there's a better way, something like f'{cfg.sample.rate//1000}hz'
if cfg.sample_rate == 44_000:
kwargs["model_type"] = "44kz"
elif cfg.sample_rate == 24_000:
kwargs["model_type"] = "24khz"
elif cfg.sample_rate == 16_000:
kwargs["model_type"] = "16khz"
else:
raise Exception(f'unsupported sample rate: {cfg.sample_rate}')
kwargs = dict(model_type="24khz",model_bitrate="8kbps",tag="latest")
"""
if not cfg.variable_sample_rate:
# yes there's a better way, something like f'{cfg.sample.rate//1000}hz'
if cfg.sample_rate == 44_000:
kwargs["model_type"] = "44kz"
elif cfg.sample_rate == 24_000:
kwargs["model_type"] = "24khz"
elif cfg.sample_rate == 16_000:
kwargs["model_type"] = "16khz"
else:
raise Exception(f'unsupported sample rate: {cfg.sample_rate}')
"""
model = __load_dac_model(**kwargs)
model = model.to(device)