Correcting my mistaken assumption that I could just feed raw 24kHz audio into the 44kHz DAC model without much of an issue (there are issues)

2024-05-04 23:49:15 -05:00 · 2024-05-04 23:49:15 -05:00 · 215800484d
commit 215800484d
parent 9f738fbd5b
3 changed files with 27 additions and 14 deletions
--- a/data/qnt.dac
+++ b/data/qnt.dac
--- a/vall_e/config.py
+++ b/vall_e/config.py
@@ -157,13 +157,24 @@ class Dataset:
 	tasks_list: list[str] = field(default_factory=lambda: ["tts"])
-	_frames_per_second: int = 0 # in encodec, each frame is 75 codes, in dac, each frame is 41
+	_frames_per_second: int = 0 # allows setting your own hint
 	@cached_property
 	def frames_per_second(self):
 		if self._frames_per_second > 0:
 			return self._frames_per_second
-		return 41 if cfg.inference.audio_backend == "dac" else 75
+
 		if cfg.inference.audio_backend == "dac":
 			# using the 44KHz model with 24KHz sources has a frame rate of 41Hz
 			if cfg.variable_sample_rate and cfg.sample_rate == 24_000:
 				return 41
 			if cfg.sample_rate == 44_000:
 				return 86
 			if cfg.sample_rate == 16_000:
 				return 50
 		# 24Khz Encodec / Vocos and incidentally DAC are all at 75Hz
 		return 75
 	@property
 	def min_phones(self):
@@ -562,7 +573,7 @@ class Config(_Config):
 	tokenizer: str = "./tokenizer.json"
 	sample_rate: int = 24_000
-	variable_sample_rate: bool = True # for DAC, this will override the model automatically resampling to 44KHz.
+	variable_sample_rate: bool = False # NOT recommended, as running directly 24Khz audio in the 44Khz DAC model will have detrimental quality loss
 	@property
 	def distributed(self):
--- a/vall_e/emb/qnt.py
+++ b/vall_e/emb/qnt.py
@@ -143,8 +143,9 @@ def _load_vocos_model(device="cuda", levels=cfg.model.max_levels):
@cache
 def _load_dac_model(device="cuda", levels=cfg.model.max_levels):
-	kwargs = dict(model_type="44khz",model_bitrate="8kbps",tag="latest")
+	kwargs = dict(model_type="24khz",model_bitrate="8kbps",tag="latest")
-
+	"""
 	if not cfg.variable_sample_rate:
 		# yes there's a better way, something like f'{cfg.sample.rate//1000}hz'
 		if cfg.sample_rate == 44_000:
 			kwargs["model_type"] = "44kz"
@@ -154,6 +155,7 @@ def _load_dac_model(device="cuda", levels=cfg.model.max_levels):
 			kwargs["model_type"] = "16khz"
 		else:
 			raise Exception(f'unsupported sample rate: {cfg.sample_rate}')
 	"""
 	model = __load_dac_model(**kwargs)
 	model = model.to(device)