From 069b27570f3f8f719a7290abda90bf00c465586b Mon Sep 17 00:00:00 2001
From: mrq
Date: Sun, 17 Nov 2024 17:04:07 -0600
Subject: [PATCH] set option to set training masking ratio (I don't think for tts a fixed masking ratio is beneficial since the magic of the AR+NAR is being able to still reference the prior sequence of tokens for predicting things)

---
 vall_e/config.py        |  5 ++++-
 vall_e/demo.py          |  7 -------
 vall_e/models/ar_nar.py |  5 +++++
 vall_e/models/base.py   | 30 ++++++++++++------------------
 4 files changed, 21 insertions(+), 26 deletions(-)

diff --git a/vall_e/config.py b/vall_e/config.py
index 07751c7..c3bea46 100755
--- a/vall_e/config.py
+++ b/vall_e/config.py
@@ -261,7 +261,7 @@ class ModelExperimentalSettings:
 	masking_train_p: float = 0.0 # odds of training with masking
 	masking_train_rvq_levels: list = field(default_factory=lambda: [0,0]) # determines which levels to do mask training on
 
-	masking_ratio_fixed: bool = True # this sets the masking ratio to a fixed 80%
+	masking_ratio: str | float = 0.0 # sets a masking ratio, "random" will randomly pick
 	ignore_inputs_for_loss: bool = True # only calculate the loss on the outputs since thats what matters, as the inputs that do have loss calculated upon affects the loss for the entire sequence
 
 	# classifier-free guidance shit
@@ -981,6 +981,9 @@ class Config(BaseConfig):
 
 			if "p_len_train" in model["experimental"]:
 				del model["experimental"]["p_len_train"]
+
+			if "masking_ratio_fixed" in model["experimental"]:
+				del model["experimental"]["masking_ratio_fixed"]
 
 		self.models = [ Model(**model) if isinstance(model, dict) else model for model in self.models ]
 		self.loras = [ LoRA(**lora) if isinstance(lora, dict) else lora for lora in self.loras ]
diff --git a/vall_e/demo.py b/vall_e/demo.py
index d1d9f59..61e2da7 100644
--- a/vall_e/demo.py
+++ b/vall_e/demo.py
@@ -55,16 +55,10 @@ def main():
 	parser.add_argument("--preamble", type=str, default=None)
 	parser.add_argument("--output-filename", type=str, default="index.html")
 
-	parser.add_argument("--language", type=str, default="en")
-
 	parser.add_argument("--language", type=str, default="en")
 	parser.add_argument("--task", type=str, default="tts")
 	parser.add_argument("--out-path", type=Path, default=None)
 
-	parser.add_argument("--yaml", type=Path, default=None)
-	parser.add_argument("--model", type=Path, default=None)
-	parser.add_argument("--lora", type=Path, default=None)
-
 	parser.add_argument("--max-duration", type=int, default=12 * cfg.dataset.frames_per_second)
 	parser.add_argument("--max-steps", type=int, default=25)
 	parser.add_argument("--max-levels", type=int, default=7)
@@ -362,7 +356,6 @@ def main():
 					text=text,
 					references=[prompt],
 					language=language,
-					input_prompt_length=args.input_prompt_length,
 					seed=seed,
 					tqdm=False,
 					**sampling_kwargs,
diff --git a/vall_e/models/ar_nar.py b/vall_e/models/ar_nar.py
index a59da2b..a5eafd9 100644
--- a/vall_e/models/ar_nar.py
+++ b/vall_e/models/ar_nar.py
@@ -70,6 +70,7 @@ class AR_NAR(Base):
 		cfg_prom_dropout_p = self.config.experimental.cfg_prom_dropout_p if self.config is not None else 0.0
 		# rate to train RVQ level AR-ly or NAR-ly
 		masking_train_p = self.config.experimental.masking_train_p if self.config is not None else 0.5
+		masking_ratio = self.config.experimental.masking_ratio if self.config is not None else "random"
 		# force set mask training
 		if "len" not in self.capabilities:
 			masking_train_p = 0.0
@@ -108,6 +109,10 @@ class AR_NAR(Base):
 				#p = math.acos(r) / (math.pi * 0.5)
 				#timesteps[i] = 1.0 - clamp(p, 0.0, 1.0)
 				timesteps[i] = random.random()
+
+				# instead make it between [0.2, 0.8]
+				if masking_ratio == "rand":
+					timesteps[i] = (timesteps[i] * 0.6) + 0.2
 
 			# trim resps to only contain all levels below the target level
 			resps_list = [r if t in text_task else r[..., :l+1] for r, l, t in zip(resps_list, quant_levels, task_list)]
diff --git a/vall_e/models/base.py b/vall_e/models/base.py
index 3d534fc..90df472 100755
--- a/vall_e/models/base.py
+++ b/vall_e/models/base.py
@@ -436,7 +436,7 @@ class Base(nn.Module):
 		unified_position_ids = self.config.experimental.unified_position_ids if self.config is not None else True
 		interleave = self.config.experimental.interleave if self.config is not None else False
 
-		masking_ratio_fixed = self.config.experimental.masking_ratio_fixed if self.config is not None else False
+		masking_ratio = self.config.experimental.masking_ratio if self.config is not None else False
 		ignore_inputs_for_loss = self.config.experimental.ignore_inputs_for_loss if self.config is not None else False
 
 		layerskip = self.config.experimental.layerskip if self.config is not None else False
@@ -481,7 +481,7 @@ class Base(nn.Module):
 		self.interleave = interleave
 		self.layerskip = layerskip
 		self.inject_timestep_embedding = False # results in bad output
-		self.masking_ratio_fixed = masking_ratio_fixed
+		self.masking_ratio = masking_ratio
 		self.ignore_inputs_for_loss = ignore_inputs_for_loss
 
 		self.text_emb = Embedding(n_text_tokens, d_model)
@@ -537,7 +537,7 @@ class Base(nn.Module):
 
 			# experimental NAR-only mode
 			self.len_emb = Embedding(11, d_model)
-			self.time_emb = TimeEmbedding(d_model) # if not masking_ratio_fixed else None
+			self.time_emb = TimeEmbedding(d_model) # if not masking_ratio else None
 
 		if attention_backend == "auto":
 			attention_backend = "sdpa"
@@ -840,7 +840,6 @@ class Base(nn.Module):
 		state = None,
 		layer_skip_lambda = None,
 
-		timesteps = None,
 
 		output_attentions = False,
 		output_hidden_states = False,
@@ -871,9 +870,6 @@ class Base(nn.Module):
 		if self.layerskip and layer_skip_lambda is not None:
 			kwargs["layer_skip_lambda"] = layer_skip_lambda
 
-		if "len" in self.capabilities and timesteps is not None:
-			kwargs["timesteps"] = timesteps
-
 		output = self.model(**kwargs)
 
 		x = output["last_hidden_state"]
@@ -1012,13 +1008,18 @@ class Base(nn.Module):
 			if timestep is not None:
 				# force set to use this classifier level
 				classifier_level = "NAR:0:0"
-				# a paper said to use a fixed masking ratio for training
-				p = 0.8
 				# store timestep information
-				if not self.masking_ratio_fixed:
+				if self.masking_ratio in ["random", "rand"]:
 					# cosine scheduled timestep => masking ratio
 					p = math.cos(timestep * math.pi * 0.5)
-				inputs[i].append( ("timestep", torch.tensor([timestep], device=device, dtype=self.time_emb.mlp[0].weight.dtype) ) )
+					# I don't think this is necessary as the timestep is encoded in the sequence by the number of masked tokens, probably.
+ if self.inject_timestep_embedding: + inputs[i].append( ("timestep", torch.tensor([timestep], device=device, dtype=self.time_emb.mlp[0].weight.dtype) ) ) + else: + # a paper said to use a fixed masking ratio of 0.8 for training + # ...but I want to make it user adjustable + p = self.masking_ratio + # store dropout mask (if training, as this gets used later to mask the input embeddings if provided) if self.training: dropout_mask = _dropout_mask( resps_list[i], p ) @@ -1597,12 +1598,6 @@ class Base(nn.Module): position_ids = self.inputs_to_position_ids( inputs, mask=mask ) if not self.unified_position_ids else None classifier_levels = self.get_input( inputs, name="classifier_level" ) - if self.inject_timestep_embedding: - timesteps = [ self.get_input(inputs, "timestep", at=i) for i in range( batch_size ) ] - timesteps = [ self.time_emb(timestep) if timestep is not None else None for i, timestep in enumerate(timesteps) ] - else: - timesteps = [] - output = self._forward( inputs=x, mask=mask, @@ -1611,7 +1606,6 @@ class Base(nn.Module): output_attentions = output_attentions, output_hidden_states = output_hidden_states, layer_skip_lambda = layer_skip_lambda if self.layerskip and layer_skip_variables else None, - timesteps=timesteps, ) logits = output.logits
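
For reference, a minimal standalone sketch of the masking-ratio schedule this patch wires in; the helper name sample_masking_ratio is illustrative and not part of the codebase. "random"/"rand" map a freshly sampled timestep through the cosine schedule (with "rand" additionally squeezing the timestep into [0.2, 0.8], per the ar_nar.py change), while a float is used directly as a fixed ratio, so 0.8 reproduces the old masking_ratio_fixed=True behavior.

import math
import random

def sample_masking_ratio(masking_ratio):
	# "random"/"rand": draw a timestep and map it through the cosine schedule
	if masking_ratio in ("random", "rand"):
		timestep = random.random()
		if masking_ratio == "rand":
			# keep the timestep away from the extremes, as in the ar_nar.py hunk
			timestep = (timestep * 0.6) + 0.2
		# cosine scheduled timestep => masking ratio
		return math.cos(timestep * math.pi * 0.5)
	# otherwise treat the configured value as a fixed masking ratio
	return float(masking_ratio)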
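
A hedged example of selecting the new option through the experimental model settings; this constructs the dataclass directly, whereas in practice these fields come from the model's "experimental" section of the config, and any fields not listed keep their defaults.

from vall_e.config import ModelExperimentalSettings

settings = ModelExperimentalSettings(
	masking_train_p=0.5,      # odds of training a sample with masking
	masking_ratio="random",   # or a fixed float such as 0.8 for the old fixed-ratio behavior
)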