From 84a05acb6dd044b1fa0fcac02e6224b79efe447c Mon Sep 17 00:00:00 2001
From: mrq
Date: Mon, 2 Dec 2024 19:10:42 -0600
Subject: [PATCH] touch ups in docs

---
 docs/README.md              | 25 ++++++++++++++++++++++---
 vall_e/demo.py              |  9 +++++++++
 vall_e/models/arch/mamba.py |  5 +++++
 vall_e/models/base.py       |  4 +---
 4 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/docs/README.md b/docs/README.md
index f1e6afb..713e11c 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -42,10 +42,12 @@ However, at this point and time, the implementation is rather divorced from VALL
 * [ ] audio streaming
   - this *technically* can work without any additional architecture changes, just clever tricks with sampling-then-decoding-to-audio.
   - something similar to HiFiGAN (or the one for TorToiSe) trained on the last hidden states of the AR *might* also enable an alternate way for streaming.
-* [ ] speed up inferencing
+* [ ] speed up inferencing for the AR
   - KV caching both yields broken output and quadratically slow output, unless I'm doing something grossly wrong.
-  - A pure HF model is the only way to fix this, but converting the model to one is a bit of a chore.
 * [x] provide a pure NAR model that foregoes most of the inferencing slowdowns a regular AR+NAR model will provide.
+* [ ] HF-ify the model
+  - this might easily be possible by subjugating the tokenizer to handle all the embeddings / classifiers
+  - this will pave the way to use the model under an easy marriage of `llama.cpp` and `encodec.cpp`
 * [ ] replace the phonemizer with something that doesn't depend on espeak
 * [ ] train the model to handle text => phoneme (without a hit to the rest of the model)
 * [ ] ...and phonemes => text
@@ -62,9 +64,26 @@
 * mixing multiple speakers through summing input prompt embeddings
   * I do not expect this to work, but you never know...
 
+## "Postmortem"
+
+For the most part, the model is complete. With the `NAR-len` being crammed on, I'm satisfied with the performance-to-quality tradeoff.
+
+However, while this solution boasts being lightweight, there are some caveats given its size:
+* it's at capacity on what it *can* do without additional tasks to augment it further
+  * post-fixing it with additional layers glued on doesn't seem to help very much (12 => 16 layers)
+* wrangling it is a bit of a chore, as some voices work fine under the `AR` but not the `NAR-len`, and vice versa
+  * some voices outright refuse to work without LoRA training
+  * some sampler settings work on some voices, but others need some tweaking
+* for short durations it excels, but despite training on longer durations, stability is less guaranteed
+* subjugating an existing LLM architecture is a bit of a pain, as I would *love* to make full use of LLaMA niceties
+  * `hf`-ifying it is possible, but it'd be a chore to set up the tokenizer properly
+* it still seems like the phase of the moon matters with how it wants to cooperate
+  * in some eval tests it seems fine; other times, issues like word errors crop up
+
+
 ## Notices and Citations
 
-Unless otherwise credited/noted in this repo or within the designated Python file, this repository is [licensed](LICENSE) under AGPLv3.
+Unless otherwise credited/noted in this repo or within the designated Python file, this repository is [licensed](/LICENSE) under AGPLv3.
 
 - [EnCodec](https://github.com/facebookresearch/encodec) is licensed under CC-BY-NC 4.0. If you use the code to generate audio quantization or perform decoding, it is important to adhere to the terms of their license.
diff --git a/vall_e/demo.py b/vall_e/demo.py
index 9c6e9f9..603a61a 100644
--- a/vall_e/demo.py
+++ b/vall_e/demo.py
@@ -216,6 +216,15 @@ def main():
 		comparison_kwargs["disabled"]["amp"] = current_amp
 		comparison_kwargs["enabled"]["amp"] = other_amp
+	elif args.comparison == "modality":
+		comparison_kwargs["suffix"] = "modality"
+		comparison_kwargs["titles"] = [f"AR+NAR", f"NAR-len"]
+
+		comparison_kwargs["disabled"]["modality"] = "ar+nar"
+		comparison_kwargs["disabled"]["cfg_strength"] = 0.0
+
+		comparison_kwargs["enabled"]["modality"] = "nar-len"
+		comparison_kwargs["enabled"]["cfg_strength"] = 3.0
 	elif args.comparison == "cfg-strength":
 		current_cfg_strength = 3.0
 		other_cfg_strength = 0.0
diff --git a/vall_e/models/arch/mamba.py b/vall_e/models/arch/mamba.py
index 15cd8c9..5e9b956 100644
--- a/vall_e/models/arch/mamba.py
+++ b/vall_e/models/arch/mamba.py
@@ -7,8 +7,13 @@
 from transformers.models.mamba2.modeling_mamba2 import Mamba2Model
 from transformers.models.mamba2.configuration_mamba2 import Mamba2Config
 """
+"""
 from mamba2_torch.modeling.configuration_mamba2 import Mamba2Config
 from mamba2_torch.modeling.modeling_mamba2 import Mamba2Model
+"""
+
+from fla.models.mamba2.configuration_mamba2 import Mamba2Config
+from fla.models.mamba2.modeling_mamba2 import Mamba2Model
 """
 
 # https://github.com/state-spaces/mamba
diff --git a/vall_e/models/base.py b/vall_e/models/base.py
index 5ece310..b08e25c 100755
--- a/vall_e/models/base.py
+++ b/vall_e/models/base.py
@@ -851,8 +851,8 @@ class Base(nn.Module):
 			aux_loss = torch.sum(torch.stack([ t for t in _["l_aux"] if t is not None])) * 0.001
 		elif self.arch_type in ["mamba","mamba2"]:
 			kwargs = dict(
-				#attention_mask=m,
 				inputs_embeds=x,
+				attention_mask=m,
 				#cache_params=state,
 				use_cache=False, # not self.training,
 				#position_ids=position_ids,
@@ -864,8 +864,6 @@
 			output = self.model(**kwargs)
 			x = output["last_hidden_state"]
 
-			# to-do: figure out why KV caching doesn't work
-			#if not self.training:
 			if state is not None:
 				state = output["cache_params"]
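For context on the `base.py` hunk above, below is a minimal, self-contained sketch of the call pattern it switches to: passing the attention mask alongside the input embeddings into a Mamba2 backbone. It is written against the Hugging Face `transformers` Mamba2 classes with toy sizes; the patch itself imports an equivalent interface from `fla.models.mamba2`, so the class choice and config values here are assumptions rather than the repository's actual configuration.

```python
# Minimal sketch (not from the patch): feeding `inputs_embeds` plus
# `attention_mask` into a Mamba2 backbone, mirroring the kwargs base.py builds.
# Toy sizes and the transformers classes are assumptions for illustration only.
import torch
from transformers.models.mamba2.configuration_mamba2 import Mamba2Config
from transformers.models.mamba2.modeling_mamba2 import Mamba2Model

# toy config; head_dim * num_heads must equal expand * hidden_size
config = Mamba2Config(hidden_size=256, num_hidden_layers=2, expand=2, head_dim=64, num_heads=8)
model = Mamba2Model(config).eval()

x = torch.randn(1, 8, config.hidden_size)  # stand-in for the summed input embeddings
m = torch.ones(1, 8, dtype=torch.long)     # stand-in for the attention mask `m`

with torch.no_grad():
    output = model(inputs_embeds=x, attention_mask=m, use_cache=False)

last_hidden_state = output["last_hidden_state"]  # what base.py reads back out
print(last_hidden_state.shape)  # torch.Size([1, 8, 256])
```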