From 2542ed067dd07cdc5a39d787969040d7cc740870 Mon Sep 17 00:00:00 2001 From: mrq Date: Sat, 21 Dec 2024 19:59:56 -0600 Subject: [PATCH] ugh --- vall_e.cpp/README.md | 10 ++++++---- vall_e.cpp/vall_e.cpp | 29 +++++++++++++++++++++++------ 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/vall_e.cpp/README.md b/vall_e.cpp/README.md index a5e63d4..30a3cf9 100644 --- a/vall_e.cpp/README.md +++ b/vall_e.cpp/README.md @@ -17,20 +17,22 @@ Run `make`. ## To-Do * [x] converted model to GGUF - * [ ] convert it without modifying any of the existing code + * [ ] convert it without modifying any of the existing code, as the tokenizer requires some care * [x] basic framework * [x] load the quantized model * [x] orchestrate the required embeddings * [x] juggle the output head / classifier properly * [ ] phonemize text + * with the help of espeak-ng * [ ] tokenize phonemes + * the tokenizer is being a huge thorn on actual sequences * [x] load audio from disk * [x] encode audio * [x] sum embeddings for the `prom` and prior `resp`s * [x] `AR` sampling * [ ] `NAR-len` demasking sampling -* [ ] `NAR` sampling -* [ ] decode audio to disk +* [x] `NAR` sampling +* [x] decode audio to disk * [ ] a functional CLI * [ ] actually make it work - * it seems naively stitching the model together isn't good enough since the output is wrong \ No newline at end of file + * it seems naively stitching the model together isn't good enough since the output is wrong, it most likely needs training with a glued together classifier \ No newline at end of file diff --git a/vall_e.cpp/vall_e.cpp b/vall_e.cpp/vall_e.cpp index 90ee430..9edd8c0 100644 --- a/vall_e.cpp/vall_e.cpp +++ b/vall_e.cpp/vall_e.cpp @@ -19,7 +19,8 @@ struct input_t { std::string task = "tts"; - std::vector phonemes = {}; + std::string phonemes = ""; + std::vector phn = {}; llama_token lang = 0; llama_token rvq_l = 0; std::vector> prom = {}; @@ -297,7 +298,7 @@ void fill_batch( llama_batch& batch, input_t& input, 
embeddings_t& embeddings_ma auto n_embd = embeddings_map.n_embd; // insert text tokens - for ( auto& id : input.phonemes ) batch_add( batch, id, n_embd, embeddings_map.text_embds, pos++, false ); + for ( auto& id : input.phn ) batch_add( batch, id, n_embd, embeddings_map.text_embds, pos++, false ); batch_add( batch, 0, n_embd, embeddings_map.sep_embd, pos++, false ); pos = 0; // insert lang token @@ -350,7 +351,7 @@ std::vector generate( llama_context* ctx, llama_model* model, llama if ( batch.logits[i] ) ++n_logits; } - if ( verbose ) printf("Prompt size: %i | Logits: %i\n", batch.n_tokens, n_logits); + if ( verbose ) printf("Prompt size: %i | Outputs: %i\n", batch.n_tokens, n_logits); // NAR mode, cap at one step if ( n_logits > 1 ) { @@ -379,8 +380,8 @@ std::vector generate( llama_context* ctx, llama_model* model, llama stop_token = embeddings_map.resp_embd_start[2] - 1; // <|NAR|0:STOP|> } else if ( mode == INFERENCE_MODE_NAR ) { - logit_range[0] = embeddings_map.resp_embd_start[2+rvq_l]; - logit_range[1] = embeddings_map.resp_embd_start[3+rvq_l]; + logit_range[0] = embeddings_map.resp_embd_start[2+rvq_l-1]; + logit_range[1] = embeddings_map.resp_embd_start[3+rvq_l-1]; embds = embeddings_map.resps_embds[2]; } else if ( mode == INFERENCE_MODE_LEN ) { @@ -460,7 +461,8 @@ int main(int argc, char ** argv) { input_t input{}; embeddings_t embeddings_map{}; - input.phonemes = {1,85,4,128,26,4,186,4,89,33,25,4,48,4,134,25,52,86,4,34,97,27,11,2}; // hˈɛloː ʋˈɔrlt + // input.phonemes = "hˈɛloː ʋˈɔrlt"; + input.phn = {1,85,4,128,26,4,186,4,89,33,25,4,48,4,134,25,52,86,4,34,97,27,11,2}; // hˈɛloː ʋˈɔrlt std::string vall_e_model_path = "./data/vall_e-F16.gguf"; std::string encodec_model_path = "./data/encodec.bin"; @@ -535,6 +537,21 @@ int main(int argc, char ** argv) { // update mapping embeddings_map.init( n_embd, n_vocab, embds.data() ); + // tokenize phonemes + // to-do: make this work, the vocab does not work + if ( input.phonemes != "" ) { + const int n_prompt = 
-llama_tokenize(model, input.phonemes.c_str(), input.phonemes.size(), NULL, 0, true, true); + // allocate space for the tokens and tokenize the input.phonemes + input.phn.resize(n_prompt); + if (llama_tokenize(model, input.phonemes.c_str(), input.phonemes.size(), input.phn.data(), input.phn.size(), true, true) < 0) { + fprintf(stderr, "%s: error: failed to tokenize: %s\n", __func__, input.phonemes.c_str()); + return 1; + } + + for ( auto& token : input.phn ) printf("%i ", token ); + printf("\n"); + } + + // inference + std::vector<llama_token> output_tokens; // NAR-len demasking