From 3d63349d96a9633515b83060ae37fdb2bf144e22 Mon Sep 17 00:00:00 2001
From: mrq
Date: Mon, 23 Dec 2024 21:56:19 -0600
Subject: [PATCH] more fixes...

---
 vall_e.cpp/README.md  |   4 +-
 vall_e.cpp/vall_e.cpp | 169 ++++++++++++++++++++----------------------
 vall_e.cpp/vall_e.h   |  14 ++--
 3 files changed, 91 insertions(+), 96 deletions(-)

diff --git a/vall_e.cpp/README.md b/vall_e.cpp/README.md
index c8158d3..914783a 100644
--- a/vall_e.cpp/README.md
+++ b/vall_e.cpp/README.md
@@ -14,9 +14,9 @@ Run `make`.
 
 ### Required Modifications
 
-[`encodec.cpp`](https://github.com/e-c-k-e-r/encodec.cpp) requires updating its GGML copy to the latest version, which requires a few lines to get the CPU backend working.
+[`encodec.cpp`](https://github.com/PABannier/encodec.cpp) requires its GGML copy to be updated to the latest version, which takes a few extra lines to get the CPU backend working (per my [fork](https://github.com/e-c-k-e-r/encodec.cpp)).
 
-[`llama.cpp`](https://github.com/e-c-k-e-r/llama.cpp) only possible modification needs to ensure that a non-causal attention mask is used; everything necessary can be hacked together with clever tricks.
+[`llama.cpp`](https://github.com/ggerganov/llama.cpp) needs at most one modification: ensuring that a non-causal attention mask is used; everything else can be hacked together with clever tricks.
 
 ## To-Do
 
diff --git a/vall_e.cpp/vall_e.cpp b/vall_e.cpp/vall_e.cpp
index 33bea36..9108219 100644
--- a/vall_e.cpp/vall_e.cpp
+++ b/vall_e.cpp/vall_e.cpp
@@ -87,17 +87,13 @@ ggml_tensor* VALL_E_API view_2d_tensor( struct ggml_context* ctx, struct ggml_te
 	return res;
 }
-
-struct ggml_tensor * VALL_E_API vall_e_get_prom_embds( llama_vall_e_userdata& userdata, int32_t idx ) {
-	return userdata.prom_embds[idx];
+void VALL_E_API print_tokens( const std::vector<llama_token>& tokens ) {
+	printf("[");
+	for ( auto i = 0; i < tokens.size(); ++i ) {
+		printf("%i%s", tokens[i], i + 1 < tokens.size() ? ", " : "");
+	}
+	printf("]\n");
 }
-struct ggml_tensor * VALL_E_API vall_e_get_resp_embds( llama_vall_e_userdata& userdata, int32_t idx ) {
-	return userdata.resp_embds[idx];
-}
-struct ggml_tensor * VALL_E_API vall_e_get_aux_embds( llama_vall_e_userdata& userdata, int32_t idx ) {
-	return userdata.aux_embds[idx];
-}
-
 
 const io_t& VALL_E_API vall_e_inputs_map_get( io_map_t& io_map, const std::string& name ) {
 	return io_map.io[name];
@@ -139,32 +135,32 @@ void VALL_E_API vall_e_inputs_map_init( io_map_t& io_map, llama_model* model ) {
 		io_map.io[entry.name].head = entry.head_idx < 0 ? NULL : userdata.heads[entry.head_idx];
 	}
 
-	io_map.io["text"].embds = read_2d_tensor(vall_e_get_aux_embds(userdata, 0));
-	io_map.io["rvq_l"].embds = read_2d_tensor(vall_e_get_aux_embds(userdata, 1));
-	io_map.io["lang"].embds = read_2d_tensor(vall_e_get_aux_embds(userdata, 2));
-	io_map.io["task"].embds = read_2d_tensor(vall_e_get_aux_embds(userdata, 3));
-	io_map.io["len"].embds = read_2d_tensor(vall_e_get_aux_embds(userdata, 4));
-	io_map.io["tone"].embds = read_2d_tensor(vall_e_get_aux_embds(userdata, 5));
-	io_map.io["sep"].embds = read_2d_tensor(vall_e_get_aux_embds(userdata, 6));
+	io_map.io["text"].embds = read_2d_tensor(userdata.aux_embds[0]);
+	io_map.io["rvq_l"].embds = read_2d_tensor(userdata.aux_embds[1]);
+	io_map.io["lang"].embds = read_2d_tensor(userdata.aux_embds[2]);
+	io_map.io["task"].embds = read_2d_tensor(userdata.aux_embds[3]);
+	io_map.io["len"].embds = read_2d_tensor(userdata.aux_embds[4]);
+	io_map.io["tone"].embds = read_2d_tensor(userdata.aux_embds[5]);
+	io_map.io["sep"].embds = read_2d_tensor(userdata.aux_embds[6]);
 
-	io_map.io["prom|0"].embds = read_2d_tensor(vall_e_get_prom_embds(userdata, 0));
-	io_map.io["prom|1"].embds = read_2d_tensor(vall_e_get_prom_embds(userdata, 1));
-	io_map.io["prom|2"].embds = read_2d_tensor(vall_e_get_prom_embds(userdata, 2));
-	io_map.io["prom|3"].embds = read_2d_tensor(vall_e_get_prom_embds(userdata, 3));
-	io_map.io["prom|4"].embds = read_2d_tensor(vall_e_get_prom_embds(userdata, 4));
-	io_map.io["prom|5"].embds = read_2d_tensor(vall_e_get_prom_embds(userdata, 5));
-	io_map.io["prom|6"].embds = read_2d_tensor(vall_e_get_prom_embds(userdata, 6));
-	io_map.io["prom|7"].embds = read_2d_tensor(vall_e_get_prom_embds(userdata, 7));
+	io_map.io["prom|0"].embds = read_2d_tensor(userdata.prom_embds[0]);
+	io_map.io["prom|1"].embds = read_2d_tensor(userdata.prom_embds[1]);
+	io_map.io["prom|2"].embds = read_2d_tensor(userdata.prom_embds[2]);
+	io_map.io["prom|3"].embds = read_2d_tensor(userdata.prom_embds[3]);
+	io_map.io["prom|4"].embds = read_2d_tensor(userdata.prom_embds[4]);
+	io_map.io["prom|5"].embds = read_2d_tensor(userdata.prom_embds[5]);
+	io_map.io["prom|6"].embds = read_2d_tensor(userdata.prom_embds[6]);
+	io_map.io["prom|7"].embds = read_2d_tensor(userdata.prom_embds[7]);
 
-	io_map.io["resps|AR:0:0"].embds = read_2d_tensor(vall_e_get_resp_embds(userdata, 0));
-	io_map.io["resps|NAR:0:1"].embds = read_2d_tensor(vall_e_get_resp_embds(userdata, 1));
-	io_map.io["resps|NAR:1:2"].embds = read_2d_tensor(vall_e_get_resp_embds(userdata, 2));
-	io_map.io["resps|NAR:2:3"].embds = read_2d_tensor(vall_e_get_resp_embds(userdata, 3));
-	io_map.io["resps|NAR:3:4"].embds = read_2d_tensor(vall_e_get_resp_embds(userdata, 4));
-	io_map.io["resps|NAR:4:5"].embds = read_2d_tensor(vall_e_get_resp_embds(userdata, 5));
-	io_map.io["resps|NAR:5:6"].embds = read_2d_tensor(vall_e_get_resp_embds(userdata, 6));
-	io_map.io["resps|NAR:6:7"].embds = read_2d_tensor(vall_e_get_resp_embds(userdata, 7));
-	io_map.io["resps|NAR:0:0"].embds = read_2d_tensor(vall_e_get_resp_embds(userdata, 8));
+	io_map.io["resps|AR:0:0"].embds = read_2d_tensor(userdata.resp_embds[0]);
+	io_map.io["resps|NAR:0:1"].embds = read_2d_tensor(userdata.resp_embds[1]);
+	io_map.io["resps|NAR:1:2"].embds = read_2d_tensor(userdata.resp_embds[2]);
+	io_map.io["resps|NAR:2:3"].embds = read_2d_tensor(userdata.resp_embds[3]);
+	io_map.io["resps|NAR:3:4"].embds = read_2d_tensor(userdata.resp_embds[4]);
+	io_map.io["resps|NAR:4:5"].embds = read_2d_tensor(userdata.resp_embds[5]);
+	io_map.io["resps|NAR:5:6"].embds = read_2d_tensor(userdata.resp_embds[6]);
+	io_map.io["resps|NAR:6:7"].embds = read_2d_tensor(userdata.resp_embds[7]);
+	io_map.io["resps|NAR:0:0"].embds = read_2d_tensor(userdata.resp_embds[8]);
 
 	/*
@@ -505,16 +501,12 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
 	// if INFERENCE_MODE_AR || INFERENCE_MODE_LEN
 	if ( causal ) {
 		output_tokens.reserve(max_tokens);
-		if ( verbose ) {
-			printf("[");
-			fflush(stdout);
-		}
 		while ( output_tokens.size() < max_tokens ) {
 			if ( llama_decode(ctx, batch) ) {
 				fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
 				return output_tokens;
 			}
-			llama_kv_cache_clear(ctx);
+			llama_kv_cache_clear(ctx); // necessary for many reasons
 
 			// sample token
 			auto t = llama_sampler_sample(smpl, ctx, -1);
@@ -528,21 +520,15 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
 			output_tokens.emplace_back(t);
 			// update batch with token
 			batch_add( batch, t, io_map.n_embd, embds, output_tokens.size(), true );
-			if ( verbose ) {
-				printf("%i, ", t);
-				fflush(stdout);
-			}
-		}
-		if ( verbose ) {
-			printf("]\n");
-			fflush(stdout);
+
+			if ( verbose ) print_tokens( output_tokens );
 		}
 	} else if ( mode == INFERENCE_MODE_NAR_DEMASK ) {
 		// to-do: assert n_outputs == input.resp[rvq_l-1].size()
 		const llama_token MASK_TOKEN = 1024; // token value for masking
 		const float PI = 3.141592653589793f;
 		// to-do: derive from sampling arguments
-		int32_t steps = 30; // number of demasking steps
+		int32_t steps = 10; // number of demasking steps
 		int32_t seq_len = n_outputs;
 		float temperature = 1.5f;
 		float cfg_strength = 2.5f;
@@ -563,25 +549,37 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
 
 		// do one step on many tokens
 		for ( auto step = 0; step < steps; ++step ) {
-			if ( verbose ) {
-				printf("[%i/%i] [", step, steps);
-				fflush(stdout);
-			}
-
-			float timestep = (step+1) / steps; // to-do: align with torch.linspace
+			float timestep = ((float)step) / steps; // to-do: align with torch.linspace
+			float annealing = 1.0f - timestep;
 
-			float noise_p = cos( timestep * PI * 0.5f );
-			float remask_p = 0.5f / steps;
-			int32_t n_masked_tokens = std::min(int(noise_p * seq_len), 1);
+			float sampling_temperature = temperature * annealing;
 			float sampling_cfg_strength = timestep * cfg_strength;
 
-			std::vector<bool> is_masked(n_outputs, false);
-			std::vector<int32_t> masked_indices;
-			masked_indices.reserve(n_masked_tokens);
-			std::vector<float> sorted = scores;
+			float noise_p = cos( timestep * PI * 0.5f );
+			float remask_p = 0.0f; // 0.5f / steps;
+
+			int32_t n_masked_tokens = (noise_p + remask_p) * seq_len;
+			if ( n_masked_tokens < 1 ) {
+				n_masked_tokens = 1;
+			}
+			if ( n_masked_tokens > n_outputs ) {
+				n_masked_tokens = n_outputs;
+			}
+
+			// sort previous scores
+			std::vector<score_t> sorted( n_outputs );
+			for ( auto i = 0; i < n_outputs; ++i ) {
+				sorted[i] = { i, scores[i] };
+			}
 			std::sort(sorted.begin(), sorted.end());
-			masked_indices.insert( masked_indices.end(), sorted.begin(), sorted.begin() + n_masked_tokens );
+			// and top-k pick the worst scores
+			std::vector<int32_t> masked_indices( n_masked_tokens );
+			for ( auto i = 0; i < n_masked_tokens; ++i ) {
+				masked_indices[i] = sorted[i].idx;
+			}
+
+			std::vector<bool> is_masked(n_outputs, false);
 
 			// mask off tokens
 			for ( auto& idx : masked_indices ) {
@@ -592,6 +590,8 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
 				is_masked[i] = output_tokens[i] == MASK_TOKEN;
 			}
 
+			if ( verbose ) print_tokens( output_tokens );
+
 			// update batch
 			// to-do: only update the embeddings instead
 			batch.n_tokens = 0;
@@ -602,14 +602,12 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
 			null_batch.n_tokens = 0;
 			fill_batch( null_batch, input, io_map, mode );
 
-			// to-do: update sampling temperature
-
 			// cfg decode
 			if ( llama_decode(ctx, null_batch) ) {
 				fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
 				return output_tokens;
 			}
-			llama_kv_cache_clear(ctx);
+			llama_kv_cache_clear(ctx); // necessary for many reasons
 
 			// copy null probabilities
 			std::vector<float> null_logits(n_outputs * n_vocab, -INFINITY); // to-do: copy once
@@ -622,7 +620,17 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
 				fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
 				return output_tokens;
 			}
-			llama_kv_cache_clear(ctx);
+			llama_kv_cache_clear(ctx); // necessary for many reasons
+
+			auto sparams = llama_sampler_chain_default_params();
+			sparams.no_perf = false;
+			llama_sampler * smpl = llama_sampler_chain_init(sparams);
+
+			llama_sampler_chain_add(smpl, llama_sampler_init_top_k(0));
+			llama_sampler_chain_add(smpl, llama_sampler_init_top_p(1.0, 1));
+			llama_sampler_chain_add(smpl, llama_sampler_init_temp (sampling_temperature));
+			llama_sampler_chain_add(smpl, llama_sampler_init_dist (1130));
+
 			// to-do: figure out why all logits are the same for each token......
 			// "reverse" iterate from backwards indexing
 			for ( auto idx = 0; idx < n_outputs; ++idx ) {
@@ -645,18 +653,12 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
 				// store token if it was masked
 				output_tokens[idx] = t;
 				// update score if it was masked
-				scores[idx] = 1.0f - softmaxed[t]; // invert so we pick the worst tokens later
-				if ( verbose ) {
-					printf("%i, ", t);
-					fflush(stdout);
-				}
-			}
-			if ( verbose ) {
-				printf("\n");
-				fflush(stdout);
+				scores[idx] = softmaxed[t]; // raw confidence; the lowest-scoring tokens get remasked next step
 			}
+			llama_sampler_free(smpl);
+
+			if ( verbose ) print_tokens( output_tokens );
 		}
 	} else if ( mode == INFERENCE_MODE_NAR ) {
 		// to-do: assert n_outputs == input.resp[rvq_l-1].size()
@@ -666,28 +668,17 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
 			fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
 			return output_tokens;
 		}
-		llama_kv_cache_clear(ctx);
+		llama_kv_cache_clear(ctx); // necessary for many reasons
 
 		// to-do: figure out why all logits are the same for each token......
// "reverse" iterate from backwards indexing - if ( verbose ) { - printf("["); - fflush(stdout); - } for ( auto idx = 0; idx < n_outputs; ++idx ) { // sample ith token auto t = llama_sampler_sample(smpl, ctx, batch.n_tokens - n_outputs + idx); // store token output_tokens.emplace_back(t); - if ( verbose ) { - printf("%i, ", t); - fflush(stdout); - } - } - if ( verbose ) { - printf("]\n"); - fflush(stdout); } + if ( verbose ) print_tokens( output_tokens ); } const auto t_main_end = ggml_time_us(); diff --git a/vall_e.cpp/vall_e.h b/vall_e.cpp/vall_e.h index e31949d..fcc4f6b 100644 --- a/vall_e.cpp/vall_e.h +++ b/vall_e.cpp/vall_e.h @@ -101,10 +101,18 @@ struct io_map_t { ggml_context* ctx = NULL; }; +struct score_t { + int32_t idx; + float value; + + bool operator<( const score_t& that ) const { return this->value < that.value; } +}; + // helper tensor functions std::vector VALL_E_API read_2d_tensor( struct ggml_tensor* tensor ); ggml_tensor* VALL_E_API view_2d_tensor( ggml_tensor* tensor, int32_t start, int32_t end, int32_t dim = 0 ); // cringe method to keep in my pocket ggml_tensor* VALL_E_API view_2d_tensor( ggml_context* ctx, ggml_tensor* tensor, int32_t start, int32_t end, int32_t dim = 0 ); +void VALL_E_API print_tokens( const std::vector& tokens ); std::vector> VALL_E_API map_embeddings( const std::vector& tokens, int n_embd, const float* embds ); std::vector> VALL_E_API sum_embeddings( const std::vector>& input, int n_embd, int rvq_l, const float** embds, int mode = EMBEDDING_MODE_PROM ); @@ -125,8 +133,4 @@ std::vector VALL_E_API decode_audio( struct encodec_context* ectx, const const io_t& VALL_E_API vall_e_inputs_map_get_embeddings( io_map_t& inputs_map, const std::string& name ); const float* VALL_E_API vall_e_inputs_map_get_embeddings_p( io_map_t& inputs_map, const std::string& name ); int32_t VALL_E_API vall_e_inputs_map_get_classifier_idx( io_map_t& inputs_map, const std::string& name ); -void VALL_E_API vall_e_inputs_map_init( io_map_t&, llama_model* model ); - -struct ggml_tensor * VALL_E_API vall_e_get_prom_embds( llama_vall_e_userdata& userdata, int32_t idx ); -struct ggml_tensor * VALL_E_API vall_e_get_resp_embds( llama_vall_e_userdata& userdata, int32_t idx ); -struct ggml_tensor * VALL_E_API vall_e_get_aux_embds( llama_vall_e_userdata& userdata, int32_t idx ); \ No newline at end of file +void VALL_E_API vall_e_inputs_map_init( io_map_t&, llama_model* model ); \ No newline at end of file