diff --git a/vall_e.cpp/README.md b/vall_e.cpp/README.md index a490951..673d5ea 100644 --- a/vall_e.cpp/README.md +++ b/vall_e.cpp/README.md @@ -15,17 +15,8 @@ Run `make`. ### Required Modifications [`encodec.cpp`](https://github.com/e-c-k-e-r/encodec.cpp) requires updating its GGML copy to the latest version, which requires a few lines to get the CPU backend working. -[`llama.cpp`](https://github.com/e-c-k-e-r/llama.cpp) *might* not require any modifications, but: -* `llm.build_vall_e` can mostly copy `llm.build_llama`, but with: - * `KQ_mask = build_inp_KQ_mask( lctx.cparams.causal_attn )` - * a unified output head (pain) - * OR adjusting the `model.output` to the correct classifier head (better option) - * OR slicing that tensor with the right range (`ggml_view_2d` confuses me) - * both require also require `*const_cast(&ctx->model.hparams.n_vocab) = output->ne[1];` because the logits are tied to `n_vocab` -* commenting out `GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());` because grabbing embeddings/classifiers require using `bid` to trick it thinking it's part of a layer -* some helper functions to retrieve the embeddings tensor from the model -* some helper functions to set the target classifier head -* some fix for `GGML_ASSERT(mask->ne[0] == a->ne[0])` when using a non-causal attention mask (or I can test on the model that had a causal NAR......) + +[`llama.cpp`](https://github.com/e-c-k-e-r/llama.cpp) *might* not require any modifications, but implementing `LLM_ARCH_VALL_E` requires some surgery. ## To-Do @@ -46,11 +37,11 @@ Run `make`. * [x] `AR` sampling * currently need a model that didn't regress with the `AR:0:0` output * [ ] working `NAR-len` output - * [ ] `NAR-len` sampling - * currently cannot inference with non-causal_attn + * [x] `NAR-len` sampling + * need to assert that a non-causal mask is used * [ ] working `NAR` output * [x] `NAR` sampling - * currently cannot inference with non-causal_attn + * need to assert that a non-causal mask is used * [x] decode audio to disk * [ ] a functional CLI * [ ] actually make it work diff --git a/vall_e.cpp/vall_e.cpp b/vall_e.cpp/vall_e.cpp index 356e06f..6d752c1 100644 --- a/vall_e.cpp/vall_e.cpp +++ b/vall_e.cpp/vall_e.cpp @@ -1,14 +1,11 @@ #define DR_WAV_IMPLEMENTATION #include "vall_e.h" - - -#define LLAMA_CPP_EXTENDED 1 // whether the underlying llama.cpp has some extra functions -#define LLAMA_CPP_USE_VALL_E_ARCH 1 // whether the underlying llama.cpp is to use the VALL_E arch (or using LLAMA arch) - -#if !LLAMA_CPP_EXTENDED - #include "_llama.h" // cringe hotfix but I have to do this until llama.cpp's API exposes the tok_embd -#endif +#include +#include +#include +#include +#include ranges_t io_ranges[] = { { "text", 0, 256, 9, }, @@ -39,7 +36,7 @@ ranges_t io_ranges[] = { { "resps|NAR:0: 16677, 17702, 8,0", }, }; -std::vector read_2d_tensor( struct ggml_tensor* tensor ) { +std::vector VALL_E_API read_2d_tensor( struct ggml_tensor* tensor ) { size_t size = tensor->ne[0] * tensor->ne[1]; std::vector res( size ); @@ -55,29 +52,29 @@ std::vector read_2d_tensor( struct ggml_tensor* tensor ) { } -struct ggml_tensor * vall_e_get_prom_embds( llama_vall_e_userdata& userdata, int32_t idx ) { +struct ggml_tensor * VALL_E_API vall_e_get_prom_embds( llama_vall_e_userdata& userdata, int32_t idx ) { return userdata.prom_embds[idx]; } -struct ggml_tensor * vall_e_get_resp_embds( llama_vall_e_userdata& userdata, int32_t idx ) { +struct ggml_tensor * VALL_E_API vall_e_get_resp_embds( 
llama_vall_e_userdata& userdata, int32_t idx ) { return userdata.resp_embds[idx]; } -struct ggml_tensor * vall_e_get_aux_embds( llama_vall_e_userdata& userdata, int32_t idx ) { +struct ggml_tensor * VALL_E_API vall_e_get_aux_embds( llama_vall_e_userdata& userdata, int32_t idx ) { return userdata.aux_embds[idx]; } -const embeddings_t& vall_e_inputs_map_get_embeddings( inputs_map_t& inputs_map, const std::string& name ) { +const embeddings_t& VALL_E_API vall_e_inputs_map_get_embeddings( inputs_map_t& inputs_map, const std::string& name ) { return inputs_map.embds[name]; } -const float* vall_e_inputs_map_get_embeddings_p( inputs_map_t& inputs_map, const std::string& name ) { +const float* VALL_E_API vall_e_inputs_map_get_embeddings_p( inputs_map_t& inputs_map, const std::string& name ) { return inputs_map.embds[name].embds.data(); } -int32_t vall_e_inputs_map_get_classifier_idx( inputs_map_t& inputs_map, const std::string& name ) { +int32_t VALL_E_API vall_e_inputs_map_get_classifier_idx( inputs_map_t& inputs_map, const std::string& name ) { return inputs_map.embds[name].range.classifier_idx; } -void vall_e_inputs_map_init( inputs_map_t& inputs_map, llama_model* model ) { +void VALL_E_API vall_e_inputs_map_init( inputs_map_t& inputs_map, llama_model* model ) { auto n_embd = llama_n_embd( model ); auto n_vocab = llama_n_vocab( model ); @@ -146,7 +143,7 @@ void vall_e_inputs_map_init( inputs_map_t& inputs_map, llama_model* model ) { } // maps embeddings easily -std::vector> map_embeddings( const std::vector& tokens, int n_embd, const float* embds ) { +std::vector> VALL_E_API map_embeddings( const std::vector& tokens, int n_embd, const float* embds ) { std::vector> embedded( tokens.size() ); for ( auto i = 0; i < tokens.size(); ++i ) { embedded[i].insert( embedded[i].end(), embds + (tokens[i] * n_embd), embds + ((tokens[i]+1) * n_embd) ); @@ -156,7 +153,7 @@ std::vector> map_embeddings( const std::vector& // handles adding either a token OR the embedding of that token into the batch // this really, really helps avoid needing to abuse the tokenizer -void batch_add( llama_batch& batch, llama_token id, int n_embd, const float* embds, llama_pos pos, bool output, const std::vector & seq_ids ) { +void VALL_E_API batch_add( llama_batch& batch, llama_token id, int n_embd, const float* embds, llama_pos pos, bool output, const std::vector & seq_ids ) { GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded"); // insert raw embedding instead @@ -181,7 +178,7 @@ void batch_add( llama_batch& batch, llama_token id, int n_embd, const float* emb batch.n_tokens++; } // reads a waveform from disk -bool read_wav_from_disk(std::string in_path, std::vector & audio_arr) { +bool VALL_E_API read_wav_from_disk(std::string in_path, std::vector & audio_arr) { uint32_t channels; uint32_t sample_rate; drwav_uint64 total_frame_count; @@ -209,7 +206,7 @@ bool read_wav_from_disk(std::string in_path, std::vector & audio_arr) { return true; } // writes a waveform to disk -void write_wav_on_disk(std::vector & audio_arr, std::string dest_path) { +void VALL_E_API write_wav_on_disk(std::vector & audio_arr, std::string dest_path) { drwav_data_format format; format.bitsPerSample = 32; format.sampleRate = 24000; @@ -225,7 +222,7 @@ void write_wav_on_disk(std::vector & audio_arr, std::string dest_path) { fprintf(stderr, "%s: Number of frames written = %lld.\n", __func__, frames); } // reads a waveform from disk then encodes it -std::vector> encode_audio_from_disk( struct encodec_context* ectx, const std::string& path ) { 
+std::vector> VALL_E_API encode_audio_from_disk( struct encodec_context* ectx, const std::string& path ) { // read audio from disk std::vector wavform; @@ -258,7 +255,7 @@ std::vector> encode_audio_from_disk( struct encodec_context return codes_2ds; } // decodes a 2D codebook into a waveform -std::vector decode_audio( struct encodec_context* ectx, const std::vector>& codes_2d ) { +std::vector VALL_E_API decode_audio( struct encodec_context* ectx, const std::vector>& codes_2d ) { int n_codebooks = codes_2d.size(); int n_frames = codes_2d[0].size(); @@ -283,7 +280,7 @@ std::vector decode_audio( struct encodec_context* ectx, const std::vector } // sums embeddings over a 2D "tensor" -std::vector> sum_embeddings( const std::vector>& input, int n_embd, int rvq_l, const float** embds, int mode ) { +std::vector> VALL_E_API sum_embeddings( const std::vector>& input, int n_embd, int rvq_l, const float** embds, int mode ) { std::vector> res( input.size() ); res.resize( input[0].size() ); for ( auto& e : res ) e.resize( n_embd ); @@ -311,7 +308,22 @@ std::vector> sum_embeddings( const std::vector VALL_E_API soft_max( int n_logits, const float* logits ) { + std::vector res( n_logits, 0.0f ); + float denom = 0.0f; + + for ( auto i = 0; i < n_logits; ++i ) { + denom += exp( logits[i] ); + } + // to-do: assert denom != 0.0f + for ( auto i = 0; i < n_logits; ++i ) { + res[i] = logits[i] / denom; + } + + return res; +} + +void VALL_E_API fill_batch( llama_batch& batch, input_t& input, inputs_map_t& inputs_map, int mode ) { // keeps track of the position for each sequence size_t pos = 0; auto n_embd = inputs_map.n_embd; @@ -382,48 +394,42 @@ void fill_batch( llama_batch& batch, input_t& input, inputs_map_t& inputs_map, i } // generation code, should handle all modalities easily -std::vector generate( llama_context* ctx, llama_model* model, llama_sampler* smpl, input_t& input, inputs_map_t& inputs_map, int max_tokens, int mode, bool verbose ) { - llama_batch batch = llama_batch_init( 22500, inputs_map.n_embd, 22500 ); - - // Decoding loop - const auto t_main_start = ggml_time_us(); - int n_decode = 0; +std::vector VALL_E_API generate( llama_context* ctx, llama_model* model, llama_sampler* smpl, input_t& input, inputs_map_t& inputs_map, int max_tokens, int mode, bool verbose ) { int rvq_l = input.rvq_l; llama_token stop_token = -1; - + int n_decode = 0; // number of tokens decoded + int n_outputs = 0; // number of output tokens to expect + int n_vocab = 0; + int n_embd = 0; + bool causal = true; // sample autoregressively or not + const float* embds = NULL; // embeddings to map output tokens through + ranges_t range; // I/O range + + // create batch (targetting embeddings instead of tokens) + llama_batch batch = llama_batch_init( CTX_SIZE, inputs_map.n_embd, CTX_SIZE ); fill_batch( batch, input, inputs_map, mode ); - // determine how many logits we need - int n_logits = 0; + // determine how many outputs we need for ( auto i = 0; i < batch.n_tokens; ++i ) { - if ( batch.logits[i] ) ++n_logits; + if ( batch.logits[i] ) ++n_outputs; } - - if ( verbose ) printf("Prompt size: %i | Outputs: %i\n", batch.n_tokens, n_logits); + if ( verbose ) printf("Prompt size: %i | Outputs: %i\n", batch.n_tokens, n_outputs); - // NAR mode, cap at one step - if ( n_logits > 1 ) { - max_tokens = n_logits; - } - - if ( n_logits == 0 ) { + // bail out + if ( n_outputs == 0 ) { fprintf(stderr, "%s : no tokens to decode\n", __func__); return {}; } + causal = n_outputs == 1; - const float* embds = NULL; - ranges_t range; - + // AR mode + 
std::string embd_name = ""; if ( mode == INFERENCE_MODE_AR ) { - auto& embeddings = vall_e_inputs_map_get_embeddings(inputs_map, "resps|AR:0:0"); - range = embeddings.range; - embds = embeddings.embds.data(); - stop_token = range.end - range.start - 1; - - printf("Generating in %s (%i) mode (%i:%i) (%i)\n", "AR", range.classifier_idx, range.start, range.end, stop_token); + embd_name = "resps|AR:0:0"; + // NAR mode } else if ( mode == INFERENCE_MODE_NAR ) { std::string k_embds[] = { - "resps|NAR:0:0", // invalid + "resps|NAR:0:0", // invalid, should never be picked "resps|NAR:0:1", "resps|NAR:1:2", "resps|NAR:2:3", @@ -432,88 +438,237 @@ std::vector generate( llama_context* ctx, llama_model* model, llama "resps|NAR:5:6", "resps|NAR:6:7", }; - auto& embeddings = vall_e_inputs_map_get_embeddings(inputs_map, k_embds[rvq_l]); - range = embeddings.range; - embds = embeddings.embds.data(); - - printf("Generating in %s (%i) mode (%i:%i)\n", "NAR", range.classifier_idx, range.start, range.end); + embd_name = k_embds[rvq_l]; + // duration inferencing mode } else if ( mode == INFERENCE_MODE_LEN ) { - auto& embeddings = vall_e_inputs_map_get_embeddings(inputs_map, "len"); - range = embeddings.range; - embds = embeddings.embds.data(); - stop_token = range.end - range.start - 1; - - printf("Generating in %s (%i) mode (%i:%i) (%i)\n", "len", range.classifier_idx, range.start, range.end, stop_token); + embd_name = "len"; + // NAR-len (demasking) inferencing mode } else if ( mode == INFERENCE_MODE_NAR_DEMASK ) { - auto& embeddings = vall_e_inputs_map_get_embeddings(inputs_map, "resps|NAR:0:0"); - range = embeddings.range; - embds = embeddings.embds.data(); - - printf("Generating in %s (%i) mode (%i:%i)\n", "NAR-len", range.classifier_idx, range.start, range.end); + embd_name = "resps|NAR:0:0"; } + auto& embeddings = vall_e_inputs_map_get_embeddings(inputs_map, embd_name); + range = embeddings.range; + embds = embeddings.embds.data(); + n_embd = embeddings.n_embd; + n_vocab = embeddings.n_vocab; + stop_token = range.end - range.start - 1; + + printf("Generating in %s (%i) mode (%i:%i) (%i)\n", embd_name.c_str(), range.classifier_idx, range.start, range.end, stop_token); + + // update model's output heads / causal mode #if LLAMA_CPP_USE_VALL_E_ARCH auto& userdata = *llama_get_vall_e_userdata( model ); llama_set_output_head( model, userdata.heads[range.classifier_idx] ); #endif - llama_set_causal_attn( ctx, n_logits == 1 ); + llama_set_causal_attn( ctx, causal ); // to-do: fix GGML_ASSERT(mask->ne[0] == a->ne[0]) std::vector output_tokens; - while ( output_tokens.size() < max_tokens ) { - if (llama_decode(ctx, batch)) { + const auto t_main_start = ggml_time_us(); + + // if INFERENCE_MODE_AR || INFERENCE_MODE_LEN + if ( causal ) { + output_tokens.reserve(max_tokens); + if ( verbose ) { + printf("["); + fflush(stdout); + } + while ( output_tokens.size() < max_tokens ) { + if ( llama_decode(ctx, batch) ) { + fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); + return output_tokens; + } + + // ensures only tokens within our designated range are used + #if !LLAMA_CPP_USE_VALL_E_ARCH + auto* logits = llama_get_logits_ith( ctx, -1 ); + for ( auto i = 0; i < inputs_map.n_vocab; ++i ) { + if ( i < range.start || i >= range.end ) logits[i] = -INFINITY; + } + #endif + // sample token + auto t = llama_sampler_sample(smpl, ctx, -1); + + // is stop token + if ( t == stop_token ) { + break; + } + + // store token + output_tokens.emplace_back(t); + // update batch with token + batch_add( batch, t, 
inputs_map.n_embd, embds, output_tokens.size(), true ); + if ( verbose ) { + printf("%i, ", t); + fflush(stdout); + } + } + if ( verbose ) { + printf("]\n"); + fflush(stdout); + } + } else if ( mode == INFERENCE_MODE_NAR_DEMASK ) { + // to-do: assert n_outputs == input.resp[rvq_l-1].size() + const llama_token MASK_TOKEN = 1024; // token value for masking + const float PI = 3.141592653589793f; + // to-do: derive from sampling arguments + int32_t steps = 30; // number of demasking steps + int32_t seq_len = n_outputs; + float temperature = 1.5f; + float cfg_strength = 2.5f; + + // fill with masked tokens + output_tokens.clear(); + output_tokens.resize(n_outputs, MASK_TOKEN); + + // for CFG + input_t null_input{}; + null_input.phn = {1, 2}; // + null_input.resp.resize(1); + + llama_batch null_batch = llama_batch_init( CTX_SIZE, inputs_map.n_embd, CTX_SIZE ); + + // token scores to reference for masking + std::vector scores(n_outputs, 1.0); + + // do one step on many tokens + for ( auto step = 0; step < steps; ++step ) { + if ( verbose ) { + printf("[%i/%i] [", step, steps); + fflush(stdout); + } + + float timestep = (step+1) / steps; // to-do: align with torch.linspace + float annealing = 1.0f - timestep; + float noise_p = cos( timestep * PI * 0.5f ); + float remask_p = 0.5f / steps; + int32_t n_masked_tokens = std::min(int(noise_p * seq_len), 1); + float sampling_temperature = temperature * annealing; + float sampling_cfg_strength = timestep * cfg_strength; + + std::vector is_masked(n_outputs, false); + std::vector masked_indices; + masked_indices.reserve(n_masked_tokens); + std::vector sorted = scores; + std::sort(sorted.begin(), sorted.end()); + masked_indices.insert( masked_indices.end(), sorted.begin(), sorted.begin() + n_masked_tokens ); + + // mask off tokens + for ( auto& idx : masked_indices ) { + output_tokens[idx] = MASK_TOKEN; + } + // update token mask + for ( auto i = 0; i < n_outputs; ++i ) { + is_masked[i] = output_tokens[i] == MASK_TOKEN; + } + + // update batch + // to-do: only update the embeddings instead + batch.n_tokens = 0; + input.resp[0] = output_tokens; + fill_batch( batch, input, inputs_map, mode ); + // update null batch + null_input.resp[0] = output_tokens; + null_batch.n_tokens = 0; + fill_batch( null_batch, input, inputs_map, mode ); + + // to-do: update sampling temperature + + // cfg decode + if ( llama_decode(ctx, null_batch) ) { + fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); + return output_tokens; + } + // copy null probabilities + std::vector null_logits(n_outputs * n_vocab, -INFINITY); + // to-do: copy once + for ( auto idx = 0; idx < n_outputs; ++idx ) { + memcpy( &null_logits[idx * n_vocab], llama_get_logits_ith( ctx, null_batch.n_tokens - n_outputs + idx ), sizeof(float) * n_vocab ); + } + + // decode + if ( llama_decode(ctx, batch) ) { + fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); + return output_tokens; + } + // to-do: figure out why all logits are the same for each token...... 
+ // "reverse" iterate from backwards indexing + for ( auto idx = 0; idx < n_outputs; ++idx ) { + // skip if not masked + if ( !is_masked[idx] ) + continue; + // ensures only tokens within our designated range are used + auto* logits = llama_get_logits_ith( ctx, batch.n_tokens - n_outputs + idx ); + auto* null_logit = &null_logits[idx]; + + #if !LLAMA_CPP_USE_VALL_E_ARCH + for ( auto i = 0; i < inputs_map.n_vocab; ++i ) { + if ( i < range.start || i >= range.end ) logits[i] = -INFINITY; + } + #endif + // perform softmax before modifying logits + std::vector softmaxed = soft_max( n_vocab, logits ); + + // perform CFG sampling + for ( auto i = 0; i < n_vocab; ++i ) { + logits[i] = null_logit[i] + (logits[i] - null_logit[i]) * cfg_strength; + } + // sample ith token + auto t = llama_sampler_sample(smpl, ctx, batch.n_tokens - n_outputs + idx ); + // store token if it was masked + output_tokens[idx] = t; + // update score if it was masked + scores[idx] = 1.0f - softmaxed[t]; // invert so we pick the worst tokens later + if ( verbose ) { + printf("%i, ", t); + fflush(stdout); + } + } + if ( verbose ) { + printf("\n"); + fflush(stdout); + } + } + } else if ( mode == INFERENCE_MODE_NAR ) { + // to-do: assert n_outputs == input.resp[rvq_l-1].size() + output_tokens.reserve(n_outputs); + // do one step on many tokens + if ( llama_decode(ctx, batch) ) { fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); return output_tokens; } - std::vector current_tokens; - // backwards iterate to start from beginning of sequence - for ( auto i = n_logits; i > 0; --i ) { - // filter logits - auto* logits = llama_get_logits_ith( ctx, -i ); - - // ensures only tokens within our designated range are used + // to-do: figure out why all logits are the same for each token...... 
+ // "reverse" iterate from backwards indexing + if ( verbose ) { + printf("["); + fflush(stdout); + } + for ( auto idx = 0; idx < n_outputs; ++idx ) { + // ensures only tokens within our designated range are used #if !LLAMA_CPP_USE_VALL_E_ARCH + auto* logits = llama_get_logits_ith( ctx, batch.n_tokens - n_outputs + idx ); for ( auto i = 0; i < inputs_map.n_vocab; ++i ) { if ( i < range.start || i >= range.end ) logits[i] = -INFINITY; } #endif - // sample the next token - printf("%i: %p\n [", -i, logits ); - for ( auto i = 0; i < 1025; ++i ) { - printf("%f, ", logits[i]); - } - printf("]\n"); - auto t = llama_sampler_sample(smpl, ctx, -i); - //printf("%i: [%i]: %f | %p\n", -i, t, logits[t], logits ); - - // offset back into range - #if !LLAMA_CPP_USE_VALL_E_ARCH - t -= range.start; - #endif - - n_decode += 1; - - // is stop token - if ( t == stop_token ) { - printf("STOPPED\n"); - max_tokens = 0; - break; - } + // sample ith token + auto t = llama_sampler_sample(smpl, ctx, batch.n_tokens - n_outputs + idx); // store token - current_tokens.emplace_back(t); - // update batch with token - batch_add( batch, t, inputs_map.n_embd, embds, output_tokens.size(), true ); + output_tokens.emplace_back(t); + if ( verbose ) { + printf("%i, ", t); + fflush(stdout); + } } - printf("%s: Tokens: [", __func__); - for ( auto& token : current_tokens ) { - printf("%i, ", token); + if ( verbose ) { + printf("]\n"); + fflush(stdout); } - printf("]\n"); - - output_tokens.insert(output_tokens.end(), current_tokens.begin(), current_tokens.end()); } + const auto t_main_end = ggml_time_us(); if ( verbose ) { @@ -535,7 +690,7 @@ std::vector generate( llama_context* ctx, llama_model* model, llama int main( int argc, char** argv ) { // to-do: replace all of this with proper loading code int32_t ngl = 0; - int modality = MODALITY_AR_NAR; + int modality = MODALITY_NAR_LEN; input_t input{}; inputs_map_t inputs_map{}; @@ -632,7 +787,7 @@ int main( int argc, char** argv ) { // NAR-len demasking if ( modality == MODALITY_NAR_LEN ) { // inference len - int len = 0; + int len = 290; if ( !len ) { input.task = "len"; output_tokens = generate( ctx, model, smpl_nar, input, inputs_map, 5, INFERENCE_MODE_LEN ); diff --git a/vall_e.cpp/vall_e.h b/vall_e.cpp/vall_e.h index f376596..a592413 100644 --- a/vall_e.cpp/vall_e.h +++ b/vall_e.cpp/vall_e.h @@ -1,23 +1,24 @@ #pragma once -#include "llama-vocab.h" #include "llama.h" #include "encodec.h" #include "dr_wav.h" -#include -#include -#include #include #include -#include #include -#include // to-do: copy over import/export stuff from engine project (because I don't remember how I set it up in ) #define VALL_E_API +#define LLAMA_CPP_EXTENDED 1 // whether the underlying llama.cpp has some extra functions +#define LLAMA_CPP_USE_VALL_E_ARCH 1 // whether the underlying llama.cpp is to use the VALL_E arch (or using LLAMA arch) + +#if !LLAMA_CPP_EXTENDED + #include "_llama.h" // cringe hotfix but I have to do this until llama.cpp's API exposes the tok_embd +#endif + // to-do: clean up spaghetti enums const int EMBEDDING_MODE_PROM = 0; const int EMBEDDING_MODE_RESP_AR_NAR = 1; @@ -106,6 +107,7 @@ struct inputs_map_t { std::vector VALL_E_API read_2d_tensor( struct ggml_tensor* tensor ); std::vector> VALL_E_API map_embeddings( const std::vector& tokens, int n_embd, const float* embds ); std::vector> VALL_E_API sum_embeddings( const std::vector>& input, int n_embd, int rvq_l, const float** embds, int mode = EMBEDDING_MODE_PROM ); +std::vector VALL_E_API soft_max( int n_logits, const float* logits 
);

// batch and inferencing
void VALL_E_API batch_add( llama_batch& batch, llama_token id, int n_embd, const float* embds, llama_pos pos, bool output, const std::vector & seq_ids = {0} );
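A side note on the new `soft_max` helper declared above and defined in `vall_e.cpp`: it accumulates `exp( logits[i] )` into the denominator but then divides the *raw* logit by that sum, and the zero-denominator guard is still a to-do. Below is a minimal sketch (not the patch's code) of the conventional numerically stable form it appears to be aiming for; `soft_max_stable` is a hypothetical name used only for illustration.

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Numerically stable softmax sketch: exponentiate max-shifted logits, then normalize.
// Mirrors the (int n_logits, const float* logits) signature used by the patch's helper.
std::vector<float> soft_max_stable( int n_logits, const float* logits ) {
	if ( n_logits <= 0 || !logits ) return {};
	std::vector<float> res( n_logits, 0.0f );

	// shift by the max logit so exp() cannot overflow
	float max_logit = *std::max_element( logits, logits + n_logits );

	float denom = 0.0f;
	for ( int i = 0; i < n_logits; ++i ) {
		res[i] = expf( logits[i] - max_logit );
		denom += res[i];
	}
	// denom >= 1 because the max element contributes exp(0), so no zero check is needed
	for ( int i = 0; i < n_logits; ++i ) {
		res[i] /= denom;
	}
	return res;
}
```

In the demasking path these values only feed the per-position confidence scores (`scores[idx] = 1.0f - softmaxed[t]`), so the missing `exp` mainly skews which positions get re-masked rather than which tokens are sampled.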
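Similarly, in the `INFERENCE_MODE_NAR_DEMASK` loop the patch sorts a copy of `scores` and inserts the sorted *values* into `masked_indices`, while the surrounding comments ("invert so we pick the worst tokens later") read as if the *positions* with the worst (highest, since they are inverted) scores should be re-masked each step. Here is a sketch of that selection under that assumption; `pick_masked_indices` is a hypothetical helper, and `scores` / `n_masked_tokens` correspond to the variables of the same name in the loop.

```cpp
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// Return the indices of the n_masked_tokens lowest-confidence positions.
// scores[i] = 1.0f - softmaxed[t], so a larger score means lower confidence.
std::vector<int32_t> pick_masked_indices( const std::vector<float>& scores, int32_t n_masked_tokens ) {
	n_masked_tokens = std::min<int32_t>( n_masked_tokens, (int32_t) scores.size() );
	if ( n_masked_tokens <= 0 ) return {};

	std::vector<int32_t> indices( scores.size() );
	std::iota( indices.begin(), indices.end(), 0 );

	// partial sort is enough: bring the worst (largest) scores to the front
	std::partial_sort( indices.begin(), indices.begin() + n_masked_tokens, indices.end(),
		[&]( int32_t a, int32_t b ) { return scores[a] > scores[b]; } );

	indices.resize( n_masked_tokens );
	return indices;
}
```

Two smaller observations on the same loop: `float timestep = (step+1) / steps;` is integer division (it stays 0 until the final step) unless one operand is cast to `float`, which the `// to-do: align with torch.linspace` comment already hints at; and `std::min(int(noise_p * seq_len), 1)` caps the number of re-masked tokens at one per step, where a floor of one (`std::max`) seems closer to the cosine schedule being set up.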