more work on vall_e.cpp (some more cleanup, NAR-len demasking, but still need to iron out some kinks)

This commit is contained in:
mrq 2024-12-23 17:20:04 -06:00
parent a6945f981d
commit 6ecdb715b6
3 changed files with 279 additions and 131 deletions

View File

@@ -15,17 +15,8 @@ Run `make`.
### Required Modifications
[`encodec.cpp`](https://github.com/e-c-k-e-r/encodec.cpp) requires updating its GGML copy to the latest version, which takes a few extra line changes to get the CPU backend working.
[`llama.cpp`](https://github.com/e-c-k-e-r/llama.cpp) *might* not require any modifications, but:
* `llm.build_vall_e` can mostly copy `llm.build_llama`, but with:
* `KQ_mask = build_inp_KQ_mask( lctx.cparams.causal_attn )`
* a unified output head (pain)
* OR adjusting the `model.output` to the correct classifier head (better option)
* OR slicing that tensor with the right range (`ggml_view_2d` confuses me)
* both also require `*const_cast<uint32_t*>(&ctx->model.hparams.n_vocab) = output->ne[1];` because the logits are tied to `n_vocab`
* commenting out `GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());` because grabbing embeddings/classifiers requires using `bid` to trick it into thinking it's part of a layer
* some helper functions to retrieve the embeddings tensor from the model
* some helper functions to set the target classifier head
* some fix for `GGML_ASSERT(mask->ne[0] == a->ne[0])` when using a non-causal attention mask (or I can test on the model that had a causal NAR......)
[`llama.cpp`](https://github.com/e-c-k-e-r/llama.cpp) *might* not require any modifications, but implementing `LLM_ARCH_VALL_E` requires some surgery.
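For reference, a rough sketch of what that surgery boils down to is below; the `KQ_mask` line and the `n_vocab` patch are taken from the notes above, while the body of `llama_set_output_head` and the member accesses are only illustrative of the fork's changes, not a drop-in patch:

```cpp
// in llm.build_vall_e (mostly a copy of llm.build_llama): build the attention mask
// from the runtime flag so the NAR passes can run with a non-causal mask
struct ggml_tensor * KQ_mask = build_inp_KQ_mask( lctx.cparams.causal_attn );

// helper to retarget the output head to one of the stored classifier heads;
// the logits buffer is sized from n_vocab, so it has to follow the selected head
void llama_set_output_head( struct llama_model * model, struct ggml_tensor * head ) {
	model->output = head; // illustrative: assumes direct access to model internals
	*const_cast<uint32_t*>(&model->hparams.n_vocab) = head->ne[1];
}
```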
## To-Do
@@ -46,11 +37,11 @@ Run `make`.
* [x] `AR` sampling
* currently need a model that didn't regress with the `AR:0:0` output
* [ ] working `NAR-len` output
* [ ] `NAR-len` sampling
currently cannot run inference with `causal_attn` disabled
* [x] `NAR-len` sampling
* need to assert that a non-causal mask is used
* [ ] working `NAR` output
* [x] `NAR` sampling
currently cannot run inference with `causal_attn` disabled
* need to assert that a non-causal mask is used
* [x] decode audio to disk
* [ ] a functional CLI
* [ ] actually make it work

View File

@@ -1,14 +1,11 @@
#define DR_WAV_IMPLEMENTATION
#include "vall_e.h"
#define LLAMA_CPP_EXTENDED 1 // whether the underlying llama.cpp has some extra functions
#define LLAMA_CPP_USE_VALL_E_ARCH 1 // whether the underlying llama.cpp is to use the VALL_E arch (or using LLAMA arch)
#if !LLAMA_CPP_EXTENDED
#include "_llama.h" // cringe hotfix but I have to do this until llama.cpp's API exposes the tok_embd
#endif
#include <cmath>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <algorithm>
ranges_t io_ranges[] = {
{ "text", 0, 256, 9, },
@@ -39,7 +36,7 @@ ranges_t io_ranges[] = {
{ "resps|NAR:0:0", 16677, 17702, 8, },
};
std::vector<float> read_2d_tensor( struct ggml_tensor* tensor ) {
std::vector<float> VALL_E_API read_2d_tensor( struct ggml_tensor* tensor ) {
size_t size = tensor->ne[0] * tensor->ne[1];
std::vector<float> res( size );
@@ -55,29 +52,29 @@ std::vector<float> read_2d_tensor( struct ggml_tensor* tensor ) {
}
struct ggml_tensor * vall_e_get_prom_embds( llama_vall_e_userdata& userdata, int32_t idx ) {
struct ggml_tensor * VALL_E_API vall_e_get_prom_embds( llama_vall_e_userdata& userdata, int32_t idx ) {
return userdata.prom_embds[idx];
}
struct ggml_tensor * vall_e_get_resp_embds( llama_vall_e_userdata& userdata, int32_t idx ) {
struct ggml_tensor * VALL_E_API vall_e_get_resp_embds( llama_vall_e_userdata& userdata, int32_t idx ) {
return userdata.resp_embds[idx];
}
struct ggml_tensor * vall_e_get_aux_embds( llama_vall_e_userdata& userdata, int32_t idx ) {
struct ggml_tensor * VALL_E_API vall_e_get_aux_embds( llama_vall_e_userdata& userdata, int32_t idx ) {
return userdata.aux_embds[idx];
}
const embeddings_t& vall_e_inputs_map_get_embeddings( inputs_map_t& inputs_map, const std::string& name ) {
const embeddings_t& VALL_E_API vall_e_inputs_map_get_embeddings( inputs_map_t& inputs_map, const std::string& name ) {
return inputs_map.embds[name];
}
const float* vall_e_inputs_map_get_embeddings_p( inputs_map_t& inputs_map, const std::string& name ) {
const float* VALL_E_API vall_e_inputs_map_get_embeddings_p( inputs_map_t& inputs_map, const std::string& name ) {
return inputs_map.embds[name].embds.data();
}
int32_t vall_e_inputs_map_get_classifier_idx( inputs_map_t& inputs_map, const std::string& name ) {
int32_t VALL_E_API vall_e_inputs_map_get_classifier_idx( inputs_map_t& inputs_map, const std::string& name ) {
return inputs_map.embds[name].range.classifier_idx;
}
void vall_e_inputs_map_init( inputs_map_t& inputs_map, llama_model* model ) {
void VALL_E_API vall_e_inputs_map_init( inputs_map_t& inputs_map, llama_model* model ) {
auto n_embd = llama_n_embd( model );
auto n_vocab = llama_n_vocab( model );
@@ -146,7 +143,7 @@ void vall_e_inputs_map_init( inputs_map_t& inputs_map, llama_model* model ) {
}
// maps embeddings easily
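// (token id -> its row of the embedding matrix, returned as raw floats that can be fed into the batch directly)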
std::vector<std::vector<float>> map_embeddings( const std::vector<llama_token>& tokens, int n_embd, const float* embds ) {
std::vector<std::vector<float>> VALL_E_API map_embeddings( const std::vector<llama_token>& tokens, int n_embd, const float* embds ) {
std::vector<std::vector<float>> embedded( tokens.size() );
for ( auto i = 0; i < tokens.size(); ++i ) {
embedded[i].insert( embedded[i].end(), embds + (tokens[i] * n_embd), embds + ((tokens[i]+1) * n_embd) );
@@ -156,7 +153,7 @@ std::vector<std::vector<float>> map_embeddings( const std::vector<llama_token>&
// handles adding either a token OR the embedding of that token into the batch
// this really, really helps avoid needing to abuse the tokenizer
void batch_add( llama_batch& batch, llama_token id, int n_embd, const float* embds, llama_pos pos, bool output, const std::vector<llama_seq_id> & seq_ids ) {
void VALL_E_API batch_add( llama_batch& batch, llama_token id, int n_embd, const float* embds, llama_pos pos, bool output, const std::vector<llama_seq_id> & seq_ids ) {
GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");
// insert raw embedding instead
@@ -181,7 +178,7 @@ void batch_add( llama_batch& batch, llama_token id, int n_embd, const float* emb
batch.n_tokens++;
}
// reads a waveform from disk
bool read_wav_from_disk(std::string in_path, std::vector<float> & audio_arr) {
bool VALL_E_API read_wav_from_disk(std::string in_path, std::vector<float> & audio_arr) {
uint32_t channels;
uint32_t sample_rate;
drwav_uint64 total_frame_count;
@@ -209,7 +206,7 @@ bool read_wav_from_disk(std::string in_path, std::vector<float> & audio_arr) {
return true;
}
// writes a waveform to disk
void write_wav_on_disk(std::vector<float> & audio_arr, std::string dest_path) {
void VALL_E_API write_wav_on_disk(std::vector<float> & audio_arr, std::string dest_path) {
drwav_data_format format;
format.bitsPerSample = 32;
format.sampleRate = 24000;
@@ -225,7 +222,7 @@ void write_wav_on_disk(std::vector<float> & audio_arr, std::string dest_path) {
fprintf(stderr, "%s: Number of frames written = %lld.\n", __func__, frames);
}
// reads a waveform from disk then encodes it
std::vector<std::vector<int32_t>> encode_audio_from_disk( struct encodec_context* ectx, const std::string& path ) {
std::vector<std::vector<int32_t>> VALL_E_API encode_audio_from_disk( struct encodec_context* ectx, const std::string& path ) {
// read audio from disk
std::vector<float> wavform;
@@ -258,7 +255,7 @@ std::vector<std::vector<int32_t>> encode_audio_from_disk( struct encodec_context
return codes_2ds;
}
// decodes a 2D codebook into a waveform
std::vector<float> decode_audio( struct encodec_context* ectx, const std::vector<std::vector<int32_t>>& codes_2d ) {
std::vector<float> VALL_E_API decode_audio( struct encodec_context* ectx, const std::vector<std::vector<int32_t>>& codes_2d ) {
int n_codebooks = codes_2d.size();
int n_frames = codes_2d[0].size();
@@ -283,7 +280,7 @@ std::vector<float> decode_audio( struct encodec_context* ectx, const std::vector
}
// sums embeddings over a 2D "tensor"
std::vector<std::vector<float>> sum_embeddings( const std::vector<std::vector<llama_token>>& input, int n_embd, int rvq_l, const float** embds, int mode ) {
std::vector<std::vector<float>> VALL_E_API sum_embeddings( const std::vector<std::vector<llama_token>>& input, int n_embd, int rvq_l, const float** embds, int mode ) {
std::vector<std::vector<float>> res( input.size() );
res.resize( input[0].size() );
for ( auto& e : res ) e.resize( n_embd );
@@ -311,7 +308,22 @@ std::vector<std::vector<float>> sum_embeddings( const std::vector<std::vector<ll
return res;
}
void fill_batch( llama_batch& batch, input_t& input, inputs_map_t& inputs_map, int mode ) {
// naive softmax over the raw logits (shifted by the max for numerical stability)
std::vector<float> VALL_E_API soft_max( int n_logits, const float* logits ) {
std::vector<float> res( n_logits, 0.0f );
float max_logit = *std::max_element( logits, logits + n_logits );
float denom = 0.0f;
for ( auto i = 0; i < n_logits; ++i ) {
res[i] = exp( logits[i] - max_logit );
denom += res[i];
}
// to-do: assert denom != 0.0f
for ( auto i = 0; i < n_logits; ++i ) {
res[i] /= denom;
}
return res;
}
void VALL_E_API fill_batch( llama_batch& batch, input_t& input, inputs_map_t& inputs_map, int mode ) {
// keeps track of the position for each sequence
size_t pos = 0;
auto n_embd = inputs_map.n_embd;
@@ -382,48 +394,42 @@ void fill_batch( llama_batch& batch, input_t& input, inputs_map_t& inputs_map, i
}
// generation code, should handle all modalities easily
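// (modes: AR, NAR, NAR-len demasking, and duration/len inferencing)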
std::vector<llama_token> generate( llama_context* ctx, llama_model* model, llama_sampler* smpl, input_t& input, inputs_map_t& inputs_map, int max_tokens, int mode, bool verbose ) {
llama_batch batch = llama_batch_init( 22500, inputs_map.n_embd, 22500 );
// Decoding loop
const auto t_main_start = ggml_time_us();
int n_decode = 0;
std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* model, llama_sampler* smpl, input_t& input, inputs_map_t& inputs_map, int max_tokens, int mode, bool verbose ) {
int rvq_l = input.rvq_l;
llama_token stop_token = -1;
int n_decode = 0; // number of tokens decoded
int n_outputs = 0; // number of output tokens to expect
int n_vocab = 0;
int n_embd = 0;
bool causal = true; // sample autoregressively or not
const float* embds = NULL; // embeddings to map output tokens through
ranges_t range; // I/O range
// create batch (targetting embeddings instead of tokens)
llama_batch batch = llama_batch_init( CTX_SIZE, inputs_map.n_embd, CTX_SIZE );
fill_batch( batch, input, inputs_map, mode );
// determine how many logits we need
int n_logits = 0;
// determine how many outputs we need
for ( auto i = 0; i < batch.n_tokens; ++i ) {
if ( batch.logits[i] ) ++n_logits;
if ( batch.logits[i] ) ++n_outputs;
}
if ( verbose ) printf("Prompt size: %i | Outputs: %i\n", batch.n_tokens, n_logits);
if ( verbose ) printf("Prompt size: %i | Outputs: %i\n", batch.n_tokens, n_outputs);
// NAR mode, cap at one step
if ( n_logits > 1 ) {
max_tokens = n_logits;
}
if ( n_logits == 0 ) {
// bail out
if ( n_outputs == 0 ) {
fprintf(stderr, "%s : no tokens to decode\n", __func__);
return {};
}
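// a single output means we're sampling autoregressively (AR / len inferencing);
// multiple outputs mean every position gets predicted in one non-causal (NAR) pass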
causal = n_outputs == 1;
const float* embds = NULL;
ranges_t range;
// AR mode
std::string embd_name = "";
if ( mode == INFERENCE_MODE_AR ) {
auto& embeddings = vall_e_inputs_map_get_embeddings(inputs_map, "resps|AR:0:0");
range = embeddings.range;
embds = embeddings.embds.data();
stop_token = range.end - range.start - 1;
printf("Generating in %s (%i) mode (%i:%i) (%i)\n", "AR", range.classifier_idx, range.start, range.end, stop_token);
embd_name = "resps|AR:0:0";
// NAR mode
} else if ( mode == INFERENCE_MODE_NAR ) {
std::string k_embds[] = {
"resps|NAR:0:0", // invalid
"resps|NAR:0:0", // invalid, should never be picked
"resps|NAR:0:1",
"resps|NAR:1:2",
"resps|NAR:2:3",
@@ -432,88 +438,237 @@ std::vector<llama_token> generate( llama_context* ctx, llama_model* model, llama
"resps|NAR:5:6",
"resps|NAR:6:7",
};
auto& embeddings = vall_e_inputs_map_get_embeddings(inputs_map, k_embds[rvq_l]);
range = embeddings.range;
embds = embeddings.embds.data();
printf("Generating in %s (%i) mode (%i:%i)\n", "NAR", range.classifier_idx, range.start, range.end);
embd_name = k_embds[rvq_l];
// duration inferencing mode
} else if ( mode == INFERENCE_MODE_LEN ) {
auto& embeddings = vall_e_inputs_map_get_embeddings(inputs_map, "len");
range = embeddings.range;
embds = embeddings.embds.data();
stop_token = range.end - range.start - 1;
printf("Generating in %s (%i) mode (%i:%i) (%i)\n", "len", range.classifier_idx, range.start, range.end, stop_token);
embd_name = "len";
// NAR-len (demasking) inferencing mode
} else if ( mode == INFERENCE_MODE_NAR_DEMASK ) {
auto& embeddings = vall_e_inputs_map_get_embeddings(inputs_map, "resps|NAR:0:0");
range = embeddings.range;
embds = embeddings.embds.data();
printf("Generating in %s (%i) mode (%i:%i)\n", "NAR-len", range.classifier_idx, range.start, range.end);
embd_name = "resps|NAR:0:0";
}
auto& embeddings = vall_e_inputs_map_get_embeddings(inputs_map, embd_name);
range = embeddings.range;
embds = embeddings.embds.data();
n_embd = embeddings.n_embd;
n_vocab = embeddings.n_vocab;
stop_token = range.end - range.start - 1;
printf("Generating in %s (%i) mode (%i:%i) (%i)\n", embd_name.c_str(), range.classifier_idx, range.start, range.end, stop_token);
// update model's output heads / causal mode
#if LLAMA_CPP_USE_VALL_E_ARCH
auto& userdata = *llama_get_vall_e_userdata( model );
llama_set_output_head( model, userdata.heads[range.classifier_idx] );
#endif
llama_set_causal_attn( ctx, n_logits == 1 );
llama_set_causal_attn( ctx, causal );
// to-do: fix GGML_ASSERT(mask->ne[0] == a->ne[0])
std::vector<llama_token> output_tokens;
while ( output_tokens.size() < max_tokens ) {
if (llama_decode(ctx, batch)) {
const auto t_main_start = ggml_time_us();
// if INFERENCE_MODE_AR || INFERENCE_MODE_LEN
if ( causal ) {
output_tokens.reserve(max_tokens);
if ( verbose ) {
printf("[");
fflush(stdout);
}
while ( output_tokens.size() < max_tokens ) {
if ( llama_decode(ctx, batch) ) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
return output_tokens;
}
// ensures only tokens within our designated range are used
#if !LLAMA_CPP_USE_VALL_E_ARCH
auto* logits = llama_get_logits_ith( ctx, -1 );
for ( auto i = 0; i < inputs_map.n_vocab; ++i ) {
if ( i < range.start || i >= range.end ) logits[i] = -INFINITY;
}
#endif
// sample token
auto t = llama_sampler_sample(smpl, ctx, -1);
// is stop token
if ( t == stop_token ) {
break;
}
// store token
output_tokens.emplace_back(t);
// update batch with token
batch_add( batch, t, inputs_map.n_embd, embds, output_tokens.size(), true );
if ( verbose ) {
printf("%i, ", t);
fflush(stdout);
}
}
if ( verbose ) {
printf("]\n");
fflush(stdout);
}
} else if ( mode == INFERENCE_MODE_NAR_DEMASK ) {
// to-do: assert n_outputs == input.resp[rvq_l-1].size()
const llama_token MASK_TOKEN = 1024; // token value for masking
const float PI = 3.141592653589793f;
// to-do: derive from sampling arguments
int32_t steps = 30; // number of demasking steps
int32_t seq_len = n_outputs;
float temperature = 1.5f;
float cfg_strength = 2.5f;
// fill with masked tokens
output_tokens.clear();
output_tokens.resize(n_outputs, MASK_TOKEN);
// for CFG
input_t null_input{};
null_input.phn = {1, 2}; // <bos></eos>
null_input.resp.resize(1);
llama_batch null_batch = llama_batch_init( CTX_SIZE, inputs_map.n_embd, CTX_SIZE );
// token scores to reference for masking
std::vector<float> scores(n_outputs, 1.0);
// do one step on many tokens
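// MaskGIT-style demasking: each step keeps the most confident tokens, remasks the
// least confident ones according to the cosine schedule below, and resamples only
// the masked positions, applying CFG against the null (unconditional) prompt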
for ( auto step = 0; step < steps; ++step ) {
if ( verbose ) {
printf("[%i/%i] [", step, steps);
fflush(stdout);
}
float timestep = (step + 1) / float(steps); // to-do: align with torch.linspace
float annealing = 1.0f - timestep;
float noise_p = cos( timestep * PI * 0.5f );
float remask_p = 0.5f / steps;
int32_t n_masked_tokens = std::max(int(noise_p * seq_len), 1); // always (re)mask at least one token
float sampling_temperature = temperature * annealing;
float sampling_cfg_strength = timestep * cfg_strength;
std::vector<bool> is_masked(n_outputs, false);
// remask the least-confident positions: argsort the indices by score, descending
std::vector<int32_t> masked_indices( n_outputs );
for ( auto i = 0; i < n_outputs; ++i ) masked_indices[i] = i;
std::sort( masked_indices.begin(), masked_indices.end(), [&scores]( int32_t a, int32_t b ) { return scores[a] > scores[b]; } );
masked_indices.resize( n_masked_tokens );
// mask off tokens
for ( auto& idx : masked_indices ) {
output_tokens[idx] = MASK_TOKEN;
}
// update token mask
for ( auto i = 0; i < n_outputs; ++i ) {
is_masked[i] = output_tokens[i] == MASK_TOKEN;
}
// update batch
// to-do: only update the embeddings instead
batch.n_tokens = 0;
input.resp[0] = output_tokens;
fill_batch( batch, input, inputs_map, mode );
// update null batch
null_input.resp[0] = output_tokens;
null_batch.n_tokens = 0;
fill_batch( null_batch, null_input, inputs_map, mode );
// to-do: update sampling temperature
// cfg decode
if ( llama_decode(ctx, null_batch) ) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
return output_tokens;
}
// copy the null (unconditional) logits for CFG
std::vector<float> null_logits(n_outputs * n_vocab, -INFINITY);
// to-do: copy once
for ( auto idx = 0; idx < n_outputs; ++idx ) {
memcpy( &null_logits[idx * n_vocab], llama_get_logits_ith( ctx, null_batch.n_tokens - n_outputs + idx ), sizeof(float) * n_vocab );
}
// decode
if ( llama_decode(ctx, batch) ) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
return output_tokens;
}
// to-do: figure out why all logits are the same for each token......
// the output logits sit at the tail of the batch, so index them relative to batch.n_tokens
for ( auto idx = 0; idx < n_outputs; ++idx ) {
// skip if not masked
if ( !is_masked[idx] )
continue;
// ensures only tokens within our designated range are used
auto* logits = llama_get_logits_ith( ctx, batch.n_tokens - n_outputs + idx );
auto* null_logit = &null_logits[idx * n_vocab];
#if !LLAMA_CPP_USE_VALL_E_ARCH
for ( auto i = 0; i < inputs_map.n_vocab; ++i ) {
if ( i < range.start || i >= range.end ) logits[i] = -INFINITY;
}
#endif
// perform softmax before modifying logits
std::vector<float> softmaxed = soft_max( n_vocab, logits );
// perform CFG sampling
for ( auto i = 0; i < n_vocab; ++i ) {
logits[i] = null_logit[i] + (logits[i] - null_logit[i]) * sampling_cfg_strength;
}
// sample ith token
auto t = llama_sampler_sample(smpl, ctx, batch.n_tokens - n_outputs + idx );
// store token if it was masked
output_tokens[idx] = t;
// update score if it was masked
scores[idx] = 1.0f - softmaxed[t]; // invert so we pick the worst tokens later
if ( verbose ) {
printf("%i, ", t);
fflush(stdout);
}
}
if ( verbose ) {
printf("\n");
fflush(stdout);
}
}
} else if ( mode == INFERENCE_MODE_NAR ) {
// to-do: assert n_outputs == input.resp[rvq_l-1].size()
output_tokens.reserve(n_outputs);
// do one step on many tokens
if ( llama_decode(ctx, batch) ) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
return output_tokens;
}
std::vector<llama_token> current_tokens;
// backwards iterate to start from beginning of sequence
for ( auto i = n_logits; i > 0; --i ) {
// filter logits
auto* logits = llama_get_logits_ith( ctx, -i );
// ensures only tokens within our designated range are used
// to-do: figure out why all logits are the same for each token......
// "reverse" iterate from backwards indexing
if ( verbose ) {
printf("[");
fflush(stdout);
}
for ( auto idx = 0; idx < n_outputs; ++idx ) {
// ensures only tokens within our designated range are used
#if !LLAMA_CPP_USE_VALL_E_ARCH
auto* logits = llama_get_logits_ith( ctx, batch.n_tokens - n_outputs + idx );
for ( auto i = 0; i < inputs_map.n_vocab; ++i ) {
if ( i < range.start || i >= range.end ) logits[i] = -INFINITY;
}
#endif
// sample the next token
printf("%i: %p\n [", -i, logits );
for ( auto i = 0; i < 1025; ++i ) {
printf("%f, ", logits[i]);
}
printf("]\n");
auto t = llama_sampler_sample(smpl, ctx, -i);
//printf("%i: [%i]: %f | %p\n", -i, t, logits[t], logits );
// offset back into range
#if !LLAMA_CPP_USE_VALL_E_ARCH
t -= range.start;
#endif
n_decode += 1;
// is stop token
if ( t == stop_token ) {
printf("STOPPED\n");
max_tokens = 0;
break;
}
// sample ith token
auto t = llama_sampler_sample(smpl, ctx, batch.n_tokens - n_outputs + idx);
// store token
current_tokens.emplace_back(t);
// update batch with token
batch_add( batch, t, inputs_map.n_embd, embds, output_tokens.size(), true );
output_tokens.emplace_back(t);
if ( verbose ) {
printf("%i, ", t);
fflush(stdout);
}
}
printf("%s: Tokens: [", __func__);
for ( auto& token : current_tokens ) {
printf("%i, ", token);
if ( verbose ) {
printf("]\n");
fflush(stdout);
}
printf("]\n");
output_tokens.insert(output_tokens.end(), current_tokens.begin(), current_tokens.end());
}
const auto t_main_end = ggml_time_us();
if ( verbose ) {
@@ -535,7 +690,7 @@ std::vector<llama_token> generate( llama_context* ctx, llama_model* model, llama
int main( int argc, char** argv ) {
// to-do: replace all of this with proper loading code
int32_t ngl = 0;
int modality = MODALITY_AR_NAR;
int modality = MODALITY_NAR_LEN;
input_t input{};
inputs_map_t inputs_map{};
@@ -632,7 +787,7 @@ int main( int argc, char** argv ) {
// NAR-len demasking
if ( modality == MODALITY_NAR_LEN ) {
// inference len
int len = 0;
int len = 290;
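// a non-zero len skips the duration inferencing below and fixes the output length directly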
if ( !len ) {
input.task = "len";
output_tokens = generate( ctx, model, smpl_nar, input, inputs_map, 5, INFERENCE_MODE_LEN );

View File

@@ -1,23 +1,24 @@
#pragma once
#include "llama-vocab.h"
#include "llama.h"
#include "encodec.h"
#include "dr_wav.h"
#include <cmath>
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>
#include <array>
#include <unordered_map>
#include <iostream>
// to-do: copy over import/export stuff from engine project (because I don't remember how I set it up in <uf/config.h>)
#define VALL_E_API
#define LLAMA_CPP_EXTENDED 1 // whether the underlying llama.cpp has some extra functions
#define LLAMA_CPP_USE_VALL_E_ARCH 1 // whether the underlying llama.cpp is to use the VALL_E arch (or using LLAMA arch)
#if !LLAMA_CPP_EXTENDED
#include "_llama.h" // cringe hotfix but I have to do this until llama.cpp's API exposes the tok_embd
#endif
// to-do: clean up spaghetti enums
const int EMBEDDING_MODE_PROM = 0;
const int EMBEDDING_MODE_RESP_AR_NAR = 1;
@@ -106,6 +107,7 @@ struct inputs_map_t {
std::vector<float> VALL_E_API read_2d_tensor( struct ggml_tensor* tensor );
std::vector<std::vector<float>> VALL_E_API map_embeddings( const std::vector<llama_token>& tokens, int n_embd, const float* embds );
std::vector<std::vector<float>> VALL_E_API sum_embeddings( const std::vector<std::vector<llama_token>>& input, int n_embd, int rvq_l, const float** embds, int mode = EMBEDDING_MODE_PROM );
std::vector<float> VALL_E_API soft_max( int n_logits, const float* logits );
// batch and inferencing
void VALL_E_API batch_add( llama_batch& batch, llama_token id, int n_embd, const float* embds, llama_pos pos, bool output, const std::vector<llama_seq_id> & seq_ids = {0} );