more fixes...

This commit is contained in:
mrq 2024-12-23 21:56:19 -06:00
parent 681ef45cbc
commit 3d63349d96
3 changed files with 91 additions and 96 deletions

View File

@ -14,9 +14,9 @@ Run `make`.
### Required Modifications ### Required Modifications
[`encodec.cpp`](https://github.com/e-c-k-e-r/encodec.cpp) requires updating its GGML copy to the latest version, which requires a few lines to get the CPU backend working. [`encodec.cpp`](https://github.com/PABannier/encodec.cpp) requires updating its GGML copy to the latest version, which requires a few lines to get the CPU backend working (per my [fork](https://github.com/e-c-k-e-r/encodec.cpp)).
The only modification [`llama.cpp`](https://github.com/e-c-k-e-r/llama.cpp) might need is ensuring that a non-causal attention mask is used; everything else necessary can be hacked together with clever tricks. The only modification [`llama.cpp`](https://github.com/ggerganov/llama.cpp) might need is ensuring that a non-causal attention mask is used; everything else necessary can be hacked together with clever tricks.
## To-Do ## To-Do

View File

@ -87,17 +87,13 @@ ggml_tensor* VALL_E_API view_2d_tensor( struct ggml_context* ctx, struct ggml_te
return res; return res;
} }
void VALL_E_API print_tokens( const std::vector<llama_token>& tokens ) {
struct ggml_tensor * VALL_E_API vall_e_get_prom_embds( llama_vall_e_userdata& userdata, int32_t idx ) { printf("[");
return userdata.prom_embds[idx]; for ( auto i = 0; i < tokens.size(); ++i ) {
printf("%i%s", tokens[i], i + 1 < tokens.size() ? ", " : "");
}
printf("]\n");
} }
struct ggml_tensor * VALL_E_API vall_e_get_resp_embds( llama_vall_e_userdata& userdata, int32_t idx ) {
return userdata.resp_embds[idx];
}
struct ggml_tensor * VALL_E_API vall_e_get_aux_embds( llama_vall_e_userdata& userdata, int32_t idx ) {
return userdata.aux_embds[idx];
}
const io_t& VALL_E_API vall_e_inputs_map_get( io_map_t& io_map, const std::string& name ) { const io_t& VALL_E_API vall_e_inputs_map_get( io_map_t& io_map, const std::string& name ) {
return io_map.io[name]; return io_map.io[name];
@ -139,32 +135,32 @@ void VALL_E_API vall_e_inputs_map_init( io_map_t& io_map, llama_model* model ) {
io_map.io[entry.name].head = entry.head_idx < 0 ? NULL : userdata.heads[entry.head_idx]; io_map.io[entry.name].head = entry.head_idx < 0 ? NULL : userdata.heads[entry.head_idx];
} }
io_map.io["text"].embds = read_2d_tensor(vall_e_get_aux_embds(userdata, 0)); io_map.io["text"].embds = read_2d_tensor(userdata.aux_embds[0]);
io_map.io["rvq_l"].embds = read_2d_tensor(vall_e_get_aux_embds(userdata, 1)); io_map.io["rvq_l"].embds = read_2d_tensor(userdata.aux_embds[1]);
io_map.io["lang"].embds = read_2d_tensor(vall_e_get_aux_embds(userdata, 2)); io_map.io["lang"].embds = read_2d_tensor(userdata.aux_embds[2]);
io_map.io["task"].embds = read_2d_tensor(vall_e_get_aux_embds(userdata, 3)); io_map.io["task"].embds = read_2d_tensor(userdata.aux_embds[3]);
io_map.io["len"].embds = read_2d_tensor(vall_e_get_aux_embds(userdata, 4)); io_map.io["len"].embds = read_2d_tensor(userdata.aux_embds[4]);
io_map.io["tone"].embds = read_2d_tensor(vall_e_get_aux_embds(userdata, 5)); io_map.io["tone"].embds = read_2d_tensor(userdata.aux_embds[5]);
io_map.io["sep"].embds = read_2d_tensor(vall_e_get_aux_embds(userdata, 6)); io_map.io["sep"].embds = read_2d_tensor(userdata.aux_embds[6]);
io_map.io["prom|0"].embds = read_2d_tensor(vall_e_get_prom_embds(userdata, 0)); io_map.io["prom|0"].embds = read_2d_tensor(userdata.prom_embds[0]);
io_map.io["prom|1"].embds = read_2d_tensor(vall_e_get_prom_embds(userdata, 1)); io_map.io["prom|1"].embds = read_2d_tensor(userdata.prom_embds[1]);
io_map.io["prom|2"].embds = read_2d_tensor(vall_e_get_prom_embds(userdata, 2)); io_map.io["prom|2"].embds = read_2d_tensor(userdata.prom_embds[2]);
io_map.io["prom|3"].embds = read_2d_tensor(vall_e_get_prom_embds(userdata, 3)); io_map.io["prom|3"].embds = read_2d_tensor(userdata.prom_embds[3]);
io_map.io["prom|4"].embds = read_2d_tensor(vall_e_get_prom_embds(userdata, 4)); io_map.io["prom|4"].embds = read_2d_tensor(userdata.prom_embds[4]);
io_map.io["prom|5"].embds = read_2d_tensor(vall_e_get_prom_embds(userdata, 5)); io_map.io["prom|5"].embds = read_2d_tensor(userdata.prom_embds[5]);
io_map.io["prom|6"].embds = read_2d_tensor(vall_e_get_prom_embds(userdata, 6)); io_map.io["prom|6"].embds = read_2d_tensor(userdata.prom_embds[6]);
io_map.io["prom|7"].embds = read_2d_tensor(vall_e_get_prom_embds(userdata, 7)); io_map.io["prom|7"].embds = read_2d_tensor(userdata.prom_embds[7]);
io_map.io["resps|AR:0:0"].embds = read_2d_tensor(vall_e_get_resp_embds(userdata, 0)); io_map.io["resps|AR:0:0"].embds = read_2d_tensor(userdata.resp_embds[0]);
io_map.io["resps|NAR:0:1"].embds = read_2d_tensor(vall_e_get_resp_embds(userdata, 1)); io_map.io["resps|NAR:0:1"].embds = read_2d_tensor(userdata.resp_embds[1]);
io_map.io["resps|NAR:1:2"].embds = read_2d_tensor(vall_e_get_resp_embds(userdata, 2)); io_map.io["resps|NAR:1:2"].embds = read_2d_tensor(userdata.resp_embds[2]);
io_map.io["resps|NAR:2:3"].embds = read_2d_tensor(vall_e_get_resp_embds(userdata, 3)); io_map.io["resps|NAR:2:3"].embds = read_2d_tensor(userdata.resp_embds[3]);
io_map.io["resps|NAR:3:4"].embds = read_2d_tensor(vall_e_get_resp_embds(userdata, 4)); io_map.io["resps|NAR:3:4"].embds = read_2d_tensor(userdata.resp_embds[4]);
io_map.io["resps|NAR:4:5"].embds = read_2d_tensor(vall_e_get_resp_embds(userdata, 5)); io_map.io["resps|NAR:4:5"].embds = read_2d_tensor(userdata.resp_embds[5]);
io_map.io["resps|NAR:5:6"].embds = read_2d_tensor(vall_e_get_resp_embds(userdata, 6)); io_map.io["resps|NAR:5:6"].embds = read_2d_tensor(userdata.resp_embds[6]);
io_map.io["resps|NAR:6:7"].embds = read_2d_tensor(vall_e_get_resp_embds(userdata, 7)); io_map.io["resps|NAR:6:7"].embds = read_2d_tensor(userdata.resp_embds[7]);
io_map.io["resps|NAR:0:0"].embds = read_2d_tensor(vall_e_get_resp_embds(userdata, 8)); io_map.io["resps|NAR:0:0"].embds = read_2d_tensor(userdata.resp_embds[8]);
/* /*
@ -505,16 +501,12 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
// if INFERENCE_MODE_AR || INFERENCE_MODE_LEN // if INFERENCE_MODE_AR || INFERENCE_MODE_LEN
if ( causal ) { if ( causal ) {
output_tokens.reserve(max_tokens); output_tokens.reserve(max_tokens);
if ( verbose ) {
printf("[");
fflush(stdout);
}
while ( output_tokens.size() < max_tokens ) { while ( output_tokens.size() < max_tokens ) {
if ( llama_decode(ctx, batch) ) { if ( llama_decode(ctx, batch) ) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
return output_tokens; return output_tokens;
} }
llama_kv_cache_clear(ctx); llama_kv_cache_clear(ctx); // necessary for many reasons
// sample token // sample token
auto t = llama_sampler_sample(smpl, ctx, -1); auto t = llama_sampler_sample(smpl, ctx, -1);
@ -528,21 +520,15 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
output_tokens.emplace_back(t); output_tokens.emplace_back(t);
// update batch with token // update batch with token
batch_add( batch, t, io_map.n_embd, embds, output_tokens.size(), true ); batch_add( batch, t, io_map.n_embd, embds, output_tokens.size(), true );
if ( verbose ) {
printf("%i, ", t); if ( verbose ) print_tokens( output_tokens );
fflush(stdout);
}
}
if ( verbose ) {
printf("]\n");
fflush(stdout);
} }
} else if ( mode == INFERENCE_MODE_NAR_DEMASK ) { } else if ( mode == INFERENCE_MODE_NAR_DEMASK ) {
// to-do: assert n_outputs == input.resp[rvq_l-1].size() // to-do: assert n_outputs == input.resp[rvq_l-1].size()
const llama_token MASK_TOKEN = 1024; // token value for masking const llama_token MASK_TOKEN = 1024; // token value for masking
const float PI = 3.141592653589793f; const float PI = 3.141592653589793f;
// to-do: derive from sampling arguments // to-do: derive from sampling arguments
int32_t steps = 30; // number of demasking steps int32_t steps = 10; // number of demasking steps
int32_t seq_len = n_outputs; int32_t seq_len = n_outputs;
float temperature = 1.5f; float temperature = 1.5f;
float cfg_strength = 2.5f; float cfg_strength = 2.5f;
@ -563,25 +549,37 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
// do one step on many tokens // do one step on many tokens
for ( auto step = 0; step < steps; ++step ) { for ( auto step = 0; step < steps; ++step ) {
if ( verbose ) { float timestep = ((float)step) / steps; // to-do: align with torch.linspace
printf("[%i/%i] [", step, steps);
fflush(stdout);
}
float timestep = (step+1) / steps; // to-do: align with torch.linspace
float annealing = 1.0f - timestep; float annealing = 1.0f - timestep;
float noise_p = cos( timestep * PI * 0.5f );
float remask_p = 0.5f / steps;
int32_t n_masked_tokens = std::min(int(noise_p * seq_len), 1);
float sampling_temperature = temperature * annealing; float sampling_temperature = temperature * annealing;
float sampling_cfg_strength = timestep * cfg_strength; float sampling_cfg_strength = timestep * cfg_strength;
std::vector<bool> is_masked(n_outputs, false); float noise_p = cos( timestep * PI * 0.5f );
std::vector<int32_t> masked_indices; float remask_p = 0.0f; // 0.5f / steps;
masked_indices.reserve(n_masked_tokens);
std::vector<float> sorted = scores; int32_t n_masked_tokens = (noise_p + remask_p) * seq_len;
if ( n_masked_tokens < 1 ) {
n_masked_tokens = 1;
}
if ( n_masked_tokens > n_outputs ) {
n_masked_tokens = n_outputs;
}
// sort previous scores
std::vector<score_t> sorted( n_outputs );
for ( auto i = 0; i < n_outputs; ++i ) {
sorted[i] = { i, scores[i] };
}
std::sort(sorted.begin(), sorted.end()); std::sort(sorted.begin(), sorted.end());
masked_indices.insert( masked_indices.end(), sorted.begin(), sorted.begin() + n_masked_tokens ); // and top-k pick the worst scores
std::vector<int32_t> masked_indices( n_masked_tokens );
for ( auto i = 0; i < n_masked_tokens; ++i ) {
masked_indices[i] = sorted[i].idx;
}
std::vector<bool> is_masked(n_outputs, false);
// mask off tokens // mask off tokens
for ( auto& idx : masked_indices ) { for ( auto& idx : masked_indices ) {
@ -592,6 +590,8 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
is_masked[i] = output_tokens[i] == MASK_TOKEN; is_masked[i] = output_tokens[i] == MASK_TOKEN;
} }
if ( verbose ) print_tokens( output_tokens );
// update batch // update batch
// to-do: only update the embeddings instead // to-do: only update the embeddings instead
batch.n_tokens = 0; batch.n_tokens = 0;
@ -602,14 +602,12 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
null_batch.n_tokens = 0; null_batch.n_tokens = 0;
fill_batch( null_batch, input, io_map, mode ); fill_batch( null_batch, input, io_map, mode );
// to-do: update sampling temperature
// cfg decode // cfg decode
if ( llama_decode(ctx, null_batch) ) { if ( llama_decode(ctx, null_batch) ) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
return output_tokens; return output_tokens;
} }
llama_kv_cache_clear(ctx); llama_kv_cache_clear(ctx); // necessary for many reasons
// copy null probabilities // copy null probabilities
std::vector<float> null_logits(n_outputs * n_vocab, -INFINITY); std::vector<float> null_logits(n_outputs * n_vocab, -INFINITY);
// to-do: copy once // to-do: copy once
@ -622,7 +620,17 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
return output_tokens; return output_tokens;
} }
llama_kv_cache_clear(ctx); llama_kv_cache_clear(ctx); // necessary for many reasons
auto sparams = llama_sampler_chain_default_params();
sparams.no_perf = false;
llama_sampler * smpl = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl, llama_sampler_init_top_k(0));
llama_sampler_chain_add(smpl, llama_sampler_init_top_p(1.0, 1));
llama_sampler_chain_add(smpl, llama_sampler_init_temp (sampling_temperature));
llama_sampler_chain_add(smpl, llama_sampler_init_dist (1130));
// to-do: figure out why all logits are the same for each token...... // to-do: figure out why all logits are the same for each token......
// "reverse" iterate from backwards indexing // "reverse" iterate from backwards indexing
for ( auto idx = 0; idx < n_outputs; ++idx ) { for ( auto idx = 0; idx < n_outputs; ++idx ) {
@ -645,18 +653,12 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
// store token if it was masked // store token if it was masked
output_tokens[idx] = t; output_tokens[idx] = t;
// update score if it was masked // update score if it was masked
scores[idx] = 1.0f - softmaxed[t]; // invert so we pick the worst tokens later scores[idx] = softmaxed[t]; // invert so we pick the worst tokens later
if ( verbose ) {
printf("%i, ", t);
fflush(stdout);
}
}
if ( verbose ) {
printf("\n");
fflush(stdout);
} }
llama_sampler_free(smpl);
if ( verbose ) print_tokens( output_tokens );
} }
} else if ( mode == INFERENCE_MODE_NAR ) { } else if ( mode == INFERENCE_MODE_NAR ) {
// to-do: assert n_outputs == input.resp[rvq_l-1].size() // to-do: assert n_outputs == input.resp[rvq_l-1].size()
@ -666,28 +668,17 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1); fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
return output_tokens; return output_tokens;
} }
llama_kv_cache_clear(ctx); llama_kv_cache_clear(ctx); // necessary for many reasons
// to-do: figure out why all logits are the same for each token...... // to-do: figure out why all logits are the same for each token......
// "reverse" iterate from backwards indexing // "reverse" iterate from backwards indexing
if ( verbose ) {
printf("[");
fflush(stdout);
}
for ( auto idx = 0; idx < n_outputs; ++idx ) { for ( auto idx = 0; idx < n_outputs; ++idx ) {
// sample ith token // sample ith token
auto t = llama_sampler_sample(smpl, ctx, batch.n_tokens - n_outputs + idx); auto t = llama_sampler_sample(smpl, ctx, batch.n_tokens - n_outputs + idx);
// store token // store token
output_tokens.emplace_back(t); output_tokens.emplace_back(t);
if ( verbose ) {
printf("%i, ", t);
fflush(stdout);
}
}
if ( verbose ) {
printf("]\n");
fflush(stdout);
} }
if ( verbose ) print_tokens( output_tokens );
} }
const auto t_main_end = ggml_time_us(); const auto t_main_end = ggml_time_us();

View File

@ -101,10 +101,18 @@ struct io_map_t {
ggml_context* ctx = NULL; ggml_context* ctx = NULL;
}; };
// Pairs an index with a float value so a collection of entries can be
// ordered by value (e.g. via std::sort) while remembering each entry's index.
struct score_t {
	int32_t idx;  // index this entry refers to
	float value;  // value the entry is ordered by

	// Orders entries by `value` only (ascending under std::sort); `idx` is not compared.
	bool operator<( const score_t& other ) const {
		return value < other.value;
	}
};
// helper tensor functions // helper tensor functions
std::vector<float> VALL_E_API read_2d_tensor( struct ggml_tensor* tensor ); std::vector<float> VALL_E_API read_2d_tensor( struct ggml_tensor* tensor );
ggml_tensor* VALL_E_API view_2d_tensor( ggml_tensor* tensor, int32_t start, int32_t end, int32_t dim = 0 ); // cringe method to keep in my pocket ggml_tensor* VALL_E_API view_2d_tensor( ggml_tensor* tensor, int32_t start, int32_t end, int32_t dim = 0 ); // cringe method to keep in my pocket
ggml_tensor* VALL_E_API view_2d_tensor( ggml_context* ctx, ggml_tensor* tensor, int32_t start, int32_t end, int32_t dim = 0 ); ggml_tensor* VALL_E_API view_2d_tensor( ggml_context* ctx, ggml_tensor* tensor, int32_t start, int32_t end, int32_t dim = 0 );
void VALL_E_API print_tokens( const std::vector<llama_token>& tokens );
std::vector<std::vector<float>> VALL_E_API map_embeddings( const std::vector<llama_token>& tokens, int n_embd, const float* embds ); std::vector<std::vector<float>> VALL_E_API map_embeddings( const std::vector<llama_token>& tokens, int n_embd, const float* embds );
std::vector<std::vector<float>> VALL_E_API sum_embeddings( const std::vector<std::vector<llama_token>>& input, int n_embd, int rvq_l, const float** embds, int mode = EMBEDDING_MODE_PROM ); std::vector<std::vector<float>> VALL_E_API sum_embeddings( const std::vector<std::vector<llama_token>>& input, int n_embd, int rvq_l, const float** embds, int mode = EMBEDDING_MODE_PROM );
@ -125,8 +133,4 @@ std::vector<float> VALL_E_API decode_audio( struct encodec_context* ectx, const
const io_t& VALL_E_API vall_e_inputs_map_get_embeddings( io_map_t& inputs_map, const std::string& name ); const io_t& VALL_E_API vall_e_inputs_map_get_embeddings( io_map_t& inputs_map, const std::string& name );
const float* VALL_E_API vall_e_inputs_map_get_embeddings_p( io_map_t& inputs_map, const std::string& name ); const float* VALL_E_API vall_e_inputs_map_get_embeddings_p( io_map_t& inputs_map, const std::string& name );
int32_t VALL_E_API vall_e_inputs_map_get_classifier_idx( io_map_t& inputs_map, const std::string& name ); int32_t VALL_E_API vall_e_inputs_map_get_classifier_idx( io_map_t& inputs_map, const std::string& name );
void VALL_E_API vall_e_inputs_map_init( io_map_t&, llama_model* model ); void VALL_E_API vall_e_inputs_map_init( io_map_t&, llama_model* model );
struct ggml_tensor * VALL_E_API vall_e_get_prom_embds( llama_vall_e_userdata& userdata, int32_t idx );
struct ggml_tensor * VALL_E_API vall_e_get_resp_embds( llama_vall_e_userdata& userdata, int32_t idx );
struct ggml_tensor * VALL_E_API vall_e_get_aux_embds( llama_vall_e_userdata& userdata, int32_t idx );