diff --git a/vall_e.cpp/vall_e.cpp b/vall_e.cpp/vall_e.cpp index db539a3..5f72148 100644 --- a/vall_e.cpp/vall_e.cpp +++ b/vall_e.cpp/vall_e.cpp @@ -89,28 +89,7 @@ std::vector read_2d_tensor( struct ggml_tensor* tensor ) { return res; } -/* -ggml_tensor* view_2d_tensor( struct ggml_tensor* tensor, int32_t start, int32_t end, int32_t dim ) { - // to-do: implement other dim - if ( start < 0 ) start = tensor->ne[1] + start; - if ( end < 0 ) end = tensor->ne[1] + end; - - ggml_tensor* res = new ggml_tensor(); - memcpy( res, tensor, sizeof(ggml_tensor) ); - res->op = GGML_OP_VIEW; - res->src[0] = tensor; - - res->data += res->nb[1] * start; - res->ne[1] = end - start; - - for (int i = 2; i < GGML_MAX_DIMS; i++) { - res->nb[i] = res->nb[i - 1] * res->ne[i - 1]; - } - - return res; -} -*/ ggml_tensor* view_2d_tensor( struct ggml_context* ctx, struct ggml_tensor* tensor, int32_t start, int32_t end, int32_t dim ) { // to-do: implement other dim if ( start < 0 ) start = tensor->ne[1] + start; @@ -601,13 +580,14 @@ std::vector generate( vall_e_context_t* ctx, vall_e_inputs_t& inputs, i int32_t seq_len = n_outputs; int32_t top_k = 0; float top_p = 1.0; - float temperature = 1.0f; - float cfg_strength = 2.5f; + float temperature = 1.5f; + float cfg_strength = 3.0f; float start_noise = 0.0f; float end_noise = 1.0f; bool annealed_sampling = true; bool remasking = true; float cfg_rescale = 0.75f; + bool entropy_scoring = true; // fill with masked tokens output_tokens.clear(); @@ -621,7 +601,7 @@ std::vector generate( vall_e_context_t* ctx, vall_e_inputs_t& inputs, i llama_batch null_batch = llama_batch_init( ctx->params.ctx_size, ctx->io_map->n_embd, ctx->params.ctx_size ); // token scores to reference for masking - std::vector scores(n_outputs, 1.0); + std::vector scores(n_outputs, entropy_scoring ? 0.0 : 1.0); // do one step on many tokens for ( auto step = 0; step < steps; ++step ) { @@ -635,7 +615,7 @@ std::vector generate( vall_e_context_t* ctx, vall_e_inputs_t& inputs, i float sampling_cfg_strength = annealed_sampling ? timestep * cfg_strength : cfg_strength; float noise_p = cos( timestep * PI * 0.5f ); - float remask_p = remasking ? 0.5f / steps : 0.0f; + float remask_p = remasking ? 1.0f / (steps * 2.0f) : 0.0f; int32_t n_masked_tokens = (noise_p + remask_p) * seq_len; if ( n_masked_tokens < 1 ) { @@ -651,7 +631,9 @@ std::vector generate( vall_e_context_t* ctx, vall_e_inputs_t& inputs, i std::vector sorted_scores( n_outputs ); for ( auto i = 0; i < n_outputs; ++i ) sorted_scores[i] = { i, scores[i] }; std::sort(sorted_scores.begin(), sorted_scores.end()); - // std::reverse(sorted_scores.begin(), sorted_scores.end()); + if ( entropy_scoring) { + std::reverse(sorted_scores.begin(), sorted_scores.end()); + } // and top-k pick the worst scores for ( auto i = 0; i < n_masked_tokens; ++i ) { @@ -669,8 +651,8 @@ std::vector generate( vall_e_context_t* ctx, vall_e_inputs_t& inputs, i inputs.resp[0] = output_tokens; fill_batch( batch, inputs, *ctx->io_map, mode ); // update null batch - null_input.resp[0] = output_tokens; null_batch.n_tokens = 0; + null_input.resp[0] = output_tokens; fill_batch( null_batch, inputs, *ctx->io_map, mode ); // cfg decode @@ -707,7 +689,7 @@ std::vector generate( vall_e_context_t* ctx, vall_e_inputs_t& inputs, i for ( auto idx = 0; idx < n_outputs; ++idx ) { // skip if not masked if ( !is_masked[idx] ) { - scores[idx] = 0.0; + scores[idx] = entropy_scoring ? 0.0 : 1.0; continue; } @@ -716,6 +698,7 @@ std::vector generate( vall_e_context_t* ctx, vall_e_inputs_t& inputs, i // perform softmax before modifying logits std::vector softmaxed = soft_max( n_vocab, logit ); + int32_t t_u = std::distance( softmaxed.begin(), std::max_element(softmaxed.begin(), softmaxed.end()) ); std::vector summed(n_vocab); for (int i = 0; i < n_vocab; i++) { @@ -739,19 +722,16 @@ std::vector generate( vall_e_context_t* ctx, vall_e_inputs_t& inputs, i // store token if it was masked output_tokens[idx] = t; // update score if it was masked - - // this is actually wrong - scores[idx] = 1.0 - softmaxed[t]; // invert so we pick the worst tokens later - - // this seems to work better - /* - float entropy = 0.f; - for (int v = 0; v < n_vocab; ++v ) { - float p = softmaxed[v]; - if (p > 0) entropy -= p * std::log(p + 1e-9); + if ( entropy_scoring ) { + float entropy = 0.f; + for (int v = 0; v < n_vocab; ++v ) { + float p = softmaxed[v]; + if (p > 0) entropy -= p * std::log(p + 1e-9); + } + scores[idx] = entropy / std::log(n_vocab); // normalize [0–1] + } else { + scores[idx] = softmaxed[t_u]; // invert so we pick the worst tokens later } - scores[idx] = entropy / std::log(n_vocab); // normalize [0–1] - */ } llama_sampler_free(smpl);