temp fix for vall_e.cpp demask scoring regression
This commit is contained in:
parent
44260f7445
commit
4a909ceff8
|
@ -23,8 +23,6 @@ Run `make`.
|
|||
|
||||
## To-Do
|
||||
|
||||
* [ ] fix regressions that appeared for whatever reason
|
||||
* it seems to be related to the demasking step: output is fine at low step counts but degrades as the step count increases
|
||||
* [x] converted model to GGUF
|
||||
* [x] convert it without modifying any of the existing code, as the tokenizer requires some care
|
||||
* [x] basic framework
|
||||
|
@ -42,6 +40,7 @@ Run `make`.
|
|||
* [x] `AR` sampling
|
||||
* [x] working `NAR-len` output
|
||||
* [x] `NAR-len` sampling
|
||||
* [ ] proper scoring
|
||||
* [x] working `NAR` output
|
||||
* [x] `NAR` sampling
|
||||
* [x] decode audio to disk
|
||||
|
|
|
@ -748,7 +748,17 @@ std::vector<token_t> generate( vall_e_context_t* ctx, vall_e_inputs_t& inputs, i
|
|||
// store token if it was masked
|
||||
output_tokens[idx] = t;
|
||||
// update score if it was masked
|
||||
scores[idx] = 1.0f - softmaxed[t]; // invert so we pick the worst tokens later
|
||||
|
||||
// this is actually wrong
|
||||
// scores[idx] = 1.0f - softmaxed[t]; // invert so we pick the worst tokens later
|
||||
|
||||
// this seems to work better
|
||||
float entropy = 0.f;
|
||||
for (int v = 0; v < n_vocab; ++v ) {
|
||||
float p = softmaxed[v];
|
||||
if (p > 0) entropy -= p * std::log(p + 1e-9);
|
||||
}
|
||||
scores[idx] = entropy / std::log(n_vocab); // normalize [0–1]
|
||||
}
|
||||
|
||||
llama_sampler_free(smpl);
|
||||
|
|
Loading…
Reference in New Issue
Block a user