From 4a909ceff8767e18c5d2d1192cf19a6266eb5af6 Mon Sep 17 00:00:00 2001
From: mrq
Date: Sat, 5 Apr 2025 11:04:26 -0500
Subject: [PATCH] temp fix for vall_e.cpp demask scoring regression

---
 vall_e.cpp/README.md  |  3 +--
 vall_e.cpp/vall_e.cpp | 12 +++++++++++-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/vall_e.cpp/README.md b/vall_e.cpp/README.md
index 197d9e3..8344737 100644
--- a/vall_e.cpp/README.md
+++ b/vall_e.cpp/README.md
@@ -23,8 +23,6 @@ Run `make`.
 
 ## To-Do
 
-* [ ] fix regressions that appeared for whatever reason
-  * it seems to be related to the demasking step, as low steps = fine, more steps = bad......
 * [x] converted model to GGUF
   * [x] convert it without modifying any of the existing code, as the tokenizer requires some care
 * [x] basic framework
@@ -42,6 +40,7 @@ Run `make`.
   * [x] `AR` sampling
 * [x] working `NAR-len` output
   * [x] `NAR-len` sampling
+    * [ ] proper scoring
 * [x] working `NAR` output
   * [x] `NAR` sampling
 * [x] decode audio to disk
diff --git a/vall_e.cpp/vall_e.cpp b/vall_e.cpp/vall_e.cpp
index 5c69bfb..a824544 100644
--- a/vall_e.cpp/vall_e.cpp
+++ b/vall_e.cpp/vall_e.cpp
@@ -748,7 +748,17 @@ std::vector generate( vall_e_context_t* ctx, vall_e_inputs_t& inputs, i
 			// store token if it was masked
 			output_tokens[idx] = t;
 			// update score if it was masked
-			scores[idx] = 1.0f - softmaxed[t]; // invert so we pick the worst tokens later
+
+			// this is actually wrong
+			// scores[idx] = 1.0f - softmaxed[t]; // invert so we pick the worst tokens later
+
+			// this seems to work better
+			float entropy = 0.f;
+			for (int v = 0; v < n_vocab; ++v ) {
+				float p = softmaxed[v];
+				if (p > 0) entropy -= p * std::log(p + 1e-9);
+			}
+			scores[idx] = entropy / std::log(n_vocab); // normalize to [0, 1]
 		}
 
 		llama_sampler_free(smpl);