From 681ef45cbc180ca4bba877334d3876d1121eb13e Mon Sep 17 00:00:00 2001
From: mrq
Date: Mon, 23 Dec 2024 21:16:24 -0600
Subject: [PATCH] nvm fixed

---
 vall_e.cpp/README.md  |  3 +--
 vall_e.cpp/vall_e.cpp | 26 +++++++++++---------------
 2 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/vall_e.cpp/README.md b/vall_e.cpp/README.md
index 916df18..c8158d3 100644
--- a/vall_e.cpp/README.md
+++ b/vall_e.cpp/README.md
@@ -2,7 +2,7 @@
 
 This is an implementation that makes use of [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [encodec.cpp](https://github.com/PABannier/encodec.cpp).
 
-At the moment it's ***very*** barebones as I try and wrestle with `llama.cpp`'s API without needing to modify its code.
+At the moment it's ***very*** work in progress.
 
 ## Build
 
@@ -22,7 +22,6 @@ Run `make`.
 
 * [x] converted model to GGUF
   * [ ] convert it without modifying any of the existing code, as the tokenizer requires some care
-  * [ ] *actually* convert the model properly, as the embeddings differ from the real model
 * [x] basic framework
   * [x] load the quantized model
   * [x] orchestrate the required embeddings
diff --git a/vall_e.cpp/vall_e.cpp b/vall_e.cpp/vall_e.cpp
index 0fbdd0f..33bea36 100644
--- a/vall_e.cpp/vall_e.cpp
+++ b/vall_e.cpp/vall_e.cpp
@@ -40,10 +40,9 @@ std::vector<float> VALL_E_API read_2d_tensor( struct ggml_tensor* tensor ) {
 	size_t size = tensor->ne[0] * tensor->ne[1];
 	std::vector<float> res( size );
 
-	auto* qtype = ggml_get_type_traits(tensor->type);
-	// dequantize if needed
-	if ( ggml_is_quantized(tensor->type) ) {
-		qtype->to_float(tensor->data, res.data(), res.size());
+	auto* type_trait = ggml_get_type_traits(tensor->type);
+	if ( type_trait->to_float ) {
+		type_trait->to_float(tensor->data, res.data(), res.size());
 	} else {
 		memcpy( res.data(), tensor->data, res.size() * sizeof(float) );
 	}
@@ -168,15 +167,16 @@ void VALL_E_API vall_e_inputs_map_init( io_map_t& io_map, llama_model* model ) {
 
 	io_map.io["resps|NAR:0:0"].embds = read_2d_tensor(vall_e_get_resp_embds(userdata, 8));
 
+	/*
 	for ( auto& entry : io_ranges ) {
 		for ( auto i = 0; i < 32; ++i ) printf("%s: %i: %f\n", entry.name.c_str(), i, io_map.io[entry.name].embds[i] );
 	}
+	*/
 
 #else
 	auto* embds = llama_get_embedding_weights( model );
 	auto* heads = llama_get_output_head_tensor( model );
 
 	// prepare slices
-	// std::vector<float> raw_embeddings = read_2d_tensor( embds );
 	for ( auto& entry : io_ranges ) {
 		io_map.io[entry.name] = entry;
@@ -184,16 +184,6 @@ void VALL_E_API vall_e_inputs_map_init( io_map_t& io_map, llama_model* model ) {
 		io_map.io[entry.name].n_vocab = entry.end - entry.start;
 		io_map.io[entry.name].embds = read_2d_tensor(view_2d_tensor( io_map.ctx, embds, entry.start, entry.end ));
 		io_map.io[entry.name].head = entry.head_idx < 0 ? NULL : view_2d_tensor( io_map.ctx, heads, entry.start, entry.end );
-
-		// these two differ after the first embedding and I don't know why.........
-		/*
-		auto raw_embd = std::vector<float>( raw_embeddings.data() + entry.start * n_embd, raw_embeddings.data() + entry.end * n_embd );
-		auto sliced_embd = read_2d_tensor( embd_tensor );
-
-		io_map.io[entry.name].embds = raw_embd;
-
-		for ( auto i = 0; i < 32; ++i ) printf("%s: %i: %f == %f \n", entry.name.c_str(), i, raw_embd[i], sliced_embd[i] );
-		*/
 	}
 #endif
 }
@@ -524,6 +514,7 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
 			fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
 			return output_tokens;
 		}
+		llama_kv_cache_clear(ctx);
 
 		// sample token
 		auto t = llama_sampler_sample(smpl, ctx, -1);
@@ -618,6 +609,7 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
 			fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
 			return output_tokens;
 		}
+		llama_kv_cache_clear(ctx);
 
 		// copy null probabilities
 		std::vector<float> null_logits(n_outputs * n_vocab, -INFINITY); // to-do: copy once
@@ -630,6 +622,7 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
 			fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
 			return output_tokens;
 		}
+		llama_kv_cache_clear(ctx);
 		// to-do: figure out why all logits are the same for each token......
 		// "reverse" iterate from backwards indexing
 		for ( auto idx = 0; idx < n_outputs; ++idx ) {
@@ -657,11 +650,13 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
 				printf("%i, ", t);
 				fflush(stdout);
 			}
+		}
 
 		if ( verbose ) {
 			printf("\n");
 			fflush(stdout);
 		}
+	}
 
 	} else if ( mode == INFERENCE_MODE_NAR ) {
 		// to-do: assert n_outputs == input.resp[rvq_l-1].size()
@@ -671,6 +666,7 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
 			fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
 			return output_tokens;
 		}
+		llama_kv_cache_clear(ctx);
 		// to-do: figure out why all logits are the same for each token......
 		// "reverse" iterate from backwards indexing
 		if ( verbose ) {
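
Note on the `read_2d_tensor` hunk above: here is a minimal, self-contained sketch of the type-traits path it switches to, assuming a recent ggml where `ggml_get_type_traits()` exposes a nullable `to_float` callback. The helper name `tensor_to_floats` is illustrative, not from the repo. Gating on `to_float` rather than `ggml_is_quantized()` also routes F16/BF16 tensors through a proper conversion instead of a raw `memcpy`, which is presumably why the "embeddings differ from the real model" to-do could be dropped from the README.

// Sketch only: mirrors the patched read_2d_tensor() logic for a 2D ggml tensor.
#include <cstring>
#include <vector>
#include "ggml.h"

static std::vector<float> tensor_to_floats( const struct ggml_tensor* tensor ) {
	const size_t n = tensor->ne[0] * tensor->ne[1];
	std::vector<float> out( n );

	const auto* traits = ggml_get_type_traits( tensor->type );
	if ( traits->to_float ) {
		// quantized types *and* F16/BF16 provide to_float; convert row-wise into floats
		traits->to_float( tensor->data, out.data(), n );
	} else {
		// only plain F32 lacks to_float, so a raw copy is safe here
		memcpy( out.data(), tensor->data, n * sizeof(float) );
	}
	return out;
}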
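
Likewise, a hedged sketch of the decode-then-clear pattern the patch adds after each `llama_decode()` call in `generate()`. The wrapper name and error handling are illustrative; clearing the cache after every call suggests each AR/NAR step re-submits the sequence it needs rather than reusing cached state, so leftover KV entries from the previous evaluation are discarded.

// Sketch only: evaluate a batch, then drop the KV cache so the next pass starts clean.
#include <cstdio>
#include "llama.h"

static bool eval_and_reset( llama_context* ctx, const llama_batch& batch ) {
	if ( llama_decode( ctx, batch ) != 0 ) {
		fprintf( stderr, "llama_decode failed\n" );
		return false;
	}
	// discard the entries this call wrote; nothing is carried over to the next evaluation
	llama_kv_cache_clear( ctx );
	return true;
}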