more work on vall_e.cpp (some more cleanup, NAR-len demasking, but still need to iron out some kinks)

This commit is contained in:
mrq 2024-12-23 17:20:04 -06:00
parent a6945f981d
commit 6ecdb715b6
3 changed files with 279 additions and 131 deletions

View File

@@ -15,17 +15,8 @@ Run `make`.
### Required Modifications
[`encodec.cpp`](https://github.com/e-c-k-e-r/encodec.cpp) requires updating its GGML copy to the latest version, which takes a few extra line changes to get the CPU backend working.
[`llama.cpp`](https://github.com/e-c-k-e-r/llama.cpp) *might* not require any modifications, but:
* `llm.build_vall_e` can mostly copy `llm.build_llama`, but with:
* `KQ_mask = build_inp_KQ_mask( lctx.cparams.causal_attn )`
* a unified output head (pain)
* OR adjusting the `model.output` to the correct classifier head (better option)
* OR slicing that tensor with the right range (`ggml_view_2d` confuses me)
* both also require `*const_cast<uint32_t*>(&ctx->model.hparams.n_vocab) = output->ne[1];` because the logits are tied to `n_vocab`
* commenting out `GGML_ABORT("input/output layer tensor %s used with a layer number", tn.str().c_str());` because grabbing embeddings/classifiers requires using `bid` to trick it into thinking it's part of a layer
* some helper functions to retrieve the embeddings tensor from the model
* some helper functions to set the target classifier head
* some fix for `GGML_ASSERT(mask->ne[0] == a->ne[0])` when using a non-causal attention mask (or I can test on the model that had a causal NAR......)
[`llama.cpp`](https://github.com/e-c-k-e-r/llama.cpp) *might* not require any modifications, but implementing `LLM_ARCH_VALL_E` requires some surgery.
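For reference, a rough sketch of what that surgery boils down to is below; the `KQ_mask` line and the `n_vocab` patch are taken from the notes above, while the body of `llama_set_output_head` and the member accesses are only illustrative of the fork's changes, not a drop-in patch:

```cpp
// in llm.build_vall_e (mostly a copy of llm.build_llama): build the attention mask
// from the runtime flag so the NAR passes can run with a non-causal mask
struct ggml_tensor * KQ_mask = build_inp_KQ_mask( lctx.cparams.causal_attn );

// helper to retarget the output head to one of the stored classifier heads;
// the logits buffer is sized from n_vocab, so it has to follow the selected head
void llama_set_output_head( struct llama_model * model, struct ggml_tensor * head ) {
	model->output = head; // illustrative: assumes direct access to model internals
	*const_cast<uint32_t*>(&model->hparams.n_vocab) = head->ne[1];
}
```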
## To-Do
@@ -46,11 +37,11 @@ Run `make`.
* [x] `AR` sampling
* currently need a model that didn't regress with the `AR:0:0` output
* [ ] working `NAR-len` output
* [ ] `NAR-len` sampling
currently cannot run inference with `causal_attn` disabled
* [x] `NAR-len` sampling
* need to assert that a non-causal mask is used
* [ ] working `NAR` output
* [x] `NAR` sampling
currently cannot run inference with `causal_attn` disabled
* need to assert that a non-causal mask is used
* [x] decode audio to disk
* [ ] a functional CLI
* [ ] actually make it work

View File

@@ -1,14 +1,11 @@
#define DR_WAV_IMPLEMENTATION
#include "vall_e.h"
#define LLAMA_CPP_EXTENDED 1 // whether the underlying llama.cpp has some extra functions
#define LLAMA_CPP_USE_VALL_E_ARCH 1 // whether the underlying llama.cpp is to use the VALL_E arch (or using LLAMA arch)
#if !LLAMA_CPP_EXTENDED
#include "_llama.h" // cringe hotfix but I have to do this until llama.cpp's API exposes the tok_embd
#endif
#include <cmath>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <algorithm>
ranges_t io_ranges[] = {
{ "text", 0, 256, 9, },
@@ -39,7 +36,7 @@ ranges_t io_ranges[] = {
{ "resps|NAR:0:0", 16677, 17702, 8, },
};
std::vector<float> read_2d_tensor( struct ggml_tensor* tensor ) {
std::vector<float> VALL_E_API read_2d_tensor( struct ggml_tensor* tensor ) {
size_t size = tensor->ne[0] * tensor->ne[1];
std::vector<float> res( size );
@@ -55,29 +52,29 @@ std::vector<float> read_2d_tensor( struct ggml_tensor* tensor ) {
}
struct ggml_tensor * vall_e_get_prom_embds( llama_vall_e_userdata& userdata, int32_t idx ) {
struct ggml_tensor * VALL_E_API vall_e_get_prom_embds( llama_vall_e_userdata& userdata, int32_t idx ) {
return userdata.prom_embds[idx];
}
struct ggml_tensor * vall_e_get_resp_embds( llama_vall_e_userdata& userdata, int32_t idx ) {
struct ggml_tensor * VALL_E_API vall_e_get_resp_embds( llama_vall_e_userdata& userdata, int32_t idx ) {
return userdata.resp_embds[idx];
}
struct ggml_tensor * vall_e_get_aux_embds( llama_vall_e_userdata& userdata, int32_t idx ) {
struct ggml_tensor * VALL_E_API vall_e_get_aux_embds( llama_vall_e_userdata& userdata, int32_t idx ) {
return userdata.aux_embds[idx];
}
const embeddings_t& vall_e_inputs_map_get_embeddings( inputs_map_t& inputs_map, const std::string& name ) {
const embeddings_t& VALL_E_API vall_e_inputs_map_get_embeddings( inputs_map_t& inputs_map, const std::string& name ) {
return inputs_map.embds[name];
}
const float* vall_e_inputs_map_get_embeddings_p( inputs_map_t& inputs_map, const std::string& name ) {
const float* VALL_E_API vall_e_inputs_map_get_embeddings_p( inputs_map_t& inputs_map, const std::string& name ) {
return inputs_map.embds[name].embds.data();
}
int32_t vall_e_inputs_map_get_classifier_idx( inputs_map_t& inputs_map, const std::string& name ) {
int32_t VALL_E_API vall_e_inputs_map_get_classifier_idx( inputs_map_t& inputs_map, const std::string& name ) {
return inputs_map.embds[name].range.classifier_idx;
}
void vall_e_inputs_map_init( inputs_map_t& inputs_map, llama_model* model ) {
void VALL_E_API vall_e_inputs_map_init( inputs_map_t& inputs_map, llama_model* model ) {
auto n_embd = llama_n_embd( model );
auto n_vocab = llama_n_vocab( model );
@@ -146,7 +143,7 @@ void vall_e_inputs_map_init( inputs_map_t& inputs_map, llama_model* model ) {
}
// maps embeddings easily
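// (token id -> its row of the embedding matrix, returned as raw floats that can be fed into the batch directly)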
std::vector<std::vector<float>> map_embeddings( const std::vector<llama_token>& tokens, int n_embd, const float* embds ) {
std::vector<std::vector<float>> VALL_E_API map_embeddings( const std::vector<llama_token>& tokens, int n_embd, const float* embds ) {
std::vector<std::vector<float>> embedded( tokens.size() );
for ( auto i = 0; i < tokens.size(); ++i ) {
embedded[i].insert( embedded[i].end(), embds + (tokens[i] * n_embd), embds + ((tokens[i]+1) * n_embd) );
@@ -156,7 +153,7 @@ std::vector<std::vector<float>> map_embeddings( const std::vector<llama_token>&
// handles adding either a token OR the embedding of that token into the batch
// this really, really helps avoid needing to abuse the tokenizer
void batch_add( llama_batch& batch, llama_token id, int n_embd, const float* embds, llama_pos pos, bool output, const std::vector<llama_seq_id> & seq_ids ) {
void VALL_E_API batch_add( llama_batch& batch, llama_token id, int n_embd, const float* embds, llama_pos pos, bool output, const std::vector<llama_seq_id> & seq_ids ) {
GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");
// insert raw embedding instead
@@ -181,7 +178,7 @@ void batch_add( llama_batch& batch, llama_token id, int n_embd, const float* emb
batch.n_tokens++;
}
// reads a waveform from disk
bool read_wav_from_disk(std::string in_path, std::vector<float> & audio_arr) {
bool VALL_E_API read_wav_from_disk(std::string in_path, std::vector<float> & audio_arr) {
uint32_t channels;
uint32_t sample_rate;
drwav_uint64 total_frame_count;
@@ -209,7 +206,7 @@ bool read_wav_from_disk(std::string in_path, std::vector<float> & audio_arr) {
return true;
}
// writes a waveform to disk
void write_wav_on_disk(std::vector<float> & audio_arr, std::string dest_path) {
void VALL_E_API write_wav_on_disk(std::vector<float> & audio_arr, std::string dest_path) {
drwav_data_format format;
format.bitsPerSample = 32;
format.sampleRate = 24000;
@@ -225,7 +222,7 @@ void write_wav_on_disk(std::vector<float> & audio_arr, std::string dest_path) {
fprintf(stderr, "%s: Number of frames written = %lld.\n", __func__, frames);
}
// reads a waveform from disk then encodes it
std::vector<std::vector<int32_t>> encode_audio_from_disk( struct encodec_context* ectx, const std::string& path ) {
std::vector<std::vector<int32_t>> VALL_E_API encode_audio_from_disk( struct encodec_context* ectx, const std::string& path ) {
// read audio from disk
std::vector<float> wavform;
@@ -258,7 +255,7 @@ std::vector<std::vector<int32_t>> encode_audio_from_disk( struct encodec_context
return codes_2ds;
}
// decodes a 2D codebook into a waveform
std::vector<float> decode_audio( struct encodec_context* ectx, const std::vector<std::vector<int32_t>>& codes_2d ) {
std::vector<float> VALL_E_API decode_audio( struct encodec_context* ectx, const std::vector<std::vector<int32_t>>& codes_2d ) {
int n_codebooks = codes_2d.size();
int n_frames = codes_2d[0].size();
@@ -283,7 +280,7 @@ std::vector<float> decode_audio( struct encodec_context* ectx, const std::vector
}
// sums embeddings over a 2D "tensor"
std::vector<std::vector<float>> sum_embeddings( const std::vector<std::vector<llama_token>>& input, int n_embd, int rvq_l, const float** embds, int mode ) {
std::vector<std::vector<float>> VALL_E_API sum_embeddings( const std::vector<std::vector<llama_token>>& input, int n_embd, int rvq_l, const float** embds, int mode ) {
std::vector<std::vector<float>> res( input.size() );
res.resize( input[0].size() );
for ( auto& e : res ) e.resize( n_embd );
@@ -311,7 +308,22 @@ std::vector<std::vector<float>> sum_embeddings( const std::vector<std::vector<ll
return res;
}
void fill_batch( llama_batch& batch, input_t& input, inputs_map_t& inputs_map, int mode ) {
// naive softmax over the raw logits (shifted by the max for numerical stability)
std::vector<float> VALL_E_API soft_max( int n_logits, const float* logits ) {
std::vector<float> res( n_logits, 0.0f );
float max_logit = *std::max_element( logits, logits + n_logits );
float denom = 0.0f;
for ( auto i = 0; i < n_logits; ++i ) {
res[i] = exp( logits[i] - max_logit );
denom += res[i];
}
// to-do: assert denom != 0.0f
for ( auto i = 0; i < n_logits; ++i ) {
res[i] /= denom;
}
return res;
}
void VALL_E_API fill_batch( llama_batch& batch, input_t& input, inputs_map_t& inputs_map, int mode ) {
// keeps track of the position for each sequence
size_t pos = 0;
auto n_embd = inputs_map.n_embd;
@@ -382,48 +394,42 @@ void fill_batch( llama_batch& batch, input_t& input, inputs_map_t& inputs_map, i
}
// generation code, should handle all modalities easily
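// (modes: AR, NAR, NAR-len demasking, and duration/len inferencing)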
std::vector<llama_token> generate( llama_context* ctx, llama_model* model, llama_sampler* smpl, input_t& input, inputs_map_t& inputs_map, int max_tokens, int mode, bool verbose ) {
llama_batch batch = llama_batch_init( 22500, inputs_map.n_embd, 22500 );
// Decoding loop
const auto t_main_start = ggml_time_us();
int n_decode = 0;
std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* model, llama_sampler* smpl, input_t& input, inputs_map_t& inputs_map, int max_tokens, int mode, bool verbose ) {
int rvq_l = input.rvq_l;
llama_token stop_token = -1;
int n_decode = 0; // number of tokens decoded
int n_outputs = 0; // number of output tokens to expect
int n_vocab = 0;
int n_embd = 0;
bool causal = true; // sample autoregressively or not
const float* embds = NULL; // embeddings to map output tokens through
ranges_t range; // I/O range
// create batch (targetting embeddings instead of tokens)
llama_batch batch = llama_batch_init( CTX_SIZE, inputs_map.n_embd, CTX_SIZE );
fill_batch( batch, input, inputs_map, mode );
// determine how many logits we need
int n_logits = 0;
// determine how many outputs we need
for ( auto i = 0; i < batch.n_tokens; ++i ) {
if ( batch.logits[i] ) ++n_logits;
if ( batch.logits[i] ) ++n_outputs;
}
if ( verbose ) printf("Prompt size: %i | Outputs: %i\n", batch.n_tokens, n_logits);
if ( verbose ) printf("Prompt size: %i | Outputs: %i\n", batch.n_tokens, n_outputs);
// NAR mode, cap at one step
if ( n_logits > 1 ) {
max_tokens = n_logits;
}
if ( n_logits == 0 ) {
// bail out
if ( n_outputs == 0 ) {
fprintf(stderr, "%s : no tokens to decode\n", __func__);
return {};
}
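// a single output means we're sampling autoregressively (AR / len inferencing);
// multiple outputs mean every position gets predicted in one non-causal (NAR) pass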
causal = n_outputs == 1;
const float* embds = NULL;
ranges_t range;
// AR mode
std::string embd_name = "";
if ( mode == INFERENCE_MODE_AR ) {
auto& embeddings = vall_e_inputs_map_get_embeddings(inputs_map, "resps|AR:0:0");
range = embeddings.range;
embds = embeddings.embds.data();
stop_token = range.end - range.start - 1;
printf("Generating in %s (%i) mode (%i:%i) (%i)\n", "AR", range.classifier_idx, range.start, range.end, stop_token);
embd_name = "resps|AR:0:0";
// NAR mode
} else if ( mode == INFERENCE_MODE_NAR ) {
std::string k_embds[] = {
"resps|NAR:0:0", // invalid
"resps|NAR:0:0", // invalid, should never be picked
"resps|NAR:0:1",
"resps|NAR:1:2",
"resps|NAR:2:3",
@@ -432,88 +438,237 @@ std::vector<llama_token> generate( llama_context* ctx, llama_model* model, llama
"resps|NAR:5:6",
"resps|NAR:6:7",
};
auto& embeddings = vall_e_inputs_map_get_embeddings(inputs_map, k_embds[rvq_l]);
range = embeddings.range;
embds = embeddings.embds.data();
printf("Generating in %s (%i) mode (%i:%i)\n", "NAR", range.classifier_idx, range.start, range.end);
embd_name = k_embds[rvq_l];
// duration inferencing mode
} else if ( mode == INFERENCE_MODE_LEN ) {
auto& embeddings = vall_e_inputs_map_get_embeddings(inputs_map, "len");
range = embeddings.range;
embds = embeddings.embds.data();
stop_token = range.end - range.start - 1;
printf("Generating in %s (%i) mode (%i:%i) (%i)\n", "len", range.classifier_idx, range.start, range.end, stop_token);
embd_name = "len";
// NAR-len (demasking) inferencing mode
} else if ( mode == INFERENCE_MODE_NAR_DEMASK ) {
auto& embeddings = vall_e_inputs_map_get_embeddings(inputs_map, "resps|NAR:0:0");
range = embeddings.range;
embds = embeddings.embds.data();
printf("Generating in %s (%i) mode (%i:%i)\n", "NAR-len", range.classifier_idx, range.start, range.end);
embd_name = "resps|NAR:0:0";
}
auto& embeddings = vall_e_inputs_map_get_embeddings(inputs_map, embd_name);
range = embeddings.range;
embds = embeddings.embds.data();
n_embd = embeddings.n_embd;
n_vocab = embeddings.n_vocab;
stop_token = range.end - range.start - 1;
printf("Generating in %s (%i) mode (%i:%i) (%i)\n", embd_name.c_str(), range.classifier_idx, range.start, range.end, stop_token);
// update model's output heads / causal mode
#if LLAMA_CPP_USE_VALL_E_ARCH
auto& userdata = *llama_get_vall_e_userdata( model );
llama_set_output_head( model, userdata.heads[range.classifier_idx] );
#endif
llama_set_causal_attn( ctx, n_logits == 1 );
llama_set_causal_attn( ctx, causal );
// to-do: fix GGML_ASSERT(mask->ne[0] == a->ne[0])
std::vector<llama_token> output_tokens;
while ( output_tokens.size() < max_tokens ) {
if (llama_decode(ctx, batch)) {
const auto t_main_start = ggml_time_us();
// if INFERENCE_MODE_AR || INFERENCE_MODE_LEN
if ( causal ) {
output_tokens.reserve(max_tokens);
if ( verbose ) {
printf("[");
fflush(stdout);
}
while ( output_tokens.size() < max_tokens ) {
if ( llama_decode(ctx, batch) ) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
return output_tokens;
}
// ensures only tokens within our designated range are used
#if !LLAMA_CPP_USE_VALL_E_ARCH
auto* logits = llama_get_logits_ith( ctx, -1 );
for ( auto i = 0; i < inputs_map.n_vocab; ++i ) {
if ( i < range.start || i >= range.end ) logits[i] = -INFINITY;
}
#endif
// sample token
auto t = llama_sampler_sample(smpl, ctx, -1);
// is stop token
if ( t == stop_token ) {
break;
}
// store token
output_tokens.emplace_back(t);
// update batch with token
batch_add( batch, t, inputs_map.n_embd, embds, output_tokens.size(), true );
if ( verbose ) {
printf("%i, ", t);
fflush(stdout);
}
}
if ( verbose ) {
printf("]\n");
fflush(stdout);
}
} else if ( mode == INFERENCE_MODE_NAR_DEMASK ) {
// to-do: assert n_outputs == input.resp[rvq_l-1].size()
const llama_token MASK_TOKEN = 1024; // token value for masking
const float PI = 3.141592653589793f;
// to-do: derive from sampling arguments
int32_t steps = 30; // number of demasking steps
int32_t seq_len = n_outputs;
float temperature = 1.5f;
float cfg_strength = 2.5f;
// fill with masked tokens
output_tokens.clear();
output_tokens.resize(n_outputs, MASK_TOKEN);
// for CFG
input_t null_input{};
null_input.phn = {1, 2}; // <bos></eos>
null_input.resp.resize(1);
llama_batch null_batch = llama_batch_init( CTX_SIZE, inputs_map.n_embd, CTX_SIZE );
// token scores to reference for masking
std::vector<float> scores(n_outputs, 1.0);
// do one step on many tokens
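// MaskGIT-style demasking: each step keeps the most confident tokens, remasks the
// least confident ones according to the cosine schedule below, and resamples only
// the masked positions, applying CFG against the null (unconditional) prompt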
for ( auto step = 0; step < steps; ++step ) {
if ( verbose ) {
printf("[%i/%i] [", step, steps);
fflush(stdout);
}
float timestep = (step + 1) / float(steps); // to-do: align with torch.linspace
float annealing = 1.0f - timestep;
float noise_p = cos( timestep * PI * 0.5f );
float remask_p = 0.5f / steps;
int32_t n_masked_tokens = std::max(int(noise_p * seq_len), 1); // always (re)mask at least one token
float sampling_temperature = temperature * annealing;
float sampling_cfg_strength = timestep * cfg_strength;
std::vector<bool> is_masked(n_outputs, false);
// remask the least-confident positions: argsort the indices by score, descending
std::vector<int32_t> masked_indices( n_outputs );
for ( auto i = 0; i < n_outputs; ++i ) masked_indices[i] = i;
std::sort( masked_indices.begin(), masked_indices.end(), [&scores]( int32_t a, int32_t b ) { return scores[a] > scores[b]; } );
masked_indices.resize( n_masked_tokens );
// mask off tokens
for ( auto& idx : masked_indices ) {
output_tokens[idx] = MASK_TOKEN;
}
// update token mask
for ( auto i = 0; i < n_outputs; ++i ) {
is_masked[i] = output_tokens[i] == MASK_TOKEN;
}
// update batch
// to-do: only update the embeddings instead
batch.n_tokens = 0;
input.resp[0] = output_tokens;
fill_batch( batch, input, inputs_map, mode );
// update null batch
null_input.resp[0] = output_tokens;
null_batch.n_tokens = 0;
fill_batch( null_batch, null_input, inputs_map, mode );
// to-do: update sampling temperature
// cfg decode
if ( llama_decode(ctx, null_batch) ) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
return output_tokens;
}
// copy the null (unconditional) logits for CFG
std::vector<float> null_logits(n_outputs * n_vocab, -INFINITY);
// to-do: copy once
for ( auto idx = 0; idx < n_outputs; ++idx ) {
memcpy( &null_logits[idx * n_vocab], llama_get_logits_ith( ctx, null_batch.n_tokens - n_outputs + idx ), sizeof(float) * n_vocab );
}
// decode
if ( llama_decode(ctx, batch) ) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
return output_tokens;
}
// to-do: figure out why all logits are the same for each token......
// the output logits sit at the tail of the batch, so index them relative to batch.n_tokens
for ( auto idx = 0; idx < n_outputs; ++idx ) {
// skip if not masked
if ( !is_masked[idx] )
continue;
// ensures only tokens within our designated range are used
auto* logits = llama_get_logits_ith( ctx, batch.n_tokens - n_outputs + idx );
auto* null_logit = &null_logits[idx * n_vocab];
#if !LLAMA_CPP_USE_VALL_E_ARCH
for ( auto i = 0; i < inputs_map.n_vocab; ++i ) {
if ( i < range.start || i >= range.end ) logits[i] = -INFINITY;
}
#endif
// perform softmax before modifying logits
std::vector<float> softmaxed = soft_max( n_vocab, logits );
// perform CFG sampling
for ( auto i = 0; i < n_vocab; ++i ) {
logits[i] = null_logit[i] + (logits[i] - null_logit[i]) * sampling_cfg_strength;
}
// sample ith token
auto t = llama_sampler_sample(smpl, ctx, batch.n_tokens - n_outputs + idx );
// store token if it was masked
output_tokens[idx] = t;
// update score if it was masked
scores[idx] = 1.0f - softmaxed[t]; // invert so we pick the worst tokens later
if ( verbose ) {
printf("%i, ", t);
fflush(stdout);
}
}
if ( verbose ) {
printf("\n");
fflush(stdout);
}
}
} else if ( mode == INFERENCE_MODE_NAR ) {
// to-do: assert n_outputs == input.resp[rvq_l-1].size()
output_tokens.reserve(n_outputs);
// do one step on many tokens
if ( llama_decode(ctx, batch) ) {
fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
return output_tokens;
}
std::vector<llama_token> current_tokens;
// backwards iterate to start from beginning of sequence
for ( auto i = n_logits; i > 0; --i ) {
// filter logits
auto* logits = llama_get_logits_ith( ctx, -i );
// ensures only tokens within our designated range are used
// to-do: figure out why all logits are the same for each token......
// "reverse" iterate from backwards indexing
if ( verbose ) {
printf("[");
fflush(stdout);
}
for ( auto idx = 0; idx < n_outputs; ++idx ) {
// ensures only tokens within our designated range are used
#if !LLAMA_CPP_USE_VALL_E_ARCH
auto* logits = llama_get_logits_ith( ctx, batch.n_tokens - n_outputs + idx );
for ( auto i = 0; i < inputs_map.n_vocab; ++i ) {
if ( i < range.start || i >= range.end ) logits[i] = -INFINITY;
}
#endif
// sample the next token
printf("%i: %p\n [", -i, logits );
for ( auto i = 0; i < 1025; ++i ) {
printf("%f, ", logits[i]);
}
printf("]\n");
auto t = llama_sampler_sample(smpl, ctx, -i);
//printf("%i: [%i]: %f | %p\n", -i, t, logits[t], logits );
// offset back into range
#if !LLAMA_CPP_USE_VALL_E_ARCH
t -= range.start;
#endif
n_decode += 1;
// is stop token
if ( t == stop_token ) {
printf("STOPPED\n");
max_tokens = 0;
break;
}
// sample ith token
auto t = llama_sampler_sample(smpl, ctx, batch.n_tokens - n_outputs + idx);
// store token
current_tokens.emplace_back(t);
// update batch with token
batch_add( batch, t, inputs_map.n_embd, embds, output_tokens.size(), true );
output_tokens.emplace_back(t);
if ( verbose ) {
printf("%i, ", t);
fflush(stdout);
}
}
printf("%s: Tokens: [", __func__);
for ( auto& token : current_tokens ) {
printf("%i, ", token);
if ( verbose ) {
printf("]\n");
fflush(stdout);
}
printf("]\n");
output_tokens.insert(output_tokens.end(), current_tokens.begin(), current_tokens.end());
}
const auto t_main_end = ggml_time_us();
if ( verbose ) {
@@ -535,7 +690,7 @@ std::vector<llama_token> generate( llama_context* ctx, llama_model* model, llama
int main( int argc, char** argv ) {
// to-do: replace all of this with proper loading code
int32_t ngl = 0;
int modality = MODALITY_AR_NAR;
int modality = MODALITY_NAR_LEN;
input_t input{};
inputs_map_t inputs_map{};
@@ -632,7 +787,7 @@ int main( int argc, char** argv ) {
// NAR-len demasking
if ( modality == MODALITY_NAR_LEN ) {
// inference len
int len = 0;
int len = 290;
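// a non-zero len skips the duration inferencing below and fixes the output length directly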
if ( !len ) {
input.task = "len";
output_tokens = generate( ctx, model, smpl_nar, input, inputs_map, 5, INFERENCE_MODE_LEN );

View File

@@ -1,23 +1,24 @@
#pragma once
#include "llama-vocab.h"
#include "llama.h"
#include "encodec.h"
#include "dr_wav.h"
#include <cmath>
#include <cstdio>
#include <cstring>
#include <string>
#include <vector>
#include <array>
#include <unordered_map>
#include <iostream>
// to-do: copy over import/export stuff from engine project (because I don't remember how I set it up in <uf/config.h>)
#define VALL_E_API
#define LLAMA_CPP_EXTENDED 1 // whether the underlying llama.cpp has some extra functions
#define LLAMA_CPP_USE_VALL_E_ARCH 1 // whether the underlying llama.cpp is to use the VALL_E arch (or using LLAMA arch)
#if !LLAMA_CPP_EXTENDED
#include "_llama.h" // cringe hotfix but I have to do this until llama.cpp's API exposes the tok_embd
#endif
// to-do: clean up spaghetti enums
const int EMBEDDING_MODE_PROM = 0;
const int EMBEDDING_MODE_RESP_AR_NAR = 1;
@@ -106,6 +107,7 @@ struct inputs_map_t {
std::vector<float> VALL_E_API read_2d_tensor( struct ggml_tensor* tensor );
std::vector<std::vector<float>> VALL_E_API map_embeddings( const std::vector<llama_token>& tokens, int n_embd, const float* embds );
std::vector<std::vector<float>> VALL_E_API sum_embeddings( const std::vector<std::vector<llama_token>>& input, int n_embd, int rvq_l, const float** embds, int mode = EMBEDDING_MODE_PROM );
std::vector<float> VALL_E_API soft_max( int n_logits, const float* logits );
// batch and inferencing
void VALL_E_API batch_add( llama_batch& batch, llama_token id, int n_embd, const float* embds, llama_pos pos, bool output, const std::vector<llama_seq_id> & seq_ids = {0} );