From 40c101a25649b022de8a29d0fe37fee6c24176a8 Mon Sep 17 00:00:00 2001
From: mrq
Date: Mon, 23 Dec 2024 21:04:22 -0600
Subject: [PATCH] nvm fixed

---
 vall_e.cpp/README.md  |  3 +--
 vall_e.cpp/vall_e.cpp | 25 ++++++++++---------------
 2 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/vall_e.cpp/README.md b/vall_e.cpp/README.md
index 916df18..c8158d3 100644
--- a/vall_e.cpp/README.md
+++ b/vall_e.cpp/README.md
@@ -2,7 +2,7 @@
 
 This is an implementation that makes use of [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [encodec.cpp](https://github.com/PABannier/encodec.cpp).
 
-At the moment it's ***very*** barebones as I try and wrestle with `llama.cpp`'s API without needing to modify its code.
+At the moment it's ***very*** work in progress.
 
 ## Build
 
@@ -22,7 +22,6 @@ Run `make`.
 
 * [x] converted model to GGUF
   * [ ] convert it without modifying any of the existing code, as the tokenizer requires some care
-  * [ ] *actually* convert the model properly, as the embeddings differ from the real model
 * [x] basic framework
   * [x] load the quantized model
   * [x] orchestrate the required embeddings
diff --git a/vall_e.cpp/vall_e.cpp b/vall_e.cpp/vall_e.cpp
index 0fbdd0f..f905c06 100644
--- a/vall_e.cpp/vall_e.cpp
+++ b/vall_e.cpp/vall_e.cpp
@@ -40,10 +40,9 @@ std::vector<float> VALL_E_API read_2d_tensor( struct ggml_tensor* tensor ) {
 	size_t size = tensor->ne[0] * tensor->ne[1];
 	std::vector<float> res( size );
 
-	auto* qtype = ggml_get_type_traits(tensor->type);
-	// dequantize if needed
-	if ( ggml_is_quantized(tensor->type) ) {
-		qtype->to_float(tensor->data, res.data(), res.size());
+	auto* type_trait = ggml_get_type_traits(tensor->type);
+	if ( type_trait->to_float ) {
+		type_trait->to_float(tensor->data, res.data(), res.size());
 	} else {
 		memcpy( res.data(), tensor->data, res.size() * sizeof(float) );
 	}
@@ -168,15 +167,16 @@ void VALL_E_API vall_e_inputs_map_init( io_map_t& io_map, llama_model* model ) {
 
 	io_map.io["resps|NAR:0:0"].embds = read_2d_tensor(vall_e_get_resp_embds(userdata, 8));
 
+	/*
 	for ( auto& entry : io_ranges ) {
 		for ( auto i = 0; i < 32; ++i ) printf("%s: %i: %f\n", entry.name.c_str(), i, io_map.io[entry.name].embds[i] );
 	}
+	*/
 #else
 	auto* embds = llama_get_embedding_weights( model );
 	auto* heads = llama_get_output_head_tensor( model );
 
 	// prepare slices
-	// std::vector<float> raw_embeddings = read_2d_tensor( embds );
 	for ( auto& entry : io_ranges ) {
 		io_map.io[entry.name] = entry;
 
@@ -184,16 +184,6 @@ void VALL_E_API vall_e_inputs_map_init( io_map_t& io_map, llama_model* model ) {
 		io_map.io[entry.name].n_vocab = entry.end - entry.start;
 		io_map.io[entry.name].embds = read_2d_tensor(view_2d_tensor( io_map.ctx, embds, entry.start, entry.end ));
 		io_map.io[entry.name].head = entry.head_idx < 0 ? NULL : view_2d_tensor( io_map.ctx, heads, entry.start, entry.end );
-
-		// these two differ after the first embedding and I don't know why.........
-		/*
-		auto raw_embd = std::vector<float>( raw_embeddings.data() + entry.start * n_embd, raw_embeddings.data() + entry.end * n_embd );
-		auto sliced_embd = read_2d_tensor( embd_tensor );
-
-		io_map.io[entry.name].embds = raw_embd;
-
-		for ( auto i = 0; i < 32; ++i ) printf("%s: %i: %f == %f \n", entry.name.c_str(), i, raw_embd[i], sliced_embd[i] );
-		*/
 	}
 #endif
 }
@@ -546,6 +536,7 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
 			printf("]\n");
 			fflush(stdout);
 		}
+		llama_kv_cache_clear(ctx);
 	} else if ( mode == INFERENCE_MODE_NAR_DEMASK ) {
 		// to-do: assert n_outputs == input.resp[rvq_l-1].size()
 		const llama_token MASK_TOKEN = 1024; // token value for masking
@@ -657,11 +648,14 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
 					printf("%i, ", t);
 					fflush(stdout);
 				}
+			}
 
 			if ( verbose ) {
 				printf("\n");
 				fflush(stdout);
 			}
+
+			llama_kv_cache_clear(ctx);
 		}
 	} else if ( mode == INFERENCE_MODE_NAR ) {
 		// to-do: assert n_outputs == input.resp[rvq_l-1].size()
@@ -688,6 +682,7 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
 				fflush(stdout);
 			}
 		}
+		llama_kv_cache_clear(ctx);
 		if ( verbose ) {
 			printf("]\n");
 			fflush(stdout);
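
For context on the `read_2d_tensor` change in the first `vall_e.cpp` hunk: below is a minimal standalone sketch of the dequantization pattern the patch settles on, assuming a recent ggml where `ggml_get_type_traits()` returns a pointer to the type's traits and `to_float` is only populated for types that need conversion (quantized and F16). The helper name and the includes are illustrative, not part of this patch.

```cpp
#include <cstring>
#include <vector>

#include "ggml.h"

// Illustrative helper (not from the patch): flatten a 2D ggml tensor into floats.
std::vector<float> read_tensor_as_floats( struct ggml_tensor* tensor ) {
	size_t size = tensor->ne[0] * tensor->ne[1];
	std::vector<float> res( size );

	auto* type_trait = ggml_get_type_traits( tensor->type );
	if ( type_trait->to_float ) {
		// any type that ships a converter (quantized, F16, ...) is dequantized here,
		// which covers more cases than the old ggml_is_quantized() check
		type_trait->to_float( tensor->data, res.data(), res.size() );
	} else {
		// plain F32 data has no converter; the raw bytes are already floats
		memcpy( res.data(), tensor->data, res.size() * sizeof(float) );
	}
	return res;
}
```

The other recurring addition, `llama_kv_cache_clear(ctx)`, is the stock llama.cpp call that wipes the context's KV cache; the patch places it after each decode pass, presumably so the AR, NAR-demask, and NAR paths don't reuse each other's cached state.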