nvm fixed

parent f62f99b8de
commit 40c101a256
@@ -2,7 +2,7 @@
 This is an implementation that makes use of [llama.cpp](https://github.com/ggerganov/llama.cpp/) and [encodec.cpp](https://github.com/PABannier/encodec.cpp).
 
-At the moment it's ***very*** barebones as I try and wrestle with `llama.cpp`'s API without needing to modify its code.
+At the moment it's ***very*** work in progress.
 
 ## Build
@@ -22,7 +22,6 @@ Run `make`.
 
 * [x] converted model to GGUF
 * [ ] convert it without modifying any of the existing code, as the tokenizer requires some care
-* [ ] *actually* convert the model properly, as the embeddings differ from the real model
 * [x] basic framework
 * [x] load the quantized model
 * [x] orchestrate the required embeddings
@@ -40,10 +40,9 @@ std::vector<float> VALL_E_API read_2d_tensor( struct ggml_tensor* tensor ) {
 	size_t size = tensor->ne[0] * tensor->ne[1];
 	std::vector<float> res( size );
 
-	auto* qtype = ggml_get_type_traits(tensor->type);
-
-	if ( ggml_is_quantized(tensor->type) ) {
-		qtype->to_float(tensor->data, res.data(), res.size());
+	auto* type_trait = ggml_get_type_traits(tensor->type);
+	// dequantize if needed
+	if ( type_trait->to_float ) {
+		type_trait->to_float(tensor->data, res.data(), res.size());
 	} else {
 		memcpy( res.data(), tensor->data, res.size() * sizeof(float) );
 	}
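For reference, a self-contained sketch of the dequantization path after this hunk (hypothetical helper name, not a verbatim copy of the repo's file). The point of the change: `ggml_is_quantized()` is false for F16/BF16, so such tables were previously `memcpy`'d as raw half-precision bits; checking whether the type provides a `to_float` converter covers quantized *and* half-precision tensors, which is presumably why the embeddings no longer "differ from the real model".

```cpp
#include <cstring>
#include <vector>
#include <ggml.h>

// Sketch: read a contiguous 2D ggml tensor into f32,
// converting via the type's to_float trait when one exists.
std::vector<float> read_2d_tensor_f32( struct ggml_tensor* tensor ) {
	size_t size = tensor->ne[0] * tensor->ne[1];
	std::vector<float> res( size );

	auto* type_trait = ggml_get_type_traits( tensor->type );
	if ( type_trait->to_float ) {
		// covers quantized types *and* F16/BF16, unlike ggml_is_quantized()
		type_trait->to_float( tensor->data, res.data(), res.size() );
	} else {
		// already F32: raw copy
		memcpy( res.data(), tensor->data, res.size() * sizeof(float) );
	}
	return res;
}
```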
@@ -168,15 +167,16 @@ void VALL_E_API vall_e_inputs_map_init( io_map_t& io_map, llama_model* model ) {
 	io_map.io["resps|NAR:0:0"].embds = read_2d_tensor(vall_e_get_resp_embds(userdata, 8));
 
+	/*
 	for ( auto& entry : io_ranges ) {
 		for ( auto i = 0; i < 32; ++i ) printf("%s: %i: %f\n", entry.name.c_str(), i, io_map.io[entry.name].embds[i] );
 	}
+	*/
 #else
 	auto* embds = llama_get_embedding_weights( model );
 	auto* heads = llama_get_output_head_tensor( model );
 
 	// prepare slices
-	// std::vector<float> raw_embeddings = read_2d_tensor( embds );
 	for ( auto& entry : io_ranges ) {
 		io_map.io[entry.name] = entry;
@@ -184,16 +184,6 @@ void VALL_E_API vall_e_inputs_map_init( io_map_t& io_map, llama_model* model ) {
 		io_map.io[entry.name].n_vocab = entry.end - entry.start;
 		io_map.io[entry.name].embds = read_2d_tensor(view_2d_tensor( io_map.ctx, embds, entry.start, entry.end ));
 		io_map.io[entry.name].head = entry.head_idx < 0 ? NULL : view_2d_tensor( io_map.ctx, heads, entry.start, entry.end );
-
-		// these two differ after the first embedding and I don't know why.........
-		/*
-		auto raw_embd = std::vector<float>( raw_embeddings.data() + entry.start * n_embd, raw_embeddings.data() + entry.end * n_embd );
-		auto sliced_embd = read_2d_tensor( embd_tensor );
-
-		io_map.io[entry.name].embds = raw_embd;
-
-		for ( auto i = 0; i < 32; ++i ) printf("%s: %i: %f == %f \n", entry.name.c_str(), i, raw_embd[i], sliced_embd[i] );
-		*/
 	}
 #endif
 }
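The loop above slices each IO range's rows out of the combined embedding table and output head via `view_2d_tensor()`. That helper's implementation is not part of this diff; below is a plausible sketch of such a row-range view in terms of plain ggml. The name and the layout assumptions (rows of width `ne[0]`, row stride `nb[1]` bytes) are mine, not the repo's.

```cpp
#include <ggml.h>

// Hypothetical: return a view over rows [start, end) of a 2D tensor,
// keeping the full row width and the original row stride.
static struct ggml_tensor* view_rows_2d( struct ggml_context* ctx, struct ggml_tensor* t, int64_t start, int64_t end ) {
	return ggml_view_2d( ctx, t,
		t->ne[0],            // row width (e.g. n_embd)
		end - start,         // number of rows in the slice
		t->nb[1],            // byte stride between rows, unchanged
		start * t->nb[1] );  // byte offset of the first selected row
}
```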
@@ -546,6 +536,7 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
 			printf("]\n");
 			fflush(stdout);
 		}
+		llama_kv_cache_clear(ctx);
 	} else if ( mode == INFERENCE_MODE_NAR_DEMASK ) {
 		// to-do: assert n_outputs == input.resp[rvq_l-1].size()
 		const llama_token MASK_TOKEN = 1024; // token value for masking
@@ -657,11 +648,14 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
 					printf("%i, ", t);
 					fflush(stdout);
 				}
 
 			}
 			if ( verbose ) {
 				printf("\n");
 				fflush(stdout);
 			}
 
+			llama_kv_cache_clear(ctx);
 		}
 	} else if ( mode == INFERENCE_MODE_NAR ) {
 		// to-do: assert n_outputs == input.resp[rvq_l-1].size()
@@ -688,6 +682,7 @@ std::vector<llama_token> VALL_E_API generate( llama_context* ctx, llama_model* m
 			fflush(stdout);
 		}
 	}
+	llama_kv_cache_clear(ctx);
 	if ( verbose ) {
 		printf("]\n");
 		fflush(stdout);
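The last three hunks all do the same thing: once a mode's decode loop finishes, the context's KV cache is cleared, presumably so the following pass (AR, then NAR-demask, then NAR) starts from an empty cache instead of attending over the previous pass's entries. A minimal sketch of the pattern, using the llama.cpp API as it existed around this commit; the helper and its error handling are illustrative, not the repo's code.

```cpp
#include <cstdio>
#include <llama.h>

// Illustrative: decode one batch for the current inference mode,
// then drop all cached keys/values before the next mode runs.
static void decode_pass_then_clear( llama_context* ctx, llama_batch batch ) {
	if ( llama_decode( ctx, batch ) != 0 ) {
		fprintf( stderr, "llama_decode() failed\n" );
		return;
	}
	// ... sample this pass's outputs from the logits here ...
	llama_kv_cache_clear( ctx ); // fresh KV state for the next pass
}
```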