From 503124d0d30f382838a551ae8acc20c12fa921ff Mon Sep 17 00:00:00 2001
From: mrq
Date: Sat, 21 Dec 2024 15:48:12 -0600
Subject: [PATCH] crammed encodec.cpp in

---
 .gitignore            |   7 ++-
 vall_e.cpp/Makefile   |  22 +++++++++
 vall_e.cpp/README.md  |   8 +++-
 vall_e.cpp/vall_e.cpp | 101 ++++++++++++++++++++++++++++++++++++++----
 4 files changed, 127 insertions(+), 11 deletions(-)
 create mode 100644 vall_e.cpp/Makefile

diff --git a/.gitignore b/.gitignore
index 6431992..b6b1a4a 100755
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,9 @@ __pycache__
 /.cache
 /voices
 /wandb
-/.nltk
\ No newline at end of file
+/.nltk
+/vall_e.cpp/data
+/vall_e.cpp/include
+/vall_e.cpp/libs
+/vall_e.cpp/*.o
+/vall_e.cpp/vall_e
\ No newline at end of file
diff --git a/vall_e.cpp/Makefile b/vall_e.cpp/Makefile
new file mode 100644
index 0000000..77ca38c
--- /dev/null
+++ b/vall_e.cpp/Makefile
@@ -0,0 +1,22 @@
+CXX = g++
+
+INCS += -I./include
+LIBS += -L./libs
+
+LINKS += -lggml -lggml-base -lllama -lencodec
+FLAGS += -g
+
+SRCS := $(shell find ./ -name "*.cpp")
+OBJS += $(patsubst %.cpp,%.o,$(SRCS))
+
+TARGET = vall_e
+
+%.o: %.cpp
+	$(CXX) $(FLAGS) $(INCS) -c $< -o $@
+
+$(TARGET): $(OBJS)
+	$(CXX) $(FLAGS) $(OBJS) $(LIBS) $(INCS) $(LINKS) -o $(TARGET)
+
+clean:
+	@-rm -f $(OBJS)
+	@-rm -f $(TARGET)
\ No newline at end of file
diff --git a/vall_e.cpp/README.md b/vall_e.cpp/README.md
index cd8611e..443fe45 100644
--- a/vall_e.cpp/README.md
+++ b/vall_e.cpp/README.md
@@ -6,9 +6,13 @@ At the moment it's ***very*** barebones as I try and wrestle with `llama.cpp`'s
 
 ## Build
 
-Probably something like:
+Populate `./include/` with the `llama.cpp` and `encodec.cpp` headers.
 
-`g++ -I/path/to/llama.cpp/include/ -L/path/to/llama.cpp/libllama.so -lggml -lggml-base -lllama -o ./vall_e`
+Populate `./libs/` with the compiled libraries of `llama.cpp` and `encodec.cpp`.
+* `encodec.cpp` requires updating `ggml` to the latest version and doing a quick hack to make it work on the CPU backend.
+* `llama.cpp` currently requires no hacks, but would be *very* nice to hack in a way to retrieve a model's `tok_embd`.
+
+Run `make`.
 
 ## To-Do
 
diff --git a/vall_e.cpp/vall_e.cpp b/vall_e.cpp/vall_e.cpp
index bb8e540..c11651b 100644
--- a/vall_e.cpp/vall_e.cpp
+++ b/vall_e.cpp/vall_e.cpp
@@ -1,5 +1,9 @@
 #include "llama-vocab.h"
 #include "llama.h"
+#include "encodec.h"
+
+#define DR_WAV_IMPLEMENTATION
+#include "dr_wav.h"
 
 #include <string>
 #include <vector>
@@ -140,9 +144,7 @@ void batch_add( struct llama_batch& batch, llama_token id, int n_embd, float* em
 	GGML_ASSERT(batch.seq_id[batch.n_tokens] && "llama_batch size exceeded");
 
 	if ( embds ) {
-		for ( auto i = 0; i < n_embd; ++i ) {
-			batch.embd[batch.n_tokens + i] = embds[id * n_embd + i];
-		}
+		for ( auto i = 0; i < n_embd; ++i ) batch.embd[batch.n_tokens + i] = embds[id * n_embd + i];
 	} else {
 		batch.token[batch.n_tokens] = id;
 	}
@@ -156,6 +158,77 @@ void batch_add( struct llama_batch& batch, llama_token id, int n_embd, float* em
 	batch.n_tokens++;
 }
 
+bool read_wav_from_disk(std::string in_path, std::vector<float> & audio_arr) {
+	uint32_t channels;
+	uint32_t sample_rate;
+	drwav_uint64 total_frame_count;
+
+	float * raw_audio = drwav_open_file_and_read_pcm_frames_f32(
+		in_path.c_str(), &channels, &sample_rate, &total_frame_count, NULL);
+
+	if (raw_audio == NULL) {
+		fprintf(stderr, "%s: could not read wav file\n", __func__);
+		return false;
+	}
+
+	fprintf(stderr, "\n%s: Number of frames read = %lld.\n", __func__, total_frame_count);
+
+	audio_arr.resize(total_frame_count);
+	memcpy(audio_arr.data(), raw_audio, total_frame_count * sizeof(float));
+
+	drwav_free(raw_audio, NULL);
+
+	return true;
+}
+
+void write_wav_on_disk(std::vector<float> & audio_arr, std::string dest_path) {
+	drwav_data_format format;
+	format.bitsPerSample = 32;
+	format.sampleRate = 24000;
+	format.container = drwav_container_riff;
+	format.channels = 1;
+	format.format = DR_WAVE_FORMAT_IEEE_FLOAT;
+
+	drwav wav;
+	drwav_init_file_write(&wav, dest_path.c_str(), &format, NULL);
+	drwav_uint64 frames = drwav_write_pcm_frames(&wav, audio_arr.size(), audio_arr.data());
+	drwav_uninit(&wav);
+
+	fprintf(stderr, "%s: Number of frames written = %lld.\n", __func__, frames);
+}
+
+std::vector<int32_t> encode_audio( struct encodec_context* ectx, const std::string& path ) {
+	// read audio from disk
+	std::vector<float> wavform;
+
+	if(!read_wav_from_disk(path, wavform)) {
+		printf("%s: error during reading wav file\n", __func__);
+		return {};
+	}
+
+	// compress audio
+	if (!encodec_compress_audio(ectx, wavform.data(), wavform.size(), 1)) {
+		printf("%s: error during compression \n", __func__);
+		return {};
+	}
+
+	int32_t* codes_data = encodec_get_codes( ectx );
+	int n_codes = encodec_get_codes_size( ectx );
+
+	return std::vector<int32_t>(codes_data, codes_data + n_codes);
+}
+std::vector<float> decode_audio( struct encodec_context* ectx, const std::vector<int32_t>& codes ) {
+	// decompress audio
+	if (!encodec_decompress_audio(ectx, codes.data(), codes.size(), 1)) {
+		printf("%s: error during decompression\n", __func__);
+		return {};
+	}
+
+	// write reconstructed audio on disk
+	const float* audio_data = encodec_get_audio(ectx);
+	const int audio_size = encodec_get_audio_size(ectx);
+	return std::vector<float>(audio_data, audio_data + audio_size);
+}
 
 int main(int argc, char ** argv) {
 	bool is_ar = true;
 	// to-do: replace all of this with proper loading code
@@ -168,16 +241,27 @@ int main(int argc, char ** argv) {
 	std::vector<std::vector<llama_token>> response_tokens = {
 		{922,395,869,869,354,989,762,762,762,610,975,626,626,866,609,442,762,762,762,610,610,610,610,212,869,869,51,336,352,352,352,570,148,893,76,535,568,568,270,568,568,560,597,86,744,744,744,203,738,408,1019,700,707,92,707,464,744,171,171,159,196,192,697,261,261,568,638,605,904,904,779,832,570,519,223,459,459,459,459,90,90,570,700,53,372,621,610,869,473,869,917,654,473,917,893,654,644,384,558,911,864,521,1,19,665},
 	};
 
-	std::string model_path = "./vall_e/Vall_E-238M-Q8_0.gguf";
+	std::string vall_e_model_path = "./data/vall_e-q8_0.gguf";
+	std::string encodec_model_path = "./data/encodec.bin";
+	int32_t ngl = 0;
+
 	// load dynamic backends
 	ggml_backend_load_all();
 
-	// initialize the model
-	llama_model_params model_params = llama_model_default_params();
-	model_params.n_gpu_layers = 0;
+	struct encodec_context* ectx = encodec_load_model(encodec_model_path.c_str(), 0, ngl);
+	if (!ectx) {
+		printf("%s: error during loading model\n", __func__);
+		return 1;
+	}
+
+	encodec_set_target_bandwidth(ectx, 24);
 
-	llama_model* model = llama_load_model_from_file(model_path.c_str(), model_params);
+	// initialize the models
+	llama_model_params model_params = llama_model_default_params();
+	model_params.n_gpu_layers = ngl;
+
+	llama_model* model = llama_load_model_from_file(vall_e_model_path.c_str(), model_params);
 	if (model == NULL) {
 		fprintf(stderr , "%s: error: unable to load model\n" , __func__);
 		return 1;
@@ -358,6 +442,7 @@
 	llama_perf_context_print(ctx);
 	fprintf(stderr, "\n");
 
+	// encodec_free(ectx);
 	llama_sampler_free(smpl);
 	llama_free(ctx);
 	llama_free_model(model);
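
For reference, the `make` build above boils down to roughly the following one-liner. This is a sketch, not part of the commit: it assumes the `./include/` and `./libs/` layout the Makefile expects, so adjust the paths to wherever your `llama.cpp` and `encodec.cpp` artifacts actually live:

    g++ -g vall_e.cpp -I./include -L./libs -lggml -lggml-base -lllama -lencodec -o vall_e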
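
The helpers added to vall_e.cpp compose into a wav -> codes -> wav round-trip. Below is a minimal sketch under the patch's own API, not part of the commit: the model and wav paths are placeholders, `encode_audio`, `decode_audio`, and `write_wav_on_disk` are the functions defined in the diff above, and `encodec_free` is the teardown call left commented out at the end of main():

    #include "encodec.h"
    #include <string>
    #include <vector>

    // defined in vall_e.cpp by this patch
    std::vector<int32_t> encode_audio( struct encodec_context* ectx, const std::string& path );
    std::vector<float> decode_audio( struct encodec_context* ectx, const std::vector<int32_t>& codes );
    void write_wav_on_disk( std::vector<float> & audio_arr, std::string dest_path );

    int main() {
        // 0 = no byte offset into the weights file, 0 = no GPU layers (CPU backend)
        struct encodec_context* ectx = encodec_load_model( "./data/encodec.bin", 0, 0 );
        if ( !ectx ) return 1;

        // same bandwidth main() selects in the diff above
        encodec_set_target_bandwidth( ectx, 24 );

        // wav -> EnCodec codes -> wav
        std::vector<int32_t> codes = encode_audio( ectx, "./input.wav" ); // placeholder input path
        std::vector<float> wav = decode_audio( ectx, codes );
        write_wav_on_disk( wav, "./output.wav" ); // placeholder output path

        encodec_free( ectx );
        return 0;
    }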