vall_e.cpp phonemizing and tokenizing

2024-12-24 22:39:32 -06:00 · 2024-12-24 22:39:32 -06:00 · 6bf59bbd8b
commit 6bf59bbd8b
parent 8516bab15c
36 changed files with 16032 additions and 86 deletions
--- a/vall_e.cpp/Makefile
+++ b/vall_e.cpp/Makefile
@ -3,7 +3,7 @@ CXX 					= g++
 INCS 					+= -I./include
 LIBS 					+= -L./libs
 	
-LINKS 					+= -lggml -lggml-base -lllama -lencodec
+LINKS 					+= -lggml -lggml-base -lllama -lencodec -lespeak-ng
 FLAGS 					+= -march=native -O3

 SRCS 					:= $(shell find ./ -name "*.cpp")
--- a/vall_e.cpp/README.md
+++ b/vall_e.cpp/README.md
@ -8,7 +8,7 @@ At the moment it's ***very*** work in progress.

 Populate `./include/` with the `ggml`, `llama.cpp`, and `encodec.cpp` headers.

-Populate `./libs/` with the compiled libraries of `llama.cpp` and `encodec.cpp`.
+Populate `./libs/` with the compiled libraries of `llama.cpp`, `encodec.cpp`, and `espeak-ng`.

 Run `make`.

@ -26,10 +26,10 @@ Run `make`.
 	* [x] load the quantized model
 	* [x] orchestrate the required embeddings
 	* [x] juggle the output head / classifier properly
-* [ ] phonemize text
+* [x] phonemize text
 	* with the help of espeak-ng
-* [ ] tokenize phonemes
-	* the tokenizer is being a huge thorn on actual sequences
+* [x] tokenize phonemes
+	* tokenize with `llama_tokenize` instead of a homebrewed method because the tokenizer is being a huge thorn
 * [x] load audio from disk
 * [x] encode audio
 * [x] sum embeddings for the `prom` and prior `resp`s
--- a/vall_e.cpp/include/decoder.h
+++ b/vall_e.cpp/include/decoder.h
@ -0,0 +1,113 @@
+#pragma once
+
+#include <vector>
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+
+#include "lstm.h"
+#include "utils.h"
+
+// Weights and biases for one decoder block; encodec_forward_decoder applies them as: upsample, shortcut conv, conv1, conv2, then main + shortcut.
+struct encodec_decoder_block {
+    // upsampling layer (consumed by strided_conv_transpose_1d): weight and bias
+    struct ggml_tensor *us_conv_w;
+    struct ggml_tensor *us_conv_b;
+
+    // conv1: first convolution of the main branch, weight and bias
+    struct ggml_tensor *conv_1_w;
+    struct ggml_tensor *conv_1_b;
+
+    // conv2: second convolution of the main branch, weight and bias
+    struct ggml_tensor *conv_2_w;
+    struct ggml_tensor *conv_2_b;
+
+    // shortcut: convolution on the skip path that is added to the conv2 output
+    struct ggml_tensor *conv_sc_w;
+    struct ggml_tensor *conv_sc_b;
+};
+// All decoder parameters, listed in the order encodec_forward_decoder consumes them except for `blocks`, which sit between the LSTM and the final conv.
+struct encodec_decoder {
+    struct ggml_tensor *init_conv_w; // first convolution applied to the quantized input
+    struct ggml_tensor *init_conv_b;
+
+    encodec_lstm lstm; // encodec_forward_decoder reads the l0_* and l1_* members (two layers)
+
+    struct ggml_tensor *final_conv_w; // last convolution, producing the returned tensor
+    struct ggml_tensor *final_conv_b;
+
+    std::vector<encodec_decoder_block> blocks; // encodec_forward_decoder indexes blocks[0..3] without checking size()
+};
+// Runs the decoder on quantized_out with ops issued against ctx0: init conv -> 2-layer LSTM with skip add -> 4 upsampling residual blocks -> ELU -> final conv. Returns NULL (after logging to stderr) when quantized_out is null.
+struct ggml_tensor *encodec_forward_decoder(
+    const struct encodec_decoder *decoder, struct ggml_context *ctx0,
+    struct ggml_tensor *quantized_out, const int *ratios, const int kernel_size, const int res_kernel_size, // NOTE(review): kernel_size and res_kernel_size are never read in this body
+    const int stride) { // NOTE(review): non-inline definition in a header; including it from more than one translation unit would yield duplicate symbols
+
+    if (!quantized_out) {
+        fprintf(stderr, "%s: null input tensor\n", __func__);
+        return NULL;
+    }
+
+    struct ggml_tensor *inpL = strided_conv_1d(
+        ctx0, quantized_out, decoder->init_conv_w, decoder->init_conv_b, stride);
+
+    // lstm: two stacked layers whose result is added back onto the LSTM input
+    {
+        struct ggml_tensor *cur = inpL;
+
+        const encodec_lstm lstm = decoder->lstm; // by-value copy of the lstm struct
+
+        // first lstm layer (l0 weights/biases), fed with the init conv output
+        char l0_prefix[7] = "dec_l0"; // 6 characters plus the terminating NUL
+        struct ggml_tensor *hs1 = forward_pass_lstm_unilayer(
+            ctx0, cur, lstm.l0_ih_w, lstm.l0_hh_w, lstm.l0_ih_b, lstm.l0_hh_b, l0_prefix);
+
+        // second lstm layer (l1 weights/biases), fed with the first layer's output
+        char l1_prefix[7] = "dec_l1"; // 6 characters plus the terminating NUL
+        struct ggml_tensor *out = forward_pass_lstm_unilayer(
+            ctx0, hs1, lstm.l1_ih_w, lstm.l1_hh_w, lstm.l1_ih_b, lstm.l1_hh_b, l1_prefix);
+
+        inpL = ggml_add(ctx0, inpL, out); // skip connection around both LSTM layers
+    }
+
+    for (int layer_ix = 0; layer_ix < 4; layer_ix++) { // NOTE(review): hard-coded 4; neither decoder->blocks.size() nor the length of ratios is checked
+        encodec_decoder_block block = decoder->blocks[layer_ix]; // by-value copy of the block (pointer members only)
+
+        // upsampling: ELU, then transposed conv parameterised by ratios[layer_ix]
+        inpL = ggml_elu(ctx0, inpL);
+
+        inpL = strided_conv_transpose_1d(
+            ctx0, inpL, block.us_conv_w, block.us_conv_b, ratios[layer_ix]);
+
+        struct ggml_tensor *current = inpL; // main branch starts from the upsampled tensor
+
+        // shortcut branch: conv applied directly to the upsampled tensor
+        struct ggml_tensor *shortcut = strided_conv_1d(
+            ctx0, inpL, block.conv_sc_w, block.conv_sc_b, stride);
+
+        // conv1: ELU followed by the first main-branch conv
+        current = ggml_elu(ctx0, current);
+
+        current = strided_conv_1d(
+            ctx0, current, block.conv_1_w, block.conv_1_b, stride);
+
+        // conv2: ELU followed by the second main-branch conv
+        current = ggml_elu(ctx0, current);
+
+        current = strided_conv_1d(
+            ctx0, current, block.conv_2_w, block.conv_2_b, stride);
+
+        // residual connection: main branch + shortcut becomes the next block's input
+        inpL = ggml_add(ctx0, current, shortcut);
+    }
+
+    // final conv, preceded by an ELU
+    inpL = ggml_elu(ctx0, inpL);
+
+    struct ggml_tensor *decoded_inp = strided_conv_1d(
+        ctx0, inpL, decoder->final_conv_w, decoder->final_conv_b, stride);
+
+    return decoded_inp;
+}
--- a/vall_e.cpp/include/dr_wav.h
+++ b/vall_e.cpp/include/dr_wav.h
--- a/vall_e.cpp/include/encodec.h
+++ b/vall_e.cpp/include/encodec.h
@ -0,0 +1,184 @@
+/*
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2024 Pierre-Antoine Bannier                                        │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+/*
+ * This file contains the declarations of the structs and functions used in the encodec library.
+ * The library provides functionality for audio compression and decompression using a custom model.
+ * The model consists of an encoder, a quantizer and a decoder, each with their own set of parameters.
+ * The library also provides functions for loading and freeing the model, as well as compressing and decompressing audio data.
+ *
+ */
+#pragma once
+
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+    struct encodec_context;
+
+    struct encodec_statistics {
+        // The time taken to load the model (presumably microseconds, going by the _us suffix; confirm against the implementation).
+        int64_t t_load_us;
+        // The time taken to compute the model (presumably microseconds, going by the _us suffix; confirm against the implementation).
+        int64_t t_compute_us;
+    };
+
+    /**
+     * Loads an encodec model from the specified file path.
+     *
+     * @param model_path The file path to the encodec model.
+     * @param offset The offset (in bytes) to the start of the model in the file.
+     * @param n_gpu_layers The number of GPU layers to use.
+     * @return A pointer to the encodec context struct.
+     */
+    struct encodec_context *encodec_load_model(
+        const char *model_path,
+        const int offset,
+        int n_gpu_layers);
+
+    /**
+     * Sets the target bandwidth for the given encodec context.
+     *
+     * @param ectx The encodec context to set the target bandwidth for.
+     * @param bandwidth The target bandwidth to set, in bits per second.
+     */
+    void encodec_set_target_bandwidth(
+        struct encodec_context *ectx,
+        int bandwidth);
+
+    /**
+     * Sets the sample rate for the given encodec context.
+     *
+     * @param ectx The encodec context to set the sample rate for.
+     * @param sample_rate The sample rate to set.
+     */
+    void encodec_set_sample_rate(
+        struct encodec_context *ectx,
+        int sample_rate);
+
+    /**
+     * Reconstructs audio from raw audio data using the specified encodec context.
+     *
+     * @param ectx The encodec context to use for reconstruction.
+     * @param raw_audio The raw audio data to reconstruct.
+     * @param n_samples The number of samples in the raw audio buffer.
+     * @param n_threads The number of threads to use for reconstruction.
+     * @return True if the reconstruction was successful, false otherwise.
+     */
+    bool encodec_reconstruct_audio(
+        struct encodec_context *ectx,
+        const float *raw_audio,
+        const int n_samples,
+        int n_threads);
+
+    /**
+     * Compresses audio data using the specified encodec context.
+     *
+     * @param ectx The encodec context to use for compression.
+     * @param raw_audio The raw audio data to compress.
+     * @param n_samples The number of samples in the raw audio buffer.
+     * @param n_threads The number of threads to use for compression.
+     * @return True if the compression was successful, false otherwise.
+     */
+    bool encodec_compress_audio(
+        struct encodec_context *ectx,
+        const float *raw_audio,
+        const int n_samples,
+        int n_threads);
+
+    /**
+     * Decompresses audio data using the specified encodec context.
+     *
+     * @param ectx The encodec context to use for decompression.
+     * @param codes The compressed audio data to decompress.
+     * @param n_codes The number of codes in the codes buffer.
+     * @param n_threads The number of threads to use for decompression.
+     * @return True if the audio data was successfully decompressed, false otherwise.
+     */
+    bool encodec_decompress_audio(
+        struct encodec_context *ectx,
+        const int32_t *codes,
+        const int n_codes,
+        int n_threads);
+
+    /**
+     * Gets the audio data from the given encodec context.
+     *
+     * @param ectx The encodec context to get the audio data from.
+     * @return A pointer to the audio data.
+    */
+    float * encodec_get_audio(
+        struct encodec_context *ectx);
+
+    /**
+     * Gets the size of the audio data from the given encodec context.
+     *
+     * @param ectx The encodec context to get the audio size from.
+     * @return The size of the audio data.
+    */
+    int encodec_get_audio_size(
+        struct encodec_context *ectx);
+
+    /**
+     * Gets the code data from the given encodec context.
+     *
+     * @param ectx The encodec context to get the code data from.
+     * @return A pointer to the code data.
+    */
+    int32_t * encodec_get_codes(
+        struct encodec_context *ectx);
+
+    /**
+     * Gets the size of the code data from the given encodec context.
+     *
+     * @param ectx The encodec context to get the code size from.
+     * @return The size of the code data.
+    */
+    int encodec_get_codes_size(
+        struct encodec_context *ectx);
+
+    /**
+     * Gets the statistics for the given encodec context.
+     *
+     * @param ectx The encodec context to get the statistics for.
+     * @return A pointer to the statistics struct.
+    */
+    const struct encodec_statistics* encodec_get_statistics(
+        struct encodec_context *ectx);
+
+    /**
+     * Reset the statistics for the given encodec context.
+     *
+     * @param ectx The encodec context to reset the statistics for.
+    */
+   void encodec_reset_statistics(
+        struct encodec_context *ectx);
+
+    /**
+     * @brief Frees the memory allocated for an encodec context.
+     *
+     * @param ectx The encodec context to free.
+     */
+    void encodec_free(
+        struct encodec_context *ectx);
+
+#ifdef __cplusplus
+}
+#endif
--- a/vall_e.cpp/include/encoder.h
+++ b/vall_e.cpp/include/encoder.h
@ -0,0 +1,109 @@
+#pragma once
+
+#include <vector>
+
+#include "ggml.h"
+#include "lstm.h"
+
+// Weights and biases for one encoder block: a residual unit (conv1, conv2, shortcut) plus a downsampling conv at some ratio.
+struct encodec_encoder_block {
+    // conv1: first convolution of the main branch, weight and bias
+    struct ggml_tensor *conv_1_w;
+    struct ggml_tensor *conv_1_b;
+
+    // conv2: second convolution, weight and bias
+    struct ggml_tensor *conv_2_w;
+    struct ggml_tensor *conv_2_b;
+
+    // shortcut: convolution on the skip path, weight and bias
+    struct ggml_tensor *conv_sc_w;
+    struct ggml_tensor *conv_sc_b;
+
+    // downsampling layer: weight and bias
+    struct ggml_tensor *ds_conv_w;
+    struct ggml_tensor *ds_conv_b;
+};
+// All encoder parameters; encodec_forward_encoder starts with init_conv and then walks blocks[0..3].
+struct encodec_encoder {
+    struct ggml_tensor *init_conv_w; // first convolution applied to the input tensor
+    struct ggml_tensor *init_conv_b;
+
+    encodec_lstm lstm; // LSTM parameters (type declared in lstm.h)
+
+    struct ggml_tensor *final_conv_w; // final convolution weight
+    struct ggml_tensor *final_conv_b; // final convolution bias
+
+    std::vector<encodec_encoder_block> blocks; // encodec_forward_encoder indexes blocks[0..3] without checking size()
+};
+
+struct ggml_tensor *encodec_forward_encoder(
+    const struct encodec_encoder *encoder, struct ggml_context *ctx0,
+    struct ggml_tensor *inp, const int * ratios, const int kernel_size, const int res_kernel_size,
+    const int stride) {
+
+    if (!inp) {
+        fprintf(stderr, "%s: null input tensor\n", __func__);
+        return NULL;
+    }
+
+    struct ggml_tensor *inpL = strided_conv_1d(
+        ctx0, inp, encoder->init_conv_w, encoder->init_conv_b, stride);
+
+    for (int layer_ix = 0; layer_ix < 4; layer_ix++) {
+        encodec_encoder_block block = encoder->blocks[layer_ix];
+
+        struct ggml_tensor *current = inpL;
+
+        // shortcut
+        struct ggml_tensor *shortcut = strided_conv_1d(
+            ctx0, inpL, block.conv_sc_w, block.conv_sc_b, stride);
+
+        // conv1
+        current = ggml_elu(ctx0, current);
+
+        current = strided_conv_1d(
+            ctx0, current, block.conv_1_w, block.conv_1_b, stride);