vall_e.cpp phonemizing and tokenizing

2024-12-24 22:39:32 -06:00 · 2024-12-24 22:39:32 -06:00 · 6bf59bbd8b
commit 6bf59bbd8b
parent 8516bab15c
36 changed files with 16032 additions and 86 deletions
--- a/vall_e.cpp/Makefile
+++ b/vall_e.cpp/Makefile
@ -3,7 +3,7 @@ CXX 					= g++
 INCS 					+= -I./include
 LIBS 					+= -L./libs
 	
-LINKS 					+= -lggml -lggml-base -lllama -lencodec
+LINKS 					+= -lggml -lggml-base -lllama -lencodec -lespeak-ng
 FLAGS 					+= -march=native -O3

 SRCS 					:= $(shell find ./ -name "*.cpp")
--- a/vall_e.cpp/README.md
+++ b/vall_e.cpp/README.md
@ -8,7 +8,7 @@ At the moment it's ***very*** work in progress.

 Populate `./include/` with the `ggml`, `llama.cpp`, and `encodec.cpp` headers.

-Populate `./libs/` with the compiled libraries of `llama.cpp` and `encodec.cpp`.
+Populate `./libs/` with the compiled libraries of `llama.cpp`, `encodec.cpp`, and `espeak-ng`.

 Run `make`.

@ -26,10 +26,10 @@ Run `make`.
 	* [x] load the quantized model
 	* [x] orchestrate the required embeddings
 	* [x] juggle the output head / classifier properly
-* [ ] phonemize text
+* [x] phonemize text
 	* with the help of espeak-ng
-* [ ] tokenize phonemes
-	* the tokenizer is being a huge thorn on actual sequences
+* [x] tokenize phonemes
+	* tokenize with `llama_tokenize` instead of a homebrewed method because the tokenizer is being a huge thorn
 * [x] load audio from disk
 * [x] encode audio
 * [x] sum embeddings for the `prom` and prior `resp`s
--- a/vall_e.cpp/include/decoder.h
+++ b/vall_e.cpp/include/decoder.h
@ -0,0 +1,113 @@
+#pragma once
+
+#include <vector>
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+
+#include "lstm.h"
+#include "utils.h"
+
+
+// Weights of one decoder stage. encodec_forward_decoder applies them in this order:
+// ELU -> transposed conv (us_conv) -> residual branch (conv_1, conv_2) added to a conv shortcut (conv_sc).
+struct encodec_decoder_block {
+    // upsampling layers: weight and bias of the strided transposed convolution
+    struct ggml_tensor *us_conv_w;
+    struct ggml_tensor *us_conv_b;
+
+    // conv1: first convolution of the residual branch (applied after an ELU)
+    struct ggml_tensor *conv_1_w;
+    struct ggml_tensor *conv_1_b;
+
+    // conv2: second convolution of the residual branch (applied after an ELU)
+    struct ggml_tensor *conv_2_w;
+    struct ggml_tensor *conv_2_b;
+
+    // shortcut: convolution applied to the upsampled tensor and added to the conv2 output
+    struct ggml_tensor *conv_sc_w;
+    struct ggml_tensor *conv_sc_b;
+};
+
+// All decoder weights read by encodec_forward_decoder.
+struct encodec_decoder {
+    // first convolution, applied to the input tensor
+    struct ggml_tensor *init_conv_w;
+    struct ggml_tensor *init_conv_b;
+
+    // two-layer LSTM (l0, l1); its output is added back to its input
+    encodec_lstm lstm;
+
+    // last convolution, applied after the final ELU
+    struct ggml_tensor *final_conv_w;
+    struct ggml_tensor *final_conv_b;
+
+    // upsampling stages; the forward pass reads blocks[0] through blocks[3]
+    std::vector<encodec_decoder_block> blocks;
+};
+
+/**
+ * Builds the decoder graph: initial conv, a two-layer LSTM with a residual add,
+ * four upsampling residual blocks, then ELU and the final conv.
+ *
+ * NOTE(review): this is a non-inline definition in a header, so including the header
+ * from more than one translation unit will produce duplicate-symbol link errors.
+ *
+ * @param decoder         Decoder weights; must be non-null with at least 4 entries in `blocks`.
+ * @param ctx0            ggml context forwarded to every op built here; must be non-null.
+ * @param quantized_out   Input tensor fed to the initial convolution; must be non-null.
+ * @param ratios          Per-block strides for the transposed convolutions; ratios[0..3] are read.
+ * @param kernel_size     Not used by this function.
+ * @param res_kernel_size Not used by this function.
+ * @param stride          Stride passed to every strided_conv_1d call.
+ * @return The output of the final convolution, or NULL when an argument fails validation.
+ */
+struct ggml_tensor *encodec_forward_decoder(
+    const struct encodec_decoder *decoder, struct ggml_context *ctx0,
+    struct ggml_tensor *quantized_out, const int *ratios, const int kernel_size, const int res_kernel_size,
+    const int stride) {
+
+    // Number of upsampling blocks walked below; ratios[] must hold one stride per block.
+    const int n_blocks = 4;
+
+    // Accepted for signature compatibility only.
+    (void) kernel_size;
+    (void) res_kernel_size;
+
+    if (!quantized_out) {
+        fprintf(stderr, "%s: null input tensor\n", __func__);
+        return NULL;
+    }
+
+    if (!decoder || !ctx0 || !ratios) {
+        fprintf(stderr, "%s: null decoder, context or ratios\n", __func__);
+        return NULL;
+    }
+
+    // blocks[] is indexed unconditionally below; refuse a model with too few blocks.
+    if (decoder->blocks.size() < static_cast<size_t>(n_blocks)) {
+        fprintf(stderr, "%s: expected at least %d decoder blocks, got %zu\n",
+                __func__, n_blocks, decoder->blocks.size());
+        return NULL;
+    }
+
+    struct ggml_tensor *inpL = strided_conv_1d(
+        ctx0, quantized_out, decoder->init_conv_w, decoder->init_conv_b, stride);
+
+    // lstm
+    {
+        const encodec_lstm &lstm = decoder->lstm;
+
+        // first lstm layer
+        char l0_prefix[7] = "dec_l0";
+        struct ggml_tensor *hs1 = forward_pass_lstm_unilayer(
+            ctx0, inpL, lstm.l0_ih_w, lstm.l0_hh_w, lstm.l0_ih_b, lstm.l0_hh_b, l0_prefix);
+
+        // second lstm layer
+        char l1_prefix[7] = "dec_l1";
+        struct ggml_tensor *out = forward_pass_lstm_unilayer(
+            ctx0, hs1, lstm.l1_ih_w, lstm.l1_hh_w, lstm.l1_ih_b, lstm.l1_hh_b, l1_prefix);
+
+        // residual connection around the LSTM
+        inpL = ggml_add(ctx0, inpL, out);
+    }
+
+    for (int layer_ix = 0; layer_ix < n_blocks; layer_ix++) {
+        const encodec_decoder_block &block = decoder->blocks[layer_ix];
+
+        // upsampling layers
+        inpL = ggml_elu(ctx0, inpL);
+
+        inpL = strided_conv_transpose_1d(
+            ctx0, inpL, block.us_conv_w, block.us_conv_b, ratios[layer_ix]);
+
+        struct ggml_tensor *current = inpL;
+
+        // shortcut
+        struct ggml_tensor *shortcut = strided_conv_1d(
+            ctx0, inpL, block.conv_sc_w, block.conv_sc_b, stride);
+
+        // conv1
+        current = ggml_elu(ctx0, current);
+
+        current = strided_conv_1d(
+            ctx0, current, block.conv_1_w, block.conv_1_b, stride);
+
+        // conv2
+        current = ggml_elu(ctx0, current);
+
+        current = strided_conv_1d(
+            ctx0, current, block.conv_2_w, block.conv_2_b, stride);
+
+        // residual connection
+        inpL = ggml_add(ctx0, current, shortcut);
+    }
+
+    // final conv
+    inpL = ggml_elu(ctx0, inpL);
+
+    struct ggml_tensor *decoded_inp = strided_conv_1d(
+        ctx0, inpL, decoder->final_conv_w, decoder->final_conv_b, stride);
+
+    return decoded_inp;
+}
--- a/vall_e.cpp/include/dr_wav.h
+++ b/vall_e.cpp/include/dr_wav.h
--- a/vall_e.cpp/include/encodec.h
+++ b/vall_e.cpp/include/encodec.h
@ -0,0 +1,184 @@
+/*
+╞══════════════════════════════════════════════════════════════════════════════╡
+│ Copyright 2024 Pierre-Antoine Bannier                                        │
+│                                                                              │
+│ Permission to use, copy, modify, and/or distribute this software for         │
+│ any purpose with or without fee is hereby granted, provided that the         │
+│ above copyright notice and this permission notice appear in all copies.      │
+│                                                                              │
+│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
+│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
+│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
+│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
+│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
+│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
+│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
+│ PERFORMANCE OF THIS SOFTWARE.                                                │
+╚─────────────────────────────────────────────────────────────────────────────*/
+/*
+ * This file contains the declarations of the structs and functions used in the encodec library.
+ * The library provides functionality for audio compression and decompression using a custom model.
+ * The model consists of an encoder, a quantizer and a decoder, each with their own set of parameters.
+ * The library also provides functions for loading and freeing the model, as well as compressing and decompressing audio data.
+ *
+ */
+#pragma once
+
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+    struct encodec_context;
+
+    // Timing counters returned by encodec_get_statistics() and cleared by encodec_reset_statistics().
+    // NOTE(review): the `_us` suffix suggests microseconds — confirm against the implementation.
+    struct encodec_statistics {
+        // The time taken to load the model.
+        int64_t t_load_us;
+        // The time spent on model computation.
+        int64_t t_compute_us;
+    };
+
+    /**
+     * Loads an encodec model from the specified file path.
+     *
+     * @param model_path The file path to the encodec model.
+     * @param offset The offset (in bytes) to the start of the model in the file.
+     * @param n_gpu_layers The number of GPU layers to use.
+     * @return A pointer to the encodec context struct.
+     */
+    struct encodec_context *encodec_load_model(
+        const char *model_path,
+        const int offset,
+        int n_gpu_layers);
+
+    /**
+     * Sets the target bandwidth for the given encodec context.
+     *
+     * @param ectx The encodec context to set the target bandwidth for.
+     * @param bandwidth The target bandwidth to set, in bits per second.
+     */
+    void encodec_set_target_bandwidth(
+        struct encodec_context *ectx,
+        int bandwidth);
+
+    /**
+     * Sets the sample rate for the given encodec context.
+     *
+     * @param ectx The encodec context to set the sample rate for.
+     * @param sample_rate The sample rate to set.
+     */
+    void encodec_set_sample_rate(
+        struct encodec_context *ectx,
+        int sample_rate);
+
+    /**
+     * Reconstructs audio from raw audio data using the specified encodec context.
+     *
+     * @param ectx The encodec context to use for reconstruction.
+     * @param raw_audio The raw audio data to reconstruct.
+     * @param n_samples The number of samples in the raw audio buffer.
+     * @param n_threads The number of threads to use for reconstruction.
+     * @return True if the reconstruction was successful, false otherwise.
+     */
+    bool encodec_reconstruct_audio(
+        struct encodec_context *ectx,
+        const float *raw_audio,
+        const int n_samples,
+        int n_threads);
+
+    /**
+     * Compresses audio data using the specified encodec context.
+     *
+     * @param ectx The encodec context to use for compression.
+     * @param raw_audio The raw audio data to compress.
+     * @param n_samples The number of samples in the raw audio buffer.
+     * @param n_threads The number of threads to use for compression.
+     * @return True if the compression was successful, false otherwise.
+     */
+    bool encodec_compress_audio(
+        struct encodec_context *ectx,
+        const float *raw_audio,
+        const int n_samples,
+        int n_threads);
+
+    /**
+     * Decompresses audio data using the specified encodec context.
+     *
+     * @param ectx The encodec context to use for decompression.
+     * @param codes The compressed audio data to decompress.
+     * @param n_codes The number of codes in the codes buffer.
+     * @param n_threads The number of threads to use for decompression.
+     * @return True if the audio data was successfully decompressed, false otherwise.
+     */
+    bool encodec_decompress_audio(
+        struct encodec_context *ectx,
+        const int32_t *codes,
+        const int n_codes,
+        int n_threads);
+
+    /**
+     * Gets the audio data from the given encodec context.
+     *
+     * @param ectx The encodec context to get the audio data from.
+     * @return A pointer to the audio data.
+    */
+    float * encodec_get_audio(
+        struct encodec_context *ectx);
+
+    /**
+     * Gets the size of the audio data from the given encodec context.
+     *
+     * @param ectx The encodec context to get the audio size from.
+     * @return The size of the audio data.
+    */
+    int encodec_get_audio_size(
+        struct encodec_context *ectx);
+
+    /**
+     * Gets the code data from the given encodec context.
+     *
+     * @param ectx The encodec context to get the code data from.
+     * @return A pointer to the code data.
+    */
+    int32_t * encodec_get_codes(
+        struct encodec_context *ectx);
+
+    /**
+     * Gets the size of the code data from the given encodec context.
+     *
+     * @param ectx The encodec context to get the code size from.
+     * @return The size of the code data.
+    */
+    int encodec_get_codes_size(
+        struct encodec_context *ectx);
+
+    /**
+     * Gets the statistics for the given encodec context.
+     *
+     * @param ectx The encodec context to get the statistics for.
+     * @return A pointer to the statistics struct.
+    */
+    const struct encodec_statistics* encodec_get_statistics(
+        struct encodec_context *ectx);
+
+    /**
+     * Reset the statistics for the given encodec context.
+     *
+     * @param ectx The encodec context to reset the statistics for.
+    */
+   void encodec_reset_statistics(
+        struct encodec_context *ectx);
+
+    /**
+     * @brief Frees the memory allocated for an encodec context.
+     *
+     * @param ectx The encodec context to free.
+     */
+    void encodec_free(
+        struct encodec_context *ectx);
+
+#ifdef __cplusplus
+}
+#endif
--- a/vall_e.cpp/include/encoder.h
+++ b/vall_e.cpp/include/encoder.h
@ -0,0 +1,109 @@
+#pragma once
+
+#include <vector>
+
+#include "ggml.h"
+#include "lstm.h"
+
+// Weights of one encoder stage: a residual unit followed by a downsampling convolution.
+// encodec_forward_encoder applies them as: conv shortcut + (ELU -> conv_1 -> ELU -> conv_2),
+// then ELU -> ds_conv with the stride taken from ratios[3 - layer index].
+struct encodec_encoder_block {
+    // conv1: first convolution of the residual branch (applied after an ELU)
+    struct ggml_tensor *conv_1_w;
+    struct ggml_tensor *conv_1_b;
+
+    // conv2: second convolution of the residual branch (applied after an ELU)
+    struct ggml_tensor *conv_2_w;
+    struct ggml_tensor *conv_2_b;
+
+    // shortcut: convolution applied to the block input and added to the conv2 output
+    struct ggml_tensor *conv_sc_w;
+    struct ggml_tensor *conv_sc_b;
+
+    // downsampling layers: strided convolution applied after the residual add
+    struct ggml_tensor *ds_conv_w;
+    struct ggml_tensor *ds_conv_b;
+};
+
+// All encoder weights read by encodec_forward_encoder.
+struct encodec_encoder {
+    // first convolution, applied to the input tensor
+    struct ggml_tensor *init_conv_w;
+    struct ggml_tensor *init_conv_b;
+
+    // two-layer LSTM (l0, l1) run after the blocks; its output is added back to its input
+    encodec_lstm lstm;
+
+    // last convolution, applied after the final ELU
+    struct ggml_tensor *final_conv_w;
+    struct ggml_tensor *final_conv_b;
+
+    // residual + downsampling stages; the forward pass reads blocks[0] through blocks[3]
+    std::vector<encodec_encoder_block> blocks;
+};
+
+/**
+ * Builds the encoder graph: initial conv, four residual + downsampling blocks,
+ * a two-layer LSTM with a residual add, then ELU and the final conv.
+ *
+ * NOTE(review): this is a non-inline definition in a header, so including the header
+ * from more than one translation unit will produce duplicate-symbol link errors.
+ *
+ * @param encoder         Encoder weights; must be non-null with at least 4 entries in `blocks`.
+ * @param ctx0            ggml context forwarded to every op built here; must be non-null.
+ * @param inp             Input tensor fed to the initial convolution; must be non-null.
+ * @param ratios          Downsampling strides, consumed in reverse: block i uses ratios[3 - i].
+ * @param kernel_size     Not used by this function.
+ * @param res_kernel_size Not used by this function.
+ * @param stride          Stride passed to every non-downsampling strided_conv_1d call.
+ * @return The output of the final convolution, or NULL when an argument fails validation.
+ */
+struct ggml_tensor *encodec_forward_encoder(
+    const struct encodec_encoder *encoder, struct ggml_context *ctx0,
+    struct ggml_tensor *inp, const int * ratios, const int kernel_size, const int res_kernel_size,
+    const int stride) {
+
+    // Number of blocks walked below; ratios[] must hold one stride per block.
+    const int n_blocks = 4;
+
+    // Accepted for signature compatibility only.
+    (void) kernel_size;
+    (void) res_kernel_size;
+
+    if (!inp) {
+        fprintf(stderr, "%s: null input tensor\n", __func__);
+        return NULL;
+    }
+
+    if (!encoder || !ctx0 || !ratios) {
+        fprintf(stderr, "%s: null encoder, context or ratios\n", __func__);
+        return NULL;
+    }
+
+    // blocks[] is indexed unconditionally below; refuse a model with too few blocks.
+    if (encoder->blocks.size() < static_cast<size_t>(n_blocks)) {
+        fprintf(stderr, "%s: expected at least %d encoder blocks, got %zu\n",
+                __func__, n_blocks, encoder->blocks.size());
+        return NULL;
+    }
+
+    struct ggml_tensor *inpL = strided_conv_1d(
+        ctx0, inp, encoder->init_conv_w, encoder->init_conv_b, stride);
+
+    for (int layer_ix = 0; layer_ix < n_blocks; layer_ix++) {
+        const encodec_encoder_block &block = encoder->blocks[layer_ix];
+
+        struct ggml_tensor *current = inpL;
+
+        // shortcut
+        struct ggml_tensor *shortcut = strided_conv_1d(
+            ctx0, inpL, block.conv_sc_w, block.conv_sc_b, stride);
+
+        // conv1
+        current = ggml_elu(ctx0, current);
+
+        current = strided_conv_1d(
+            ctx0, current, block.conv_1_w, block.conv_1_b, stride);
+
+        // conv2
+        current = ggml_elu(ctx0, current);
+
+        current = strided_conv_1d(
+            ctx0, current, block.conv_2_w, block.conv_2_b, stride);
+
+        // residual connection
+        inpL = ggml_add(ctx0, current, shortcut);
+
+        // downsampling layers: ratios are read back to front
+        inpL = ggml_elu(ctx0, inpL);
+
+        inpL = strided_conv_1d(
+            ctx0, inpL, block.ds_conv_w, block.ds_conv_b, ratios[n_blocks - 1 - layer_ix]);
+    }
+
+    // lstm
+    {
+        const encodec_lstm &lstm = encoder->lstm;
+
+        // first lstm layer
+        char l0_prefix[7] = "enc_l0";
+        struct ggml_tensor *hs1 = forward_pass_lstm_unilayer(
+            ctx0, inpL, lstm.l0_ih_w, lstm.l0_hh_w, lstm.l0_ih_b, lstm.l0_hh_b, l0_prefix);
+
+        // second lstm layer
+        char l1_prefix[7] = "enc_l1";
+        struct ggml_tensor *out = forward_pass_lstm_unilayer(
+            ctx0, hs1, lstm.l1_ih_w, lstm.l1_hh_w, lstm.l1_ih_b, lstm.l1_hh_b, l1_prefix);
+
+        // residual connection around the LSTM
+        inpL = ggml_add(ctx0, inpL, out);
+    }
+
+    // final conv
+    inpL = ggml_elu(ctx0, inpL);
+
+    struct ggml_tensor *encoded_inp = strided_conv_1d(
+        ctx0, inpL, encoder->final_conv_w, encoder->final_conv_b, stride);
+
+    return encoded_inp;
+}
--- a/vall_e.cpp/include/espeak-ng/encoding.h
+++ b/vall_e.cpp/include/espeak-ng/encoding.h
@ -0,0 +1,103 @@
+/*
+ * Copyright (C) 2017 Reece H. Dunn
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see: <http://www.gnu.org/licenses/>.
+ */
+#ifndef ESPEAK_NG_ENCODING_H
+#define ESPEAK_NG_ENCODING_H
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+// Text encodings accepted by the text_decoder_* functions declared in this header.
+// Enumerators take implicit consecutive values starting at 0 (ESPEAKNG_ENCODING_UNKNOWN),
+// so reordering or inserting entries changes the numeric value of everything after them.
+typedef enum
+{
+	ESPEAKNG_ENCODING_UNKNOWN,
+	ESPEAKNG_ENCODING_US_ASCII,
+	ESPEAKNG_ENCODING_ISO_8859_1,
+	ESPEAKNG_ENCODING_ISO_8859_2,
+	ESPEAKNG_ENCODING_ISO_8859_3,
+	ESPEAKNG_ENCODING_ISO_8859_4,
+	ESPEAKNG_ENCODING_ISO_8859_5,
+	ESPEAKNG_ENCODING_ISO_8859_6,
+	ESPEAKNG_ENCODING_ISO_8859_7,
+	ESPEAKNG_ENCODING_ISO_8859_8,
+	ESPEAKNG_ENCODING_ISO_8859_9,
+	ESPEAKNG_ENCODING_ISO_8859_10,
+	ESPEAKNG_ENCODING_ISO_8859_11,
+	// ISO-8859-12 is not a valid encoding, so there is no enumerator for it.
+	ESPEAKNG_ENCODING_ISO_8859_13,
+	ESPEAKNG_ENCODING_ISO_8859_14,
+	ESPEAKNG_ENCODING_ISO_8859_15,
+	ESPEAKNG_ENCODING_ISO_8859_16,
+	ESPEAKNG_ENCODING_KOI8_R,
+	ESPEAKNG_ENCODING_ISCII,
+	ESPEAKNG_ENCODING_UTF_8,
+	ESPEAKNG_ENCODING_ISO_10646_UCS_2,
+} espeak_ng_ENCODING;
+
+// NOTE: ESPEAK_NG_API and espeak_ng_STATUS, used by the declarations in this header, are not
+// defined here (only <stdint.h> is included); they come from <espeak-ng/espeak_ng.h>, which
+// therefore has to be included before this header.
+// Maps an encoding name to its espeak_ng_ENCODING value.
+// NOTE(review): presumably returns ESPEAKNG_ENCODING_UNKNOWN for unrecognised names — confirm.
+ESPEAK_NG_API espeak_ng_ENCODING
+espeak_ng_EncodingFromName(const char *encoding);
+
+typedef struct espeak_ng_TEXT_DECODER_ espeak_ng_TEXT_DECODER;
+
+ESPEAK_NG_API espeak_ng_TEXT_DECODER *
+create_text_decoder(void);
+
+ESPEAK_NG_API void
+destroy_text_decoder(espeak_ng_TEXT_DECODER *decoder);
+
+ESPEAK_NG_API espeak_ng_STATUS
+text_decoder_decode_string(espeak_ng_TEXT_DECODER *decoder,
+                           const char *string,
+                           int length,
+                           espeak_ng_ENCODING encoding);
+
+ESPEAK_NG_API espeak_ng_STATUS
+text_decoder_decode_string_auto(espeak_ng_TEXT_DECODER *decoder,
+                                const char *string,
+                                int length,
+                                espeak_ng_ENCODING encoding);
+
+ESPEAK_NG_API espeak_ng_STATUS
+text_decoder_decode_wstring(espeak_ng_TEXT_DECODER *decoder,
+                            const wchar_t *string,
+                            int length);
+
+ESPEAK_NG_API espeak_ng_STATUS
+text_decoder_decode_string_multibyte(espeak_ng_TEXT_DECODER *decoder,
+                                     const void *input,
+                                     espeak_ng_ENCODING encoding,
+                                     int flags);
+
+ESPEAK_NG_API int
+text_decoder_eof(espeak_ng_TEXT_DECODER *decoder);
+
+ESPEAK_NG_API uint32_t
+text_decoder_getc(espeak_ng_TEXT_DECODER *decoder);
+
+ESPEAK_NG_API uint32_t
+text_decoder_peekc(espeak_ng_TEXT_DECODER *decoder);
+
+ESPEAK_NG_API const void *
+text_decoder_get_buffer(espeak_ng_TEXT_DECODER *decoder);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/vall_e.cpp/include/espeak-ng/espeak_ng.h
+++ b/vall_e.cpp/include/espeak-ng/espeak_ng.h
@ -0,0 +1,223 @@
+/* eSpeak NG API.
+ *
+ * Copyright (C) 2015-2017 Reece H. Dunn
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ESPEAK_NG_H
+#define ESPEAK_NG_H
+
+#include <espeak-ng/speak_lib.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#if defined(_WIN32) || defined(_WIN64)
+#ifdef LIBESPEAK_NG_EXPORT
+#define ESPEAK_NG_API __declspec(dllexport)
+#else
+#define ESPEAK_NG_API __declspec(dllimport)
+#endif
+#else
+#define ESPEAK_NG_API
+#endif
+
+#define ESPEAKNG_DEFAULT_VOICE "en"
+
+/* Status codes returned by the espeak_ng_* functions declared below.
+ * A code's group is (status & ENS_GROUP_MASK): ENS_GROUP_ERRNO for ENS_OK and errno values,
+ * ENS_GROUP_ESPEAK_NG for the 0x1000xxFF codes listed here. */
+typedef enum {
+	ENS_GROUP_MASK               = 0x70000000,
+	ENS_GROUP_ERRNO              = 0x00000000, /* Values 0-255 map to errno error codes. */
+	ENS_GROUP_ESPEAK_NG          = 0x10000000, /* eSpeak NG error codes. */
+
+	/* eSpeak NG 1.49.0 */
+	ENS_OK                       = 0,
+	ENS_COMPILE_ERROR            = 0x100001FF,
+	ENS_VERSION_MISMATCH         = 0x100002FF,
+	ENS_FIFO_BUFFER_FULL         = 0x100003FF,
+	ENS_NOT_INITIALIZED          = 0x100004FF,
+	ENS_AUDIO_ERROR              = 0x100005FF,
+	ENS_VOICE_NOT_FOUND          = 0x100006FF,
+	ENS_MBROLA_NOT_FOUND         = 0x100007FF,
+	ENS_MBROLA_VOICE_NOT_FOUND   = 0x100008FF,
+	ENS_EVENT_BUFFER_FULL        = 0x100009FF,
+	ENS_NOT_SUPPORTED            = 0x10000AFF,
+	ENS_UNSUPPORTED_PHON_FORMAT  = 0x10000BFF,
+	ENS_NO_SPECT_FRAMES          = 0x10000CFF,
+	ENS_EMPTY_PHONEME_MANIFEST   = 0x10000DFF,
+	ENS_SPEECH_STOPPED           = 0x10000EFF,
+
+	/* eSpeak NG 1.49.2 */
+	ENS_UNKNOWN_PHONEME_FEATURE  = 0x10000FFF,
+	ENS_UNKNOWN_TEXT_ENCODING    = 0x100010FF,
+} espeak_ng_STATUS;
+
+/* Output mode passed to espeak_ng_InitializeOutput().
+ * The values occupy distinct bits (0x1, 0x2). NOTE(review): presumably they may be OR-ed together — confirm. */
+typedef enum {
+	ENOUTPUT_MODE_SYNCHRONOUS = 0x0001,
+	ENOUTPUT_MODE_SPEAK_AUDIO = 0x0002,
+} espeak_ng_OUTPUT_MODE;
+
+/* Gender categories for a voice; the numeric values are fixed explicitly (0-3). */
+typedef enum {
+	ENGENDER_UNKNOWN = 0,
+	ENGENDER_MALE = 1,
+	ENGENDER_FEMALE = 2,
+	ENGENDER_NEUTRAL = 3,
+} espeak_ng_VOICE_GENDER;
+
+/* Table of caller-supplied callbacks, installed with espeak_ng_SetOutputHooks().
+ * NOTE(review): when each hook fires is not visible from this header — confirm against the
+ * eSpeak NG sources before relying on call order or frequency. */
+typedef struct
+{
+  void (*outputPhoSymbol)(char* pho_code,int pho_type); /* receives a phoneme code string and its type */
+  void (*outputSilence)(short echo_tail);               /* receives an echo-tail value */
+  void (*outputVoiced)(short sample);                   /* receives one sample value */
+  void (*outputUnvoiced)(short sample);                 /* receives one sample value */
+} espeak_ng_OUTPUT_HOOKS;
+
+/* eSpeak NG 1.49.0 */
+
+typedef struct espeak_ng_ERROR_CONTEXT_ *espeak_ng_ERROR_CONTEXT;
+
+ESPEAK_NG_API void
+espeak_ng_ClearErrorContext(espeak_ng_ERROR_CONTEXT *context);
+
+ESPEAK_NG_API void
+espeak_ng_GetStatusCodeMessage(espeak_ng_STATUS status,
+                               char *buffer,
+                               size_t length);
+
+ESPEAK_NG_API void
+espeak_ng_PrintStatusCodeMessage(espeak_ng_STATUS status,
+                                 FILE *out,
+                                 espeak_ng_ERROR_CONTEXT context);
+
+ESPEAK_NG_API void
+espeak_ng_InitializePath(const char *path);
+
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_Initialize(espeak_ng_ERROR_CONTEXT *context);
+
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_InitializeOutput(espeak_ng_OUTPUT_MODE output_mode,
+                           int buffer_length,
+                           const char *device);
+
+ESPEAK_NG_API int
+espeak_ng_GetSampleRate(void);
+
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_SetParameter(espeak_PARAMETER parameter,
+                       int value,
+                       int relative);
+
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_SetPhonemeEvents(int enable, int ipa);
+
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_SetPunctuationList(const wchar_t *punctlist);
+
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_SetVoiceByName(const char *name);
+
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_SetVoiceByFile(const char *filename);
+
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_SetVoiceByProperties(espeak_VOICE *voice_selector);
+
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_Synthesize(const void *text,
+                     size_t size,
+                     unsigned int position,
+                     espeak_POSITION_TYPE position_type,
+                     unsigned int end_position,
+                     unsigned int flags,
+                     unsigned int *unique_identifier,
+                     void *user_data);
+
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_SynthesizeMark(const void *text,
+                         size_t size,
+                         const char *index_mark,
+                         unsigned int end_position,
+                         unsigned int flags,
+                         unsigned int *unique_identifier,
+                         void *user_data);
+
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_SpeakKeyName(const char *key_name);
+
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_SpeakCharacter(wchar_t character);
+
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_Cancel(void);
+
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_Synchronize(void);
+
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_Terminate(void);
+
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_CompileDictionary(const char *dsource,
+                            const char *dict_name,
+                            FILE *log,
+                            int flags,
+                            espeak_ng_ERROR_CONTEXT *context);
+
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_CompileMbrolaVoice(const char *path,
+                             FILE *log,
+                             espeak_ng_ERROR_CONTEXT *context);
+
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_CompilePhonemeData(long rate,
+                             FILE *log,
+                             espeak_ng_ERROR_CONTEXT *context);
+
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_CompileIntonation(FILE *log,
+                            espeak_ng_ERROR_CONTEXT *context);
+
+
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_CompileIntonationPath(const char *source_path,
+                                const char *destination_path,
+                                FILE *log,
+                                espeak_ng_ERROR_CONTEXT *context);
+
+/* eSpeak NG 1.49.1 */
+
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_CompilePhonemeDataPath(long rate,
+                                 const char *source_path,
+                                 const char *destination_path,
+                                 FILE *log,
+                                 espeak_ng_ERROR_CONTEXT *context);
+                                 
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_SetOutputHooks(espeak_ng_OUTPUT_HOOKS* hooks);
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_SetConstF0(int f0);
+
+ESPEAK_NG_API espeak_ng_STATUS
+espeak_ng_SetRandSeed(long seed);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/vall_e.cpp/include/espeak-ng/speak_lib.h
+++ b/vall_e.cpp/include/espeak-ng/speak_lib.h
@ -0,0 +1,709 @@
+#ifndef SPEAK_LIB_H
+#define SPEAK_LIB_H
+/***************************************************************************
+ *   Copyright (C) 2005 to 2012 by Jonathan Duddington                     *
+ *   email: jonsd@users.sourceforge.net                                    *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 3 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, see:                                 *
+ *               <http://www.gnu.org/licenses/>.                           *
+ ***************************************************************************/
+
+
+/*************************************************************/
+/* This is the header file for the library version of espeak */
+/*                                                           */
+/*************************************************************/
+
+#include <stdio.h>
+#include <stddef.h>
+
+#if defined(_WIN32) || defined(_WIN64)
+#ifdef LIBESPEAK_NG_EXPORT
+#define ESPEAK_API __declspec(dllexport)
+#else
+#define ESPEAK_API __declspec(dllimport)
+#endif
+#else
+#define ESPEAK_API
+#endif
+
+#define ESPEAK_API_REVISION  12
+/*
+Revision 2
+   Added parameter "options" to eSpeakInitialize()
+
+Revision 3
+   Added espeakWORDGAP to  espeak_PARAMETER
+
+Revision 4
+   Added flags parameter to espeak_CompileDictionary()
+
+Revision 5
+   Added espeakCHARS_16BIT
+
+Revision 6
+  Added macros: espeakRATE_MINIMUM, espeakRATE_MAXIMUM, espeakRATE_NORMAL
+
+Revision 7  24.Dec.2011
+  Changed espeak_EVENT structure to add id.string[] for phoneme mnemonics.
+  Added espeakINITIALIZE_PHONEME_IPA option for espeak_Initialize() to report phonemes as IPA names.
+
+Revision 8  26.Apr.2013
+  Added function espeak_TextToPhonemes().
+
+Revision 9  30.May.2013
+  Changed function espeak_TextToPhonemes().
+
+Revision 10 29.Aug.2014
+  Changed phonememode parameter to espeak_TextToPhonemes() and espeak_SetPhonemeTrace
+
+Revision 11 (espeak-ng)
+  Made ESPEAK_API import/export symbols correctly on Windows.
+
+Revision 12 (espeak-ng)
+  Exposed espeak_SetPhonemeCallback. This is available in eSpeak, but was not exposed in this header.
+
+*/
+         /********************/
+         /*  Initialization  */
+         /********************/
+
+// values for 'value' in espeak_SetParameter(espeakRATE, value, 0), nominally in words-per-minute
+#define espeakRATE_MINIMUM  80
+#define espeakRATE_MAXIMUM  450
+#define espeakRATE_NORMAL   175
+
+
+// Kinds of event stored in espeak_EVENT::type.
+typedef enum {
+  espeakEVENT_LIST_TERMINATED = 0, // Retrieval mode: terminates the event list.
+  espeakEVENT_WORD = 1,            // Start of word
+  espeakEVENT_SENTENCE = 2,        // Start of sentence
+  espeakEVENT_MARK = 3,            // Mark
+  espeakEVENT_PLAY = 4,            // Audio element
+  espeakEVENT_END = 5,             // End of sentence or clause
+  espeakEVENT_MSG_TERMINATED = 6,  // End of message
+  espeakEVENT_PHONEME = 7,         // Phoneme, if enabled in espeak_Initialize()
+  espeakEVENT_SAMPLERATE = 8       // Set sample rate
+} espeak_EVENT_TYPE;
+
+
+
+// One event as passed (in an array) to the synthesis callback; see t_espeak_callback.
+// Which member of `id` is meaningful depends on `type`, as noted on each member.
+typedef struct {
+	espeak_EVENT_TYPE type;
+	unsigned int unique_identifier; // message identifier (or 0 for key or character)
+	int text_position;    // the number of characters from the start of the text
+	int length;           // word length, in characters (for espeakEVENT_WORD)
+	int audio_position;   // the time in ms within the generated speech output data
+	int sample;           // sample id (internal use)
+	void* user_data;      // pointer supplied by the calling program
+	union {
+		int number;        // used for WORD and SENTENCE events.
+		const char *name;  // used for MARK and PLAY events.  UTF8 string
+		char string[8];    // used for phoneme names (UTF8). Terminated by a zero byte unless the name needs the full 8 bytes.
+	} id;
+} espeak_EVENT;
+/*
+   When a message is supplied to espeak_Synth, the request is buffered and espeak_Synth returns. When the message is actually processed, the callback function will be called repeatedly.
+
+
+   In RETRIEVAL mode, the callback function supplies to the calling program the audio data and an event list terminated by 0 (LIST_TERMINATED).
+
+   In PLAYBACK mode, the callback function is called as soon as an event happens.
+
+   For example suppose that the following message is supplied to espeak_Synth:
+   "hello, hello."
+
+
+   * Once processed in RETRIEVAL mode, it could lead to 3 calls of the callback function :
+
+   ** Block 1:
+   <audio data> +
+   List of events: SENTENCE + WORD + LIST_TERMINATED
+
+   ** Block 2:
+   <audio data> +
+   List of events: WORD + END + LIST_TERMINATED
+
+   ** Block 3:
+   no audio data
+   List of events: MSG_TERMINATED + LIST_TERMINATED
+
+
+   * Once processed in PLAYBACK mode, it could lead to 5 calls of the callback function:
+
+   ** SENTENCE
+   ** WORD (call when the sounds are actually played)
+   ** WORD
+   ** END (call when the end of sentence is actually played.)
+   ** MSG_TERMINATED
+
+
+   The MSG_TERMINATED event is the last event. It can inform the calling program to clear the user data related to the message.
+   So if the synthesis must be stopped, the callback function is called for each pending message with the MSG_TERMINATED event.
+
+   A MARK event indicates a <mark> element in the text.
+   A PLAY event indicates an <audio> element in the text, for which the calling program should play the named sound file.
+*/
+
+
+
+// Unit in which a start position is expressed (e.g. the position_type argument of espeak_ng_Synthesize).
+// Values start at 1: POS_CHARACTER = 1, POS_WORD = 2, POS_SENTENCE = 3.
+typedef enum {
+	POS_CHARACTER = 1,
+	POS_WORD,
+	POS_SENTENCE
+} espeak_POSITION_TYPE;
+
+
+// Audio output mode, passed as the `output` argument of espeak_Initialize().
+typedef enum {
+	/* PLAYBACK mode: plays the audio data, supplies events to the calling program */
+	AUDIO_OUTPUT_PLAYBACK,
+
+	/* RETRIEVAL mode: supplies audio data and events to the calling program */
+	AUDIO_OUTPUT_RETRIEVAL,
+
+	/* SYNCHRONOUS mode: as RETRIEVAL but doesn't return until synthesis is completed */
+	AUDIO_OUTPUT_SYNCHRONOUS,
+
+	/* Synchronous playback */
+	AUDIO_OUTPUT_SYNCH_PLAYBACK
+
+} espeak_AUDIO_OUTPUT;
+
+
+typedef enum { /* status code returned by most espeak_* functions */
+	EE_OK=0, /* operation achieved */
+	EE_INTERNAL_ERROR=-1, /* internal failure; espeak_Initialize() also returns this value (-1) instead of a sample rate */
+	EE_BUFFER_FULL=1, /* the command can not be buffered; the caller may try again after a while */
+	EE_NOT_FOUND=2 /* NOTE(review): presumably "requested item not found" -- no function comment in view returns it, confirm */
+} espeak_ERROR;
+
+#define espeakINITIALIZE_PHONEME_EVENTS 0x0001
+#define espeakINITIALIZE_PHONEME_IPA   0x0002
+#define espeakINITIALIZE_DONT_EXIT     0x8000
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API int espeak_Initialize(espeak_AUDIO_OUTPUT output, int buflength, const char *path, int options);
+/* Must be called before any synthesis functions are called.
+   output: the audio data can either be played by eSpeak or passed back by the SynthCallback function.
+
+   buflength:  The length in mS of sound buffers passed to the SynthCallback function.
+            Value=0 gives a default of 60mS.
+            This parameter is only used for AUDIO_OUTPUT_RETRIEVAL and AUDIO_OUTPUT_SYNCHRONOUS modes.
+
+   path: The directory which contains the espeak-ng-data directory, or NULL for the default location.
+
+   options: bit 0:  1=allow espeakEVENT_PHONEME events.
+            bit 1:  1= espeakEVENT_PHONEME events give IPA phoneme names, not eSpeak phoneme names
+            bit 15: 1=don't exit if espeak_data is not found (used for --help)
+
+   Returns: sample rate in Hz, or -1 (EE_INTERNAL_ERROR).
+*/
+
+typedef int (t_espeak_callback)(short*, int, espeak_EVENT*);
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API void espeak_SetSynthCallback(t_espeak_callback* SynthCallback);
+/* Must be called before any synthesis functions are called.
+   This specifies a function in the calling program which is called when a buffer of
+   speech sound data has been produced.
+
+
+   The callback function is of the form:
+
+int SynthCallback(short *wav, int numsamples, espeak_EVENT *events);
+
+   wav:  is the speech sound data which has been produced.
+      NULL indicates that the synthesis has been completed.
+
+   numsamples: is the number of entries in wav.  This number may vary, may be less than
+      the value implied by the buflength parameter given in espeak_Initialize, and may
+      sometimes be zero (which does NOT indicate end of synthesis).
+
+   events: an array of espeak_EVENT items which indicate word and sentence events, and
+      also the occurrence of <mark> and <audio> elements within the text.  The list of
+      events is terminated by an event of type = 0.
+
+
+   Callback returns: 0=continue synthesis,  1=abort synthesis.
+*/
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API void espeak_SetUriCallback(int (*UriCallback)(int, const char*, const char*));
+/* This function may be called before synthesis functions are used, in order to deal with
+   <audio> tags.  It specifies a callback function which is called when an <audio> element is
+   encountered and allows the calling program to indicate whether the sound file which
+   is specified in the <audio> element is available and is to be played.
+
+   The callback function is of the form:
+
+int UriCallback(int type, const char *uri, const char *base);
+
+   type:  type of callback event.  Currently only 1= <audio> element
+
+   uri:   the "src" attribute from the <audio> element
+
+   base:  the "xml:base" attribute (if any) from the <speak> element
+
+   Return: 1=don't play the sound, but speak the text alternative.
+           0=place a PLAY event in the event list at the point where the <audio> element
+             occurs.  The calling program can then play the sound at that point.
+*/
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API void espeak_SetPhonemeCallback(int (*PhonemeCallback)(const char *));
+
+
+         /********************/
+         /*    Synthesis     */
+         /********************/
+
+
+#define espeakCHARS_AUTO   0
+#define espeakCHARS_UTF8   1
+#define espeakCHARS_8BIT   2
+#define espeakCHARS_WCHAR  3
+#define espeakCHARS_16BIT  4
+
+#define espeakSSML        0x10
+#define espeakPHONEMES    0x100
+#define espeakENDPAUSE    0x1000
+#define espeakKEEP_NAMEDATA 0x2000
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API espeak_ERROR espeak_Synth(const void *text,
+	size_t size,
+	unsigned int position,
+	espeak_POSITION_TYPE position_type,
+	unsigned int end_position,
+	unsigned int flags,
+	unsigned int* unique_identifier,
+	void* user_data);
+/* Synthesize speech for the specified text.  The speech sound data is passed to the calling
+   program in buffers by means of the callback function specified by espeak_SetSynthCallback(). The command is asynchronous: it is internally buffered and returns as soon as possible. If espeak_Initialize was previously called with AUDIO_OUTPUT_PLAYBACK as argument, the sound data are played by eSpeak.
+
+   text: The text to be spoken, terminated by a zero character. It may be either 8-bit characters,
+      wide characters (wchar_t), or UTF8 encoding.  Which of these is determined by the "flags"
+      parameter.
+
+   size: Equal to (or greater than) the size of the text data, in bytes.  This is used in order
+      to allocate internal storage space for the text.  This value is not used for
+      AUDIO_OUTPUT_SYNCHRONOUS mode.
+
+   position:  The position in the text where speaking starts. Zero indicates speak from the
+      start of the text.
+
+   position_type:  Determines whether "position" is a number of characters, words, or sentences.
+      Values: POS_CHARACTER, POS_WORD, POS_SENTENCE (see espeak_POSITION_TYPE).
+
+   end_position:  If set, this gives a character position at which speaking will stop.  A value
+      of zero indicates no end position.
+
+   flags:  These may be OR'd together:
+      Type of character codes, one of:
+         espeakCHARS_UTF8     UTF8 encoding
+         espeakCHARS_8BIT     The 8 bit ISO-8859 character set for the particular language.
+         espeakCHARS_AUTO     8 bit or UTF8  (this is the default)
+         espeakCHARS_WCHAR    Wide characters (wchar_t)
+         espeakCHARS_16BIT    16 bit characters.
+
+      espeakSSML   Elements within < > are treated as SSML elements, or if not recognised are ignored.
+
+      espeakPHONEMES  Text within [[ ]] is treated as phoneme codes (in espeak's Kirshenbaum encoding).
+
+      espeakENDPAUSE  If set then a sentence pause is added at the end of the text.  If not set then
+         this pause is suppressed.
+
+   unique_identifier: This must be either NULL, or point to an integer variable to
+       which eSpeak writes a message identifier number.
+       eSpeak includes this number in espeak_EVENT messages which are the result of
+       this call of espeak_Synth().
+
+   user_data: a pointer (or NULL) which will be passed to the callback function in
+       espeak_EVENT messages.
+
+   Return: EE_OK: operation achieved
+           EE_BUFFER_FULL: the command can not be buffered;
+             you may try after a while to call the function again.
+	   EE_INTERNAL_ERROR.
+*/
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API espeak_ERROR espeak_Synth_Mark(const void *text,
+	size_t size,
+	const char *index_mark,
+	unsigned int end_position,
+	unsigned int flags,
+	unsigned int* unique_identifier,
+	void* user_data);
+/* Synthesize speech for the specified text.  Similar to espeak_Synth() but the start position is
+   specified by the name of a <mark> element in the text.
+
+   index_mark:  The "name" attribute of a <mark> element within the text which specified the
+      point at which synthesis starts.  UTF8 string.
+
+   For the other parameters, see espeak_Synth()
+
+   Return: EE_OK: operation achieved
+           EE_BUFFER_FULL: the command can not be buffered;
+             you may try after a while to call the function again.
+	   EE_INTERNAL_ERROR.
+*/
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API espeak_ERROR espeak_Key(const char *key_name);
+/* Speak the name of a keyboard key.
+   If key_name is a single character, it speaks the name of the character.
+   Otherwise, it speaks key_name as a text string.
+
+   Return: EE_OK: operation achieved
+           EE_BUFFER_FULL: the command can not be buffered;
+             you may try after a while to call the function again.
+	   EE_INTERNAL_ERROR.
+*/
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API espeak_ERROR espeak_Char(wchar_t character);
+/* Speak the name of the given character
+
+   Return: EE_OK: operation achieved
+           EE_BUFFER_FULL: the command can not be buffered;
+             you may try after a while to call the function again.
+	   EE_INTERNAL_ERROR.
+*/
+
+
+
+
+         /***********************/
+         /*  Speech Parameters  */
+         /***********************/
+
+typedef enum { /* parameter selector for espeak_SetParameter() / espeak_GetParameter() */
+  espeakSILENCE=0, /* internal use */
+  espeakRATE=1, /* speaking speed in words per minute, values 80 to 450 */
+  espeakVOLUME=2, /* 0=silence, 100=normal full volume; greater values may produce amplitude compression or distortion */
+  espeakPITCH=3, /* base pitch, range 0-100, 50=normal */
+  espeakRANGE=4, /* pitch range, range 0-100, 0=monotone, 50=normal */
+  espeakPUNCTUATION=5, /* which punctuation characters to announce: a value in espeak_PUNCT_TYPE */
+  espeakCAPITALS=6, /* how capital letters are announced: 0=none, 1=sound icon, 2=spelling, 3 or higher=raise pitch by that many Hz */
+  espeakWORDGAP=7, /* pause between words, units of 10mS (at the default speed) */
+  espeakOPTIONS=8,   // reserved for misc. options.  not yet used
+  espeakINTONATION=9,
+  espeakSSML_BREAK_MUL=10,
+
+  espeakRESERVED2=11,
+  espeakEMPHASIS,   /* internal use */
+  espeakLINELENGTH, /* internal use */
+  espeakVOICETYPE,  // internal, 1=mbrola
+  N_SPEECH_PARAM    /* last enum: one past the highest parameter id, not itself a parameter */
+} espeak_PARAMETER;
+
+typedef enum { /* value for the espeakPUNCTUATION parameter */
+  espeakPUNCT_NONE=0, /* announce no punctuation characters */
+  espeakPUNCT_ALL=1, /* announce all punctuation characters */
+  espeakPUNCT_SOME=2 /* announce only the characters given to espeak_SetPunctuationList() */
+} espeak_PUNCT_TYPE;
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API espeak_ERROR espeak_SetParameter(espeak_PARAMETER parameter, int value, int relative);
+/* Sets the value of the specified parameter.
+   relative=0   Sets the absolute value of the parameter.
+   relative=1   Sets a relative value of the parameter.
+
+   parameter:
+      espeakRATE:    speaking speed in words per minute.  Values 80 to 450.
+
+      espeakVOLUME:  volume in range 0-200 or more.
+                     0=silence, 100=normal full volume, greater values may produce amplitude compression or distortion
+
+      espeakPITCH:   base pitch, range 0-100.  50=normal
+
+      espeakRANGE:   pitch range, range 0-100. 0=monotone, 50=normal
+
+      espeakPUNCTUATION:  which punctuation characters to announce:
+         value in espeak_PUNCT_TYPE (none, all, some),
+         see espeak_SetPunctuationList() to specify which characters are announced.
+
+      espeakCAPITALS: announce capital letters by:
+         0=none,
+         1=sound icon,
+         2=spelling,
+         3 or higher, by raising pitch.  This value gives the amount in Hz by which the pitch
+            of a word is raised to indicate it has a capital letter.
+
+      espeakWORDGAP:  pause between words, units of 10mS (at the default speed)
+
+   Return: EE_OK: operation achieved
+           EE_BUFFER_FULL: the command can not be buffered;
+             you may try after a while to call the function again.
+	   EE_INTERNAL_ERROR.
+*/
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API int espeak_GetParameter(espeak_PARAMETER parameter, int current);
+/* current=0  Returns the default value of the specified parameter.
+   current=1  Returns the current value of the specified parameter, as set by SetParameter()
+*/
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API espeak_ERROR espeak_SetPunctuationList(const wchar_t *punctlist);
+/* Specifies a list of punctuation characters whose names are to be spoken when the
+   value of the Punctuation parameter is set to "some".
+
+   punctlist:  A list of character codes, terminated by a zero character.
+
+   Return: EE_OK: operation achieved
+           EE_BUFFER_FULL: the command can not be buffered;
+             you may try after a while to call the function again.
+	   EE_INTERNAL_ERROR.
+*/
+
+#define espeakPHONEMES_SHOW    0x01
+#define espeakPHONEMES_IPA     0x02
+#define espeakPHONEMES_TRACE   0x08
+#define espeakPHONEMES_MBROLA  0x10
+#define espeakPHONEMES_TIE     0x80
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API void espeak_SetPhonemeTrace(int phonememode, FILE *stream);
+/* phonememode:  Controls the output of phoneme symbols for the text
+      bits 0-2:
+         value=0  No phoneme output (default)
+         value=1  Output the translated phoneme symbols for the text
+         value=2  as (1), but produces IPA phoneme names rather than ascii
+      bit 3:   output a trace of how the translation was done (showing the matching rules and list entries)
+      bit 4:   produce pho data for mbrola
+      bit 7:   use (bits 8-23) as a tie within multi-letter phoneme names
+      bits 8-23:  separator character, between phoneme names
+
+   stream   output stream for the phoneme symbols (and trace).  If stream=NULL then it uses stdout.
+*/
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API const char *espeak_TextToPhonemes(const void **textptr, int textmode, int phonememode);
+/* Translates text into phonemes.  Call espeak_SetVoiceByName() first, to select a language.
+
+   It returns a pointer to a character string which contains the phonemes for the text up to
+   end of a sentence, or comma, semicolon, colon, or similar punctuation.
+
+   textptr: The address of a pointer to the input text which is terminated by a zero character.
+      On return, the pointer has been advanced past the text which has been translated, or else set
+      to NULL to indicate that the end of the text has been reached.
+
+   textmode: Type of character codes, one of:
+         espeakCHARS_UTF8     UTF8 encoding
+         espeakCHARS_8BIT     The 8 bit ISO-8859 character set for the particular language.
+         espeakCHARS_AUTO     8 bit or UTF8  (this is the default)
+         espeakCHARS_WCHAR    Wide characters (wchar_t)
+         espeakCHARS_16BIT    16 bit characters.
+
+   phonememode:
+        bit 1:   0=eSpeak's ascii phoneme names, 1= International Phonetic Alphabet (as UTF-8 characters).
+        bit 7:   use (bits 8-23) as a tie within multi-letter phoneme names
+        bits 8-23:  separator character, between phoneme names
+
+*/
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API void espeak_CompileDictionary(const char *path, FILE *log, int flags);
+/* Compile pronunciation dictionary for a language which corresponds to the currently
+   selected voice.  The required voice should be selected before calling this function.
+
+   path:  The directory which contains the language's '_rules' and '_list' files.
+          'path' should end with a path separator character ('/').
+   log:   Stream for error reports and statistics information. If log=NULL then stderr will be used.
+
+   flags:  Bit 0: include source line information for debug purposes (This is displayed with the
+          -X command line option).
+*/
+         /***********************/
+         /*   Voice Selection   */
+         /***********************/
+
+
+// voice table
+typedef struct {
+	const char *name;      // a given name for this voice. UTF8 string.
+	const char *languages;       // when listing voices: entries of (priority byte + zero-terminated language name), ended by an extra zero byte; as a selection criterion: one language name with no priority byte
+	const char *identifier;      // the filename for this voice within espeak-ng-data/voices
+	unsigned char gender;  // 0=not specified, 1=male, 2=female
+	unsigned char age;     // 0=not specified, or age in years
+	unsigned char variant; // only used when passed as a parameter to espeak_SetVoiceByProperties: index into the sorted candidate list (0=best match)
+	unsigned char xx1;     // for internal use
+	int score;       // for internal use
+	void *spare;     // for internal use
+} espeak_VOICE;
+
+/* Note: The espeak_VOICE structure is used for two purposes:
+  1.  To return the details of the available voices.
+  2.  As a parameter to  espeak_SetVoiceByProperties() in order to specify selection criteria.
+
+   In (1), the "languages" field consists of a list of (UTF8) language names for which this voice
+   may be used, each language name in the list is terminated by a zero byte and is also preceded by
+   a single byte which gives a "priority" number.  The list of languages is terminated by an
+   additional zero byte.
+
+   A language name consists of a language code, optionally followed by one or more qualifier (dialect)
+   names separated by hyphens (eg. "en-uk").  A voice might, for example, have languages "en-uk" and
+   "en".  Even without "en" listed, voice would still be selected for the "en" language (because
+   "en-uk" is related) but at a lower priority.
+
+   The priority byte indicates how the voice is preferred for the language. A low number indicates a
+   more preferred voice, a higher number indicates a less preferred voice.
+
+   In (2), the "languages" field consists simply of a single (UTF8) language name, with no preceding
+   priority byte.
+*/
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API const espeak_VOICE **espeak_ListVoices(espeak_VOICE *voice_spec);
+/* Reads the voice files from espeak-ng-data/voices and creates an array of espeak_VOICE pointers.
+   The list is terminated by a NULL pointer
+
+   If voice_spec is NULL then all voices are listed.
+   If voice_spec is given, then only the voices which are compatible with the voice_spec
+   are listed, and they are listed in preference order.
+*/
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API espeak_ERROR espeak_SetVoiceByFile(const char *filename);
+/* Loads a voice given the file path.  Language is not considered.
+   "filename" is a UTF8 string.
+
+   Return: EE_OK: operation achieved
+           EE_BUFFER_FULL: the command can not be buffered;
+             you may try after a while to call the function again.
+	   EE_INTERNAL_ERROR.
+*/
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API espeak_ERROR espeak_SetVoiceByName(const char *name);
+/* Searches for a voice with a matching "name" field.  Language is not considered.
+   "name" is a UTF8 string.
+
+   Return: EE_OK: operation achieved
+           EE_BUFFER_FULL: the command can not be buffered;
+             you may try after a while to call the function again.
+	   EE_INTERNAL_ERROR.
+*/
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API espeak_ERROR espeak_SetVoiceByProperties(espeak_VOICE *voice_spec);
+/* An espeak_VOICE structure is used to pass criteria to select a voice.  Any of the following
+   fields may be set:
+
+   name     NULL, or a voice name
+
+   languages  NULL, or a single language string (with optional dialect), eg. "en-uk", or "en"
+
+   gender   0=not specified, 1=male, 2=female
+
+   age      0=not specified, or an age in years
+
+   variant  After a list of candidates is produced, scored and sorted, "variant" is used to index
+            that list and choose a voice.
+            variant=0 takes the top voice (i.e. best match). variant=1 takes the next voice, etc
+*/
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API espeak_VOICE *espeak_GetCurrentVoice(void);
+/* Returns the espeak_VOICE data for the currently selected voice.
+   This is not affected by temporary voice changes caused by SSML elements such as <voice> and <s>
+*/
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API espeak_ERROR espeak_Cancel(void);
+/* Immediately stops synthesis and audio output of the current text. When this
+   function returns, the audio output is fully stopped and the synthesizer is ready to
+   synthesize a new message.
+
+   Return: EE_OK: operation achieved
+	   EE_INTERNAL_ERROR.
+*/
+
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API int espeak_IsPlaying(void);
+/* Returns 1 if audio is played, 0 otherwise.
+*/
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API espeak_ERROR espeak_Synchronize(void);
+/* This function returns when all data have been spoken.
+   Return: EE_OK: operation achieved
+	   EE_INTERNAL_ERROR.
+*/
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API espeak_ERROR espeak_Terminate(void);
+/* last function to be called.
+   Return: EE_OK: operation achieved
+	   EE_INTERNAL_ERROR.
+*/
+
+
+#ifdef __cplusplus
+extern "C"
+#endif
+ESPEAK_API const char *espeak_Info(const char **path_data);
+/* Returns the version number string.
+   path_data  returns the path to espeak_data
+*/
+#endif
--- a/vall_e.cpp/include/ggml-alloc.h
+++ b/vall_e.cpp/include/ggml-alloc.h
@ -0,0 +1,76 @@
+#pragma once
+
+#include "ggml.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
+typedef struct      ggml_backend_buffer * ggml_backend_buffer_t;
+typedef struct             ggml_backend * ggml_backend_t;
+
+// Tensor allocator
+struct ggml_tallocr { // state of a tensor allocator; created by ggml_tallocr_new() and passed to ggml_tallocr_alloc()
+    ggml_backend_buffer_t buffer; // presumably the buffer given to ggml_tallocr_new() -- TODO confirm
+    void * base; // NOTE(review): looks like the buffer's base address -- confirm against ggml-alloc.c
+    size_t alignment; // NOTE(review): looks like the alignment applied to each allocation -- confirm
+    size_t offset; // presumably the running offset from base for the next allocation -- TODO confirm
+};
+
+GGML_API struct ggml_tallocr ggml_tallocr_new(ggml_backend_buffer_t buffer);
+GGML_API void                ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tensor);
+
+// Graph allocator
+/*
+  Example usage:
+    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
+
+    // optional: create a worst-case graph and reserve the buffers to avoid reallocations
+    ggml_gallocr_reserve(galloc, build_graph(max_batch));
+
+    // allocate the graph
+    struct ggml_cgraph * graph = build_graph(batch);
+    ggml_gallocr_alloc_graph(galloc, graph);
+
+    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
+
+    // evaluate the graph
+    ggml_backend_graph_compute(backend, graph);
+*/
+
+// special tensor flags for use with the graph allocator:
+//   ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
+//   ggml_set_output(): output tensors are never freed and never overwritten
+
+typedef struct ggml_gallocr * ggml_gallocr_t;
+
+GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
+GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
+GGML_API void           ggml_gallocr_free(ggml_gallocr_t galloc);
+
+// pre-allocate buffers from a measure graph - does not allocate or modify the graph
+// call with a worst-case graph to avoid buffer reallocations
+// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
+// returns false if the buffer allocation failed
+GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API bool ggml_gallocr_reserve_n(
+    ggml_gallocr_t galloc,
+    struct ggml_cgraph * graph,
+    const int * node_buffer_ids,
+    const int * leaf_buffer_ids);
+
+// automatic reallocation if the topology changes when using a single buffer
+// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
+GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+
+GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+
+// Utils
+// Create a buffer and allocate all the tensors in a ggml_context
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/vall_e.cpp/include/ggml-backend.h
+++ b/vall_e.cpp/include/ggml-backend.h
@ -0,0 +1,352 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+
+#ifdef GGML_BACKEND_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef GGML_BACKEND_BUILD
+#            define GGML_BACKEND_API __declspec(dllexport) extern
+#        else
+#            define GGML_BACKEND_API __declspec(dllimport) extern
+#        endif
+#    else
+#        define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
+#    endif
+#else
+#    define GGML_BACKEND_API extern
+#endif
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+    typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
+    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+    typedef struct ggml_backend_event * ggml_backend_event_t;
+    typedef struct ggml_backend * ggml_backend_t;
+    typedef void * ggml_backend_graph_plan_t;
+    typedef struct ggml_backend_reg * ggml_backend_reg_t;
+    typedef struct ggml_backend_device * ggml_backend_dev_t;
+
+
+    //
+    // Backend buffer type
+    //
+
+    GGML_API const char *          ggml_backend_buft_name          (ggml_backend_buffer_type_t buft);
+    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer  (ggml_backend_buffer_type_t buft, size_t size);
+    GGML_API size_t                ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+    GGML_API size_t                ggml_backend_buft_get_max_size  (ggml_backend_buffer_type_t buft);
+    GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+    GGML_API bool                  ggml_backend_buft_is_host       (ggml_backend_buffer_type_t buft);
+    GGML_API ggml_backend_dev_t    ggml_backend_buft_get_device    (ggml_backend_buffer_type_t buft);
+
+    //
+    // Backend buffer
+    //
+
+    enum ggml_backend_buffer_usage { // usage tag set with ggml_backend_buffer_set_usage() and read with ggml_backend_buffer_get_usage()
+        GGML_BACKEND_BUFFER_USAGE_ANY = 0, // presumably: no specific usage -- TODO confirm
+        GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1, // presumably: buffer holds model weights -- TODO confirm
+        GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2, // presumably: buffer holds compute (intermediate) tensors -- TODO confirm
+    };
+
+    GGML_API const char *                   ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
+    GGML_API void                           ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
+    GGML_API void *                         ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
+    GGML_API size_t                         ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
+    GGML_API void                           ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API size_t                         ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API size_t                         ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
+    GGML_API size_t                         ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void                           ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
+    GGML_API bool                           ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
+    GGML_API void                           ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+    GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage     (ggml_backend_buffer_t buffer);
+    GGML_API ggml_backend_buffer_type_t     ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
+    GGML_API void                           ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
+
+    // tensor copy between different backends
+    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
+
+    //
+    // Backend (stream)
+    //
+
+    GGML_API ggml_guid_t  ggml_backend_guid(ggml_backend_t backend);
+    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
+    GGML_API void         ggml_backend_free(ggml_backend_t backend);
+
+    GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
+    GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
+    GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);
+    GGML_API size_t                     ggml_backend_get_max_size(ggml_backend_t backend);
+
+    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+
+    // "offset" refers to the offset in tensor->data for setting/getting data
+    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_memset(   struct ggml_tensor * tensor,     uint8_t value, size_t offset, size_t size);
+
+    GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
+
+    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API void                      ggml_backend_graph_plan_free  (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+
+    GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+    // NOTE: will be removed, use device version instead
+    GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
+    GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
+    GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
+
+    // asynchronous copy
+    // the copy is performed after all the currently queued operations in backend_src
+    // backend_dst will wait for the copy to complete before performing other operations
+    // automatic fallback to sync copy if async is not supported
+    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
+
+    GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);
+
+    //
+    // Events
+    //
+
+    GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device);
+    GGML_API void                 ggml_backend_event_free(ggml_backend_event_t event);
+    GGML_API void                 ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend);
+    GGML_API void                 ggml_backend_event_synchronize(ggml_backend_event_t event);
+    GGML_API void                 ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event);
+
+    //
+    // Backend device
+    //
+
+    enum ggml_backend_dev_type { // device class; returned by ggml_backend_dev_type() and stored in ggml_backend_dev_props.type
+        // CPU device using system memory
+        GGML_BACKEND_DEVICE_TYPE_CPU,
+        // GPU device using dedicated memory
+        GGML_BACKEND_DEVICE_TYPE_GPU,
+        // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
+        GGML_BACKEND_DEVICE_TYPE_ACCEL
+    };
+
+    // functionality supported by the device
+    struct ggml_backend_dev_caps { // one flag per optional capability; embedded as ggml_backend_dev_props.caps
+        // asynchronous operations
+        bool async;
+        // pinned host buffer
+        bool host_buffer;
+        // creating buffers from host ptr
+        bool buffer_from_host_ptr;
+        // event synchronization
+        bool events;
+    };
+
+    // all the device properties
+    struct ggml_backend_dev_props { // output argument of ggml_backend_dev_get_props()
+        const char * name; // presumably the same string as ggml_backend_dev_name() -- TODO confirm
+        const char * description; // presumably the same string as ggml_backend_dev_description() -- TODO confirm
+        size_t memory_free; // presumably free device memory as reported by ggml_backend_dev_memory(); units look like bytes -- confirm
+        size_t memory_total; // presumably total device memory as reported by ggml_backend_dev_memory(); units look like bytes -- confirm
+        enum ggml_backend_dev_type type; // CPU / GPU / ACCEL device class
+        struct ggml_backend_dev_caps caps; // optional capabilities supported by the device
+    };
+
+    GGML_API const char *                  ggml_backend_dev_name(ggml_backend_dev_t device);
+    GGML_API const char *                  ggml_backend_dev_description(ggml_backend_dev_t device);
+    GGML_API void                          ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total);
+    GGML_API enum ggml_backend_dev_type    ggml_backend_dev_type(ggml_backend_dev_t device);
+    GGML_API void                          ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
+    GGML_API ggml_backend_reg_t            ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
+    GGML_API ggml_backend_t                ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
+    GGML_API ggml_backend_buffer_type_t    ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
+    GGML_API ggml_backend_buffer_type_t    ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
+    GGML_API ggml_backend_buffer_t         ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
+
+    GGML_API bool                          ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
+    GGML_API bool                          ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft);
+    GGML_API bool                          ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
+
+    //
+    // Backend (reg)
+    //
+
+    GGML_API const char *       ggml_backend_reg_name(ggml_backend_reg_t reg);
+    GGML_API size_t             ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
+    GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
+    GGML_API void *             ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
+
+    // Common functions that may be obtained using ggml_backend_reg_get_proc_address
+
+    // Split buffer type for tensor parallelism
+    typedef ggml_backend_buffer_type_t   (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
+    // Set the number of threads for the backend
+    typedef void                         (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
+    // Get additional buffer types provided by the device (returns a NULL-terminated array)
+    typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
+    // Set the abort callback for the backend
+    typedef void                         (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
+    // Get a list of feature flags supported by the backend (returns a NULL-terminated array)
+    struct ggml_backend_feature { // one feature flag, as a name/value pair of strings
+        const char * name; // NOTE(review): presumably NULL in the terminating entry of the returned array - confirm
+        const char * value;
+    };
+    typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg); // may be obtained using ggml_backend_reg_get_proc_address
+
+    //
+    // Backend registry
+    //
+
+    // Backend (reg) enumeration
+    GGML_API size_t             ggml_backend_reg_count(void);
+    GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
+    GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name);
+
+    // Device enumeration
+    GGML_API size_t             ggml_backend_dev_count(void);
+    GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
+    GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
+    GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);
+
+    // Direct backend (stream) initialization
+    // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
+    GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
+    // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
+    GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
+    // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
+    GGML_API ggml_backend_t ggml_backend_init_best(void);
+
+    // Load a backend from a dynamic library and register it
+    GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
+    // Unload a backend if loaded dynamically and unregister it
+    GGML_API void               ggml_backend_unload(ggml_backend_reg_t reg);
+    // Load all known backends from dynamic libraries
+    GGML_API void               ggml_backend_load_all(void);
+    GGML_API void               ggml_backend_load_all_from_path(const char * dir_path);
+
+    //
+    // Backend scheduler
+    //
+
+    // The backend scheduler allows for multiple backend devices to be used together
+    // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
+    // The backends are selected based on:
+    // - the backend that supports the operation
+    // - the location of the pre-allocated tensors (e.g. the weights)
+    /*
+      Example usage:
+
+        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
+        // preferably to run on the same backend as the buffer
+        ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+
+        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
+
+        // initialize buffers from a max size graph (optional)
+        reserve_graph = build_graph(sched, max_batch_size);
+
+        // manually assign nodes to a backend (optional, should not be needed in most cases)
+        struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
+        ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
+
+        ggml_backend_sched_reserve(sched, reserve_graph);
+
+        // compute
+        graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
+        for (int i = 0; i < 10; ++i) {
+            ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
+        }
+
+        // if there are graph inputs:
+        graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
+        ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
+        ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
+        ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
+        ggml_backend_sched_graph_compute(sched, graph); // execute the graph
+
+        // as an alternative to the above it is also possible to assign the inputs to a dedicated context and
+        // allocate them statically via ggml_backend_alloc_ctx_tensors
+
+    */
+
+    typedef struct ggml_backend_sched * ggml_backend_sched_t;
+
+    // Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback)
+    // when ask == true, the scheduler wants to know if the user wants to observe this node
+    // this allows the scheduler to batch nodes together in order to evaluate them in a single call
+    //
+    // when ask == false, the scheduler is passing the node tensor to the user for observation
+    // if the user returns false, the scheduler will cancel the graph compute
+    //
+    typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
+
+    // Initialize a backend scheduler, backends with low index are given priority over backends with high index
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
+    GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
+
+    // Initialize backend buffers from a measure graph
+    GGML_API bool                 ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
+
+    GGML_API int                  ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
+    GGML_API ggml_backend_t       ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
+
+    // Get the number of splits of the last graph
+    GGML_API int                  ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
+    GGML_API int                  ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
+
+    GGML_API size_t               ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+
+    GGML_API void                 ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+    GGML_API ggml_backend_t       ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
+
+    // Allocate and compute graph on the backend scheduler
+    GGML_API bool                 ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
+    GGML_API enum ggml_status     ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+    GGML_API enum ggml_status     ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+    GGML_API void                 ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
+
+    // Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
+    // This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
+    // The correct way to use this API is to discard the deallocated tensors and create new ones.
+    GGML_API void                 ggml_backend_sched_reset(ggml_backend_sched_t sched);
+
+    // Set a callback to be called for each resulting node during graph compute
+    GGML_API void                 ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
+
+    //
+    // Utils
+    //
+
+    struct ggml_backend_graph_copy { // returned by ggml_backend_graph_copy(); pass to ggml_backend_graph_copy_free() when done
+        ggml_backend_buffer_t buffer;
+        struct ggml_context * ctx_allocated;
+        struct ggml_context * ctx_unallocated;
+        struct ggml_cgraph * graph; // NOTE(review): presumably the copied graph - confirm
+    };
+
+    // Copy a graph to a different backend
+    GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
+    GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
+
+    typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+
+    // Compare the output of two backends
+    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
+
+    // Tensor initialization
+    GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
+    GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
+
+    // CPU buffer types are always available
+    GGML_API ggml_backend_buffer_t      ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/vall_e.cpp/include/ggml-blas.h
+++ b/vall_e.cpp/include/ggml-blas.h
@ -0,0 +1,25 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);
+
+GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);
+
+// number of threads used for conversion to float
+// for openblas and blis, this will also set the number of threads used for blas operations
+GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);
+
+
+#ifdef  __cplusplus
+}
+#endif
--- a/vall_e.cpp/include/ggml-cann.h
+++ b/vall_e.cpp/include/ggml-cann.h
@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2023-2024 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "ggml-backend.h"
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief Maximum number of CANN devices supported.
+ */
+#define GGML_CANN_MAX_DEVICES 16
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void);
+
+/**
+ * @brief Initializes the CANN backend for a specified device.
+ *
+ * This function initializes the CANN backend for the given device.
+ * It verifies the device index, allocates a context, and creates a backend
+ * instance.
+ *
+ * @param device The index of the device to initialize.
+ * @return A pointer to the initialized backend instance, or nullptr on failure.
+ */
+GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device);
+
+/**
+ * @brief Checks if a given backend is a CANN backend.
+ *
+ * This function verifies if the provided backend is a CANN backend by comparing
+ * its GUID with the CANN backend's GUID.
+ *
+ * @param backend The backend instance to check.
+ * @return True if the backend is a CANN backend, false otherwise.
+ */
+GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend);
+
+/**
+ * @brief Retrieves the CANN buffer type for a specified device.
+ *
+ * This function initializes and returns the buffer type interface associated
+ * with the given device. It ensures thread-safe access using a mutex.
+ *
+ * @param device The device index for which to retrieve the buffer type.
+ * @return A pointer to the buffer type interface for the specified device, or
+ * nullptr if the device index is out of range.
+ */
+GGML_BACKEND_API ggml_backend_buffer_type_t
+ggml_backend_cann_buffer_type(int32_t device);
+
+/**
+ * @brief Retrieves the number of CANN devices available.
+ *
+ * This function returns the number of CANN devices available based on
+ * information obtained from `ggml_cann_info()`.
+ *
+ * @return The number of CANN devices available.
+ */
+GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void);
+
+/**
+ * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
+ *
+ * @return A pointer to the host buffer type interface.
+ */
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
+
+/**
+ * @brief Retrieves the description of a specific CANN device.
+ *
+ * This function sets the specified device, retrieves the SoC name,
+ * and writes it into the provided description buffer.
+ *
+ * @param device The device index to retrieve the description for.
+ * @param description Pointer to a buffer where the description will be written.
+ * @param description_size Size of the description buffer.
+ */
+GGML_BACKEND_API void ggml_backend_cann_get_device_description(
+    int32_t device, char* description, size_t description_size);
+
+/**
+ * @brief Retrieves the memory information of a specific CANN device.
+ *
+ * This function sets the specified device, retrieves the free and total
+ * memory information of the specified type (ACL_HBM_MEM), and stores them
+ * in the provided pointers.
+ *
+ * @param device The device index to retrieve memory information for.
+ * @param free Pointer to a variable where the free memory size will be stored.
+ * @param total Pointer to a variable where the total memory size will be
+ * stored.
+ */
+GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device,
+                                                  size_t* free,
+                                                  size_t* total);
+
+#ifdef __cplusplus
+}
+#endif
--- a/vall_e.cpp/include/ggml-cpp.h
+++ b/vall_e.cpp/include/ggml-cpp.h
@ -0,0 +1,38 @@
+#pragma once
+
+#ifndef __cplusplus
+#error "This header is for C++ only"
+#endif
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+#include <memory>
+
+// Smart pointers for ggml types
+
+// ggml
+
+struct ggml_context_deleter { void operator()(ggml_context * p) { ggml_free(p); } }; // releases via ggml_free
+struct gguf_context_deleter { void operator()(gguf_context * p) { gguf_free(p); } }; // releases via gguf_free
+
+using ggml_context_ptr = std::unique_ptr<ggml_context, ggml_context_deleter>;
+using gguf_context_ptr = std::unique_ptr<gguf_context, gguf_context_deleter>;
+
+// ggml-alloc
+
+// Deleter for ggml_gallocr_ptr: releases the graph allocator via ggml_gallocr_free().
+struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
+
+// Owning smart pointer for a graph allocator.
+// The element type has to be the struct (ggml_gallocr), not the handle typedef (ggml_gallocr_t):
+// with the handle typedef the managed pointer would be ggml_gallocr_t *, which the deleter above
+// cannot accept, so any instantiation would fail to compile. This matches the other *_ptr aliases
+// in this header, which all name the struct while their deleters take the *_t handle.
+typedef std::unique_ptr<ggml_gallocr, ggml_gallocr_deleter> ggml_gallocr_ptr;
+
+// ggml-backend
+
+struct ggml_backend_deleter        { void operator()(ggml_backend_t h)        { ggml_backend_free(h); } };        // releases via ggml_backend_free
+struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t h) { ggml_backend_buffer_free(h); } }; // releases via ggml_backend_buffer_free
+struct ggml_backend_event_deleter  { void operator()(ggml_backend_event_t h)  { ggml_backend_event_free(h); } };  // releases via ggml_backend_event_free
+struct ggml_backend_sched_deleter  { void operator()(ggml_backend_sched_t h)  { ggml_backend_sched_free(h); } };  // releases via ggml_backend_sched_free
+
+using ggml_backend_ptr        = std::unique_ptr<ggml_backend,        ggml_backend_deleter>;
+using ggml_backend_buffer_ptr = std::unique_ptr<ggml_backend_buffer, ggml_backend_buffer_deleter>;
+using ggml_backend_event_ptr  = std::unique_ptr<ggml_backend_event,  ggml_backend_event_deleter>;
+using ggml_backend_sched_ptr  = std::unique_ptr<ggml_backend_sched,  ggml_backend_sched_deleter>;
--- a/vall_e.cpp/include/ggml-cpu.h
+++ b/vall_e.cpp/include/ggml-cpu.h
@ -0,0 +1,135 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+    // the compute plan that needs to be prepared for ggml_graph_compute()
+    // since https://github.com/ggerganov/ggml/issues/287
+    struct ggml_cplan {
+        size_t    work_size; // size of the work buffer, calculated by `ggml_graph_plan()`
+        uint8_t * work_data; // work buffer, to be allocated by the caller before calling `ggml_graph_compute()` (required when work_size > 0)
+
+        int n_threads; // cf. the n_threads argument of `ggml_graph_plan()`
+        struct ggml_threadpool * threadpool; // cf. the threadpool argument of `ggml_graph_plan()`, which may be NULL
+
+        // abort ggml_graph_compute when true
+        ggml_abort_callback abort_callback;
+        void *              abort_callback_data; // NOTE(review): presumably handed back to abort_callback - confirm
+    };
+
+    // NUMA strategies (the argument of ggml_numa_init)
+    enum ggml_numa_strategy {
+        GGML_NUMA_STRATEGY_DISABLED   = 0,
+        GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
+        GGML_NUMA_STRATEGY_ISOLATE    = 2,
+        GGML_NUMA_STRATEGY_NUMACTL    = 3,
+        GGML_NUMA_STRATEGY_MIRROR     = 4,
+        GGML_NUMA_STRATEGY_COUNT // evaluates to 5, i.e. the number of strategies listed above
+    };
+
+    GGML_BACKEND_API void    ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
+    GGML_BACKEND_API bool    ggml_is_numa(void); // true if init detected that system has >1 NUMA node
+
+    GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
+    GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
+
+    GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
+    GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
+
+    GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
+    GGML_BACKEND_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
+
+    GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+    GGML_BACKEND_API void    ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
+
+    GGML_BACKEND_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
+    GGML_BACKEND_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
+
+    GGML_BACKEND_API float   ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+    GGML_BACKEND_API void    ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
+
+    GGML_BACKEND_API struct ggml_threadpool *      ggml_threadpool_new           (struct ggml_threadpool_params  * params);
+    GGML_BACKEND_API void                          ggml_threadpool_free          (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API int                           ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API void                          ggml_threadpool_pause         (struct ggml_threadpool * threadpool);
+    GGML_BACKEND_API void                          ggml_threadpool_resume        (struct ggml_threadpool * threadpool);
+
+    // ggml_graph_plan() has to be called before ggml_graph_compute()
+    // when plan.work_size > 0, caller must allocate memory for plan.work_data
+    GGML_BACKEND_API struct ggml_cplan ggml_graph_plan(
+                  const struct ggml_cgraph * cgraph,
+                                       int   n_threads, /* = GGML_DEFAULT_N_THREADS */
+                    struct ggml_threadpool * threadpool /* = NULL */ );
+    GGML_BACKEND_API enum ggml_status  ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+
+    // same as ggml_graph_compute() but the work data is allocated as a part of the context
+    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
+    GGML_BACKEND_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
+
+    //
+    // system info
+    //
+
+    // x86
+    GGML_BACKEND_API int ggml_cpu_has_sse3       (void);
+    GGML_BACKEND_API int ggml_cpu_has_ssse3      (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx        (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx_vnni   (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx2       (void);
+    GGML_BACKEND_API int ggml_cpu_has_f16c       (void);
+    GGML_BACKEND_API int ggml_cpu_has_fma        (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx512     (void);
+    GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
+    GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
+    GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void);
+    GGML_BACKEND_API int ggml_cpu_has_amx_int8   (void);
+    // ARM
+    GGML_BACKEND_API int ggml_cpu_has_neon       (void);
+    GGML_BACKEND_API int ggml_cpu_has_arm_fma    (void);
+    GGML_BACKEND_API int ggml_cpu_has_fp16_va    (void);
+    GGML_BACKEND_API int ggml_cpu_has_dotprod    (void);
+    GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
+    GGML_BACKEND_API int ggml_cpu_has_sve        (void);
+    GGML_BACKEND_API int ggml_cpu_get_sve_cnt    (void);  // sve vector length in bytes
+    // other
+    GGML_BACKEND_API int ggml_cpu_has_riscv_v    (void);
+    GGML_BACKEND_API int ggml_cpu_has_vsx        (void);
+    GGML_BACKEND_API int ggml_cpu_has_wasm_simd  (void);
+    GGML_BACKEND_API int ggml_cpu_has_llamafile  (void);
+
+    // Internal types and functions exposed for tests and benchmarks
+
+    typedef void (*ggml_vec_dot_t)  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
+                                       const void * GGML_RESTRICT y, size_t by, int nrc);
+
+    struct ggml_type_traits_cpu { // looked up per ggml_type with ggml_get_type_traits_cpu()
+        ggml_from_float_t        from_float;
+        ggml_vec_dot_t           vec_dot;
+        enum ggml_type           vec_dot_type; // NOTE(review): presumably the type expected for the other vec_dot operand - confirm
+        int64_t                  nrows; // number of rows to process simultaneously
+    };
+
+    GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
+
+    GGML_BACKEND_API void ggml_cpu_init(void);
+
+    //
+    // CPU backend
+    //
+
+    GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);
+
+    GGML_BACKEND_API bool ggml_backend_is_cpu                (ggml_backend_t backend);
+    GGML_BACKEND_API void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
+    GGML_BACKEND_API void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
+    GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
+
+    GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
--- a/vall_e.cpp/include/ggml-cuda.h
+++ b/vall_e.cpp/include/ggml-cuda.h
@ -0,0 +1,47 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#ifdef GGML_USE_HIP
+#define GGML_CUDA_NAME "ROCm"
+#define GGML_CUBLAS_NAME "hipBLAS"
+#elif defined(GGML_USE_MUSA)
+#define GGML_CUDA_NAME "MUSA"
+#define GGML_CUBLAS_NAME "muBLAS"
+#else
+#define GGML_CUDA_NAME "CUDA"
+#define GGML_CUBLAS_NAME "cuBLAS"
+#endif
+#define GGML_CUDA_MAX_DEVICES       16
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_cuda_init(int device);
+
+GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend);
+
+// device buffer
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+
+// split tensor buffer that splits matrices by rows across multiple devices
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
+
+// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
+
+GGML_BACKEND_API int  ggml_backend_cuda_get_device_count(void);
+GGML_BACKEND_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
+GGML_BACKEND_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
+
+GGML_BACKEND_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
+GGML_BACKEND_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/vall_e.cpp/include/ggml-kompute.h
+++ b/vall_e.cpp/include/ggml-kompute.h
@ -0,0 +1,50 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define GGML_KOMPUTE_MAX_DEVICES 16
+
+struct ggml_vk_device {
+    int index;
+    int type; // same as VkPhysicalDeviceType
+    size_t heapSize;
+    const char * name;
+    const char * vendor;
+    int subgroupSize;
+    uint64_t bufferAlignment;
+    uint64_t maxAlloc;
+};
+
+struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
+bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
+bool ggml_vk_has_vulkan(void);
+bool ggml_vk_has_device(void);
+struct ggml_vk_device ggml_vk_current_device(void);
+
+//
+// backend API
+//
+
+// forward declaration
+typedef struct ggml_backend * ggml_backend_t;
+
+GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device);
+
+GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend);
+
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
--- a/vall_e.cpp/include/ggml-metal.h
+++ b/vall_e.cpp/include/ggml-metal.h
@ -0,0 +1,66 @@
+// Note: this description is outdated
+//
+// An interface allowing to compute ggml_cgraph with Metal
+//
+// This is a fully functional interface that extends ggml with GPU support for Apple devices.
+// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, etc.)
+//
+// How does it work?
+//
+// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
+// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
+// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
+//
+// You only need to make sure that all memory buffers that you used during the graph creation
+// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
+// used during the graph evaluation to determine the arguments of the compute kernels.
+//
+// Synchronization between device and host memory (for example for input and output tensors)
+// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
+//
+
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#include <stddef.h>
+#include <stdbool.h>
+
+struct ggml_tensor;
+struct ggml_cgraph;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//
+// backend API
+// user-code should use only these functions
+//
+
+GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
+
+GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
+
+GGML_DEPRECATED(
+        GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
+        "obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713");
+
+GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
+
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+
+// helper to check if the device supports a specific family
+// ideally, the user code should be doing these checks
+// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
+GGML_BACKEND_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
+
+// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
+GGML_BACKEND_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_metal_reg(void);
+
+#ifdef __cplusplus
+}
+#endif
--- a/vall_e.cpp/include/ggml-opencl.h
+++ b/vall_e.cpp/include/ggml-opencl.h
@ -0,0 +1,26 @@
+#ifndef GGML_OPENCL_H
+#define GGML_OPENCL_H
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+//
+// backend API
+//
+GGML_BACKEND_API ggml_backend_t ggml_backend_opencl_init(void);
+GGML_BACKEND_API bool ggml_backend_is_opencl(ggml_backend_t backend);
+
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_opencl_reg(void);
+
+#ifdef  __cplusplus
+}
+#endif
+
+#endif // GGML_OPENCL_H
--- a/vall_e.cpp/include/ggml-opt.h
+++ b/vall_e.cpp/include/ggml-opt.h
@ -0,0 +1,216 @@
+// This file contains functionality for training models using GGML.
+// It is not strictly needed vs. just vanilla GGML but it provides a more high-level interface for common needs such as datasets.
+// At the bottom of this file especially there are relatively high-level functions that are suitable for use or adaptation in user code.
+//
+// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de)
+
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#include <stdint.h>
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+    struct ggml_opt_dataset;
+    struct ggml_opt_context;
+    struct ggml_opt_result;
+
+    typedef struct ggml_opt_dataset * ggml_opt_dataset_t;
+    typedef struct ggml_opt_context * ggml_opt_context_t;
+    typedef struct ggml_opt_result  * ggml_opt_result_t;
+
+    // ====== Loss ======
+
+    // built-in loss types, i.e. the built-in quantities minimized by the optimizer
+    // custom loss types can be defined via mean or sum which simply reduce the outputs for all datapoints to a single value
+    enum ggml_opt_loss_type {
+        GGML_OPT_LOSS_TYPE_MEAN,
+        GGML_OPT_LOSS_TYPE_SUM,
+        GGML_OPT_LOSS_TYPE_CROSS_ENTROPY,
+        GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR,
+    };
+
+    // ====== Dataset ======
+
+    GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
+            int64_t ne_datapoint, // number of elements per datapoint
+            int64_t ne_label,     // number of elements per label
+            int64_t ndata,        // total number of datapoints/labels
+            int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
+    GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);
+
+    // get underlying tensors that store the data
+    GGML_API struct ggml_tensor * ggml_opt_dataset_data  (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
+    GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [ne_label,     ndata]
+
+    // shuffle idata first datapoints from dataset with RNG from opt_ctx, shuffle all datapoints if idata is negative
+    GGML_API void ggml_opt_dataset_shuffle(ggml_opt_context_t opt_ctx, ggml_opt_dataset_t dataset, int64_t idata);
+
+    // get batch at position ibatch from dataset and copy the data to data_batch and labels_batch
+    GGML_API void ggml_opt_dataset_get_batch(
+            ggml_opt_dataset_t   dataset,
+            struct ggml_tensor * data_batch,   // shape = [ne_datapoint, ndata_batch]
+            struct ggml_tensor * labels_batch, // shape = [ne_label,     ndata_batch]
+            int64_t              ibatch);
+
+    // ====== Model / Context ======
+
+    enum ggml_opt_build_type {
+        GGML_OPT_BUILD_TYPE_FORWARD,
+        GGML_OPT_BUILD_TYPE_GRAD,
+        GGML_OPT_BUILD_TYPE_OPT,
+    };
+
+    // parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
+    struct ggml_opt_optimizer_params {
+        // AdamW optimizer parameters
+        struct {
+            float alpha; // learning rate
+            float beta1;
+            float beta2;
+            float eps;   // epsilon for numerical stability
+            float wd;    // weight decay for AdamW, use 0.0f to disable
+        } adamw;
+    };
+
+    // callback to calculate optimizer parameters prior to a backward pass
+    // userdata can be used to pass arbitrary data
+    typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
+
+    // returns the default optimizer params (constant)
+    // userdata is not used
+    GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);
+
+    // parameters for initializing a new optimization context
+    struct ggml_opt_params {
+        ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs
+
+        struct ggml_context * ctx_compute; // created in user code, holds non-static tensors
+
+        // the forward graph is defined by inputs and outputs
+        // those tensors and all tensors in between are not intended to be reusable between multiple optimization contexts
+        struct ggml_tensor * inputs;
+        struct ggml_tensor * outputs;
+
+        enum ggml_opt_loss_type  loss_type;
+        enum ggml_opt_build_type build_type;
+
+        int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done
+
+        ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
+        void * get_opt_pars_ud;                     // userdata for calculating optimizer parameters
+    };
+
+    // get parameters for an optimization context with defaults set where possible
+    // parameters for which no sensible defaults exist are supplied as arguments to this function
+    GGML_API struct ggml_opt_params ggml_opt_default_params( // struct tag spelled out so the header also parses as C
+            ggml_backend_sched_t      backend_sched,
+            struct ggml_context     * ctx_compute,
+            struct ggml_tensor      * inputs,
+            struct ggml_tensor      * outputs,
+            enum ggml_opt_loss_type   loss_type);
+
+    GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
+    GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
+
+    // set gradients to zero, initialize loss, and optionally reset the optimizer
+    GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
+
+    // get underlying tensors that store data
+    GGML_API struct ggml_tensor * ggml_opt_inputs(  ggml_opt_context_t opt_ctx); // forward graph input tensor
+    GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
+    GGML_API struct ggml_tensor * ggml_opt_labels(  ggml_opt_context_t opt_ctx); // labels to compare outputs against
+    GGML_API struct ggml_tensor * ggml_opt_loss(    ggml_opt_context_t opt_ctx); // scalar tensor that contains the loss
+    GGML_API struct ggml_tensor * ggml_opt_pred(    ggml_opt_context_t opt_ctx); // predictions made by outputs
+    GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels
+
+    GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
+
+    // ====== Optimization Result ======
+
+    GGML_API ggml_opt_result_t ggml_opt_result_init(void); // (void): an empty list is not a prototype in C
+    GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
+    GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);
+
+    // get data from result, uncertainties are optional and can be ignored by passing NULL
+    GGML_API void ggml_opt_result_ndata(   ggml_opt_result_t result, int64_t * ndata);                  // writes 1 value, number of datapoints
+    GGML_API void ggml_opt_result_loss(    ggml_opt_result_t result, double  * loss,     double * unc); // writes 1 value
+    GGML_API void ggml_opt_result_pred(    ggml_opt_result_t result, int32_t * pred);                   // writes ndata values
+    GGML_API void ggml_opt_result_accuracy(ggml_opt_result_t result, double  * accuracy, double * unc); // writes 1 value
+
+    // ====== Computation ======
+
+    // do forward pass, increment result if not NULL
+    GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
+
+    // do forward pass, increment result if not NULL, do backward pass
+    GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
+
+    // ############################################################################
+    // ## The high-level functions start here. They do not depend on any private ##
+    // ## functions or structs and can be copied to and adapted for user code.   ##
+    // ############################################################################
+
+    // ====== Intended Usage ======
+    //
+    // 1. Select the appropriate loss for your problem.
+    // 2. Create a dataset and set the data for the "data" tensor. Also set the "labels" tensor if your loss needs them.
+    //    Setting the shard size to 1 will be fine, it's the granularity with which data is shuffled/loaded (bigger values are faster).
+    // 3. Create a GGML graph for your model with no_alloc == true. Use two separate contexts for the tensors.
+    //    The first context should contain the model parameters and inputs and be allocated statically in user code.
+    //    The second context should contain all other tensors and will be (re)allocated automatically.
+    //    Due to this automated allocation the data of the second context is not defined when accessed in user code.
+    //    Note that the second dimension of the inputs/outputs are interpreted as the number of datapoints in those tensors.
+    // 4. Call ggml_opt_fit. If you need more control you can use ggml_opt_epoch instead.
+
+    // signature for a callback while evaluating opt_ctx on dataset, called after an evaluation
+    typedef void (*ggml_opt_epoch_callback)(
+            bool               train,       // true after training evaluation, false after validation evaluation
+            ggml_opt_context_t opt_ctx,
+            ggml_opt_dataset_t dataset,
+            ggml_opt_result_t  result,      // result associated with the dataset subsection
+            int64_t            ibatch,      // number of batches that have been evaluated so far
+            int64_t            ibatch_max,  // total number of batches in this dataset subsection
+            int64_t            t_start_us); // time at which the evaluation on the dataset subsection was started
+
+    // do training on front of dataset, do evaluation only on back of dataset
+    GGML_API void ggml_opt_epoch(
+            ggml_opt_context_t      opt_ctx,
+            ggml_opt_dataset_t      dataset,
+            ggml_opt_result_t       result_train,   // result to increment during training, ignored if NULL
+            ggml_opt_result_t       result_eval,    // result to increment during evaluation, ignored if NULL
+            int64_t                 idata_split,    // data index at which to split training and evaluation
+            ggml_opt_epoch_callback callback_train,
+            ggml_opt_epoch_callback callback_eval);
+
+    // callback that prints a progress bar on stderr
+    GGML_API void ggml_opt_epoch_callback_progress_bar(
+            bool               train,
+            ggml_opt_context_t opt_ctx,
+            ggml_opt_dataset_t dataset,
+            ggml_opt_result_t  result,
+            int64_t            ibatch,
+            int64_t            ibatch_max,
+            int64_t            t_start_us);
+
+    // fit model defined by inputs and outputs to dataset (struct tags spelled out so the header also parses as C)
+    GGML_API void ggml_opt_fit(
+            ggml_backend_sched_t            backend_sched,  // backend scheduler for constructing the compute graphs
+            struct ggml_context           * ctx_compute,    // context with temporarily allocated tensors to calculate the outputs
+            struct ggml_tensor            * inputs,         // input tensor with shape [ne_datapoint, ndata_batch]
+            struct ggml_tensor            * outputs,        // output tensor, must have shape [ne_label, ndata_batch] if labels are used
+            ggml_opt_dataset_t              dataset,        // dataset with data and optionally also labels
+            enum ggml_opt_loss_type         loss_type,      // loss to minimize
+            ggml_opt_get_optimizer_params   get_opt_pars,   // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
+            int64_t                         nepoch,         // how many times the dataset should be iterated over
+            int64_t                         nbatch_logical, // datapoints per optimizer step, must be a multiple of ndata_batch in inputs/outputs
+            float                           val_split,      // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
+            bool                            silent);        // whether or not info prints to stderr should be suppressed
+
+#ifdef  __cplusplus
+}
+#endif
--- a/vall_e.cpp/include/ggml-rpc.h
+++ b/vall_e.cpp/include/ggml-rpc.h
@ -0,0 +1,28 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#define GGML_RPC_MAX_SERVERS       16
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
+GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend);
+
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
+
+GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
+
+GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
+
+GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/vall_e.cpp/include/ggml-sycl.h
+++ b/vall_e.cpp/include/ggml-sycl.h
@ -0,0 +1,49 @@
+//
+//  MIT license
+//  Copyright (C) 2024 Intel Corporation
+//  SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#define GGML_SYCL_NAME "SYCL"
+#define GGML_SYCL_MAX_DEVICES 48
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_sycl_init(int device);
+
+GGML_BACKEND_API bool ggml_backend_is_sycl(ggml_backend_t backend);
+
+// device buffer
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
+
+// split tensor buffer that splits matrices by rows across multiple devices
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
+
+// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
+
+GGML_BACKEND_API void ggml_backend_sycl_print_sycl_devices(void);
+GGML_BACKEND_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len);
+GGML_BACKEND_API void ggml_backend_sycl_get_device_description(int device,
+                                                       char *description,
+                                                       size_t description_size);
+GGML_BACKEND_API int  ggml_backend_sycl_get_device_count(void); // (void): an empty list is not a prototype in C
+GGML_BACKEND_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
+
+// SYCL doesn't support registering host memory, keep here for reference
+// GGML_BACKEND_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
+// GGML_BACKEND_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_sycl_reg(void);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/vall_e.cpp/include/ggml-vulkan.h
+++ b/vall_e.cpp/include/ggml-vulkan.h
@ -0,0 +1,31 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#define GGML_VK_NAME "Vulkan"
+#define GGML_VK_MAX_DEVICES 16
+
+GGML_BACKEND_API void ggml_vk_instance_init(void);
+
+// backend API
+GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
+
+GGML_BACKEND_API bool ggml_backend_is_vk(ggml_backend_t backend);
+GGML_BACKEND_API int  ggml_backend_vk_get_device_count(void);
+GGML_BACKEND_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
+GGML_BACKEND_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
+
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
+// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
+GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
+
+GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/vall_e.cpp/include/ggml.h
+++ b/vall_e.cpp/include/ggml.h
--- a/vall_e.cpp/include/llama-cpp.h
+++ b/vall_e.cpp/include/llama-cpp.h
@ -0,0 +1,25 @@
+#pragma once
+
+#ifndef __cplusplus
+#error "This header is for C++ only"
+#endif
+
+#include <memory>
+
+#include "llama.h"
+
+struct llama_model_deleter {
+    void operator()(llama_model * ptr) { llama_free_model(ptr); } // releases the model through llama_free_model
+};
+
+struct llama_context_deleter {
+    void operator()(llama_context * ptr) { llama_free(ptr); } // releases the context through llama_free
+};
+
+struct llama_sampler_deleter {
+    void operator()(llama_sampler * ptr) { llama_sampler_free(ptr); } // releases the sampler through llama_sampler_free
+};
+
+using llama_model_ptr   = std::unique_ptr<llama_model,   llama_model_deleter>;
+using llama_context_ptr = std::unique_ptr<llama_context, llama_context_deleter>;
+using llama_sampler_ptr = std::unique_ptr<llama_sampler, llama_sampler_deleter>;
--- a/vall_e.cpp/include/llama-impl.h
+++ b/vall_e.cpp/include/llama-impl.h
@ -0,0 +1,181 @@
+#pragma once
+
+#include "llama.h"
+
+#include <string>
+#include <vector>
+#include <stdexcept>
+
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_ATTRIBUTE_FORMAT(...)
+#endif
+
+//
+// logging
+//
+
+LLAMA_ATTRIBUTE_FORMAT(2, 3)
+void llama_log_internal        (ggml_log_level level, const char * format, ...);
+void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
+
+#define LLAMA_LOG(...)       llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
+#define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
+#define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
+#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
+#define LLAMA_LOG_CONT(...)  llama_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
+
+//
+// helpers
+//
+
+struct time_meas {
+    time_meas(int64_t & t_acc, bool disable = false)
+        : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
+    ~time_meas() {
+        if (t_start_us < 0) {
+            return; // measurement was disabled at construction
+        }
+        t_acc += ggml_time_us() - t_start_us; // add the elapsed microseconds to the caller's accumulator
+    }
+
+    const int64_t t_start_us; // start timestamp from ggml_time_us(), or -1 when disabled
+    int64_t & t_acc;          // accumulator updated by the destructor
+};
+
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    const size_t needle_len = search.length();
+    if (needle_len == 0) {
+        return; // nothing to search for; leave s untouched
+    }
+    std::string result;
+    result.reserve(s.length());
+    size_t cursor = 0; // start of the text not yet copied into result
+    for (size_t hit = s.find(search); hit != std::string::npos; hit = s.find(search, cursor)) {
+        result.append(s, cursor, hit - cursor); // text between the previous match and this one
+        result.append(replace);
+        cursor = hit + needle_len;              // resume right after the match (non-overlapping)
+    }
+    result.append(s, cursor, std::string::npos); // tail after the last match
+    s = std::move(result);
+}
+
+const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
+    struct llama_context * ctx
+);
+
+// the ring buffer works similarly to std::deque, but with a fixed capacity (pushing into a full buffer overwrites the oldest element)
+template<typename T>
+struct ring_buffer {
+    ring_buffer(size_t cap) : capacity(cap), data(cap) {}
+
+    T & front() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[first];
+    }
+
+    const T & front() const {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[first];
+    }
+
+    T & back() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[(first + sz - 1) % capacity]; // newest element; `pos` is the NEXT write slot, not the last written one
+    }
+
+    const T & back() const {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        return data[(first + sz - 1) % capacity]; // same slot as rat(0)
+    }
+
+    void push_back(const T & value) {
+        if (capacity == 0) {
+            throw std::runtime_error("ring buffer: capacity is zero");
+        }
+
+        if (sz == capacity) {
+            // advance the start when buffer is full
+            first = (first + 1) % capacity;
+        } else {
+            sz++;
+        }
+        data[pos] = value;
+        pos = (pos + 1) % capacity;
+    }
+
+    T pop_front() {
+        if (sz == 0) {
+            throw std::runtime_error("ring buffer is empty");
+        }
+        T value = data[first];
+        first = (first + 1) % capacity;
+        sz--;
+        return value;
+    }
+
+    //T & operator[](size_t i) {
+    //    if (i >= sz) {
+    //        throw std::runtime_error("ring buffer: index out of bounds");
+    //    }
+    //    return data[(first + i) % capacity];
+    //}
+
+    //const T & at(size_t i) const {
+    //    if (i >= sz) {
+    //        throw std::runtime_error("ring buffer: index out of bounds");
+    //    }
+    //    return data[(first + i) % capacity];
+    //}
+
+    const T & rat(size_t i) const { // reverse access: rat(0) is the newest element, rat(sz - 1) the oldest
+        if (i >= sz) {
+            throw std::runtime_error("ring buffer: index out of bounds");
+        }
+        return data[(first + sz - i - 1) % capacity];
+    }
+
+    std::vector<T> to_vector() const { // copies the stored elements, oldest first
+        std::vector<T> result;
+        result.reserve(sz);
+        for (size_t i = 0; i < sz; i++) {
+            result.push_back(data[(first + i) % capacity]);
+        }
+        return result;
+    }
+
+    void clear() {
+        // here only reset the status of the buffer
+        sz = 0;
+        first = 0;
+        pos = 0;
+    }
+
+    bool empty() const {
+        return sz == 0;
+    }
+
+    size_t size() const {
+        return sz;
+    }
+
+    size_t capacity = 0; // fixed number of slots
+    size_t sz = 0;       // number of elements currently stored
+    size_t first = 0;    // index of the oldest element
+    size_t pos = 0;      // index of the next slot to be written
+    std::vector<T> data;
+};
--- a/vall_e.cpp/include/llama-vocab.h
+++ b/vall_e.cpp/include/llama-vocab.h
@ -0,0 +1,170 @@
+#pragma once
+
+#include "llama-impl.h"
+
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <map>
+#include <set>
+
+struct llm_tokenizer;
+
+struct llama_vocab { // vocabulary state: token tables, special-token ids and tokenizer flags
+    using id    = llama_token;
+    using token = std::string;
+    using tattr = llama_token_attr;
+
+    struct token_data { // per-token record: text, score and attribute
+        token text;
+        float score;
+        tattr attr;
+    };
+
+    uint32_t n_vocab = 0; // TODO: not great because has to keep in sync with hparams.n_vocab
+
+    enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
+    enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+
+    int max_token_len = 0; // used for optimizing longest token search
+
+    std::unordered_map<token, id> token_to_id; // token text -> token id
+    std::vector<token_data>       id_to_token; // NOTE(review): presumably indexed by token id - confirm
+
+    std::vector<id>    cache_special_tokens;
+    std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
+
+    std::map<std::pair<std::string, std::string>, int> bpe_ranks; // NOTE(review): presumably the table queried by find_bpe_rank() - confirm
+
+    // default LLaMA special tokens
+    // TODO: should we set all of these to LLAMA_TOKEN_NULL?
+    id special_bos_id  = 1;
+    id special_eos_id  = 2;
+    id special_eot_id  = LLAMA_TOKEN_NULL;
+    id special_eom_id  = LLAMA_TOKEN_NULL;
+    id special_unk_id  = 0;
+    id special_sep_id  = LLAMA_TOKEN_NULL;
+    id special_pad_id  = LLAMA_TOKEN_NULL;
+    id special_cls_id  = LLAMA_TOKEN_NULL;
+    id special_mask_id = LLAMA_TOKEN_NULL;
+
+    id linefeed_id = 13;
+
+    // fim tokens
+    id special_fim_pre_id = LLAMA_TOKEN_NULL;
+    id special_fim_suf_id = LLAMA_TOKEN_NULL;
+    id special_fim_mid_id = LLAMA_TOKEN_NULL;
+    id special_fim_pad_id = LLAMA_TOKEN_NULL;
+    id special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
+    id special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
+
+    // set of all tokens that cause "end of generation"
+    std::set<id> special_eog_ids;
+
+    // tokenizer flags
+    bool tokenizer_add_space_prefix           = false;
+    bool tokenizer_add_bos                    = false;
+    bool tokenizer_add_eos                    = false;
+    bool tokenizer_ignore_merges              = false;
+    bool tokenizer_clean_spaces               = false;  // clean_up_tokenization_spaces
+    bool tokenizer_remove_extra_whitespaces   = false;
+    bool tokenizer_escape_whitespaces         = true;
+    bool tokenizer_treat_whitespace_as_suffix = false;
+
+    std::vector<char> precompiled_charsmap;
+
+    llm_tokenizer * tokenizer = nullptr; // NOTE(review): presumably created by init_tokenizer() and released by ~llama_vocab() - confirm
+
+    llama_vocab() = default;
+    ~llama_vocab(); // declared here, defined out of line
+
+    int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
+
+    void init_tokenizer();
+};
+
+//
+// internal API
+//
+
+// TODO: rename to llama_tokenize_impl
+// TODO: This should probably be in llama.h
+std::vector<llama_vocab::id> llama_tokenize_internal(
+        const llama_vocab & vocab,
+        std::string raw_text,
+        bool add_special,
+        bool parse_special = false);
+
+// TODO: move the API below as member functions of llama_vocab
+llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch);
+
+const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token);
+
+float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token);
+
+llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token);
+
+bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token);
+
+bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token);
+
+llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
+llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
+llama_token llama_token_eot_impl(const struct llama_vocab & vocab);
+llama_token llama_token_eom_impl(const struct llama_vocab & vocab);
+llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
+llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
+llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
+llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
+
+llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
+llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
+llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
+
+llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab);
+llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab);
+llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab);
+llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab);
+llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab);
+llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab);
+
+bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
+bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
+
+int32_t llama_tokenize_impl(
+        const struct llama_vocab & vocab,
+                      const char * text,
+                         int32_t   text_len,
+                     llama_token * tokens,
+                         int32_t   n_tokens_max,
+                            bool   add_special,
+                            bool   parse_special);
+
+// does not write null-terminator to buf
+int32_t llama_token_to_piece_impl(
+        const struct llama_vocab & vocab,
+                     llama_token   token,
+                            char * buf,
+                         int32_t   length,
+                         int32_t   lstrip,
+                            bool   special);
+
+// check if token0 is contained as a prefix in token1
+bool llama_token_is_prefix_impl(
+        const struct llama_vocab & vocab,
+                     llama_token   token0,
+                     llama_token   token1);
+
+int32_t llama_detokenize_impl(
+        const struct llama_vocab & vocab,
+               const llama_token * tokens,
+                         int32_t   n_tokens,
+                            char * text,
+                         int32_t   text_len_max,
+                            bool   remove_special,
+                            bool   unparse_special);
+
+std::string llama_detokenize(
+        const struct llama_vocab & vocab,
+  const std::vector<llama_token> & tokens,
+                            bool   special);
--- a/vall_e.cpp/include/llama.h
+++ b/vall_e.cpp/include/llama.h
--- a/vall_e.cpp/include/llama.modified.h
+++ b/vall_e.cpp/include/llama.modified.h
--- a/vall_e.cpp/include/llama.vanilla.h
+++ b/vall_e.cpp/include/llama.vanilla.h
--- a/vall_e.cpp/include/lstm.h
+++ b/vall_e.cpp/include/lstm.h
@ -0,0 +1,78 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+
+#include "ops.h"
+
+struct encodec_lstm { // tensor handles for a two-layer LSTM (l0_* and l1_*)
+    struct ggml_tensor *l0_ih_w; // NOTE(review): names suggest input-to-hidden (ih) / hidden-to-hidden (hh) weights - confirm against the loader
+    struct ggml_tensor *l0_hh_w;
+
+    struct ggml_tensor *l0_ih_b; // NOTE(review): presumably the matching biases - confirm
+    struct ggml_tensor *l0_hh_b;
+
+    struct ggml_tensor *l1_ih_w;
+    struct ggml_tensor *l1_hh_w;
+
+    struct ggml_tensor *l1_ih_b;
+    struct ggml_tensor *l1_hh_b;
+};
+
+static inline struct ggml_tensor *forward_pass_lstm_unilayer(struct ggml_context *ctx0, // internal linkage: safe to include from several translation units
+                                                             struct ggml_tensor  *inp,
+                                                             struct ggml_tensor  *weight_ih,
+                                                             struct ggml_tensor  *weight_hh,
+                                                             struct ggml_tensor  *bias_ih,
+                                                             struct ggml_tensor  *bias_hh,
+                                                             const char          *prefix) { // builds the graph of one LSTM layer; returns the per-step hidden states
+    const int seq_length = inp->ne[0];
+    const int input_dim  = inp->ne[1];
+    const int hidden_dim = weight_ih->ne[1] / 4; // the i, f, g and o gates are stacked along this axis
+
+    char ct_name[GGML_MAX_NAME]; // "<prefix>_ct": name given to the initial cell-state tensor
+    char ht_name[GGML_MAX_NAME]; // "<prefix>_ht": name given to the initial hidden-state tensor
+
+    snprintf(ct_name, sizeof(ct_name), "%s_ct", prefix); // sized buffers: a long prefix no longer collapses both names into one
+    snprintf(ht_name, sizeof(ht_name), "%s_ht", prefix);
+
+    struct ggml_tensor *hs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hidden_dim, seq_length); // one hidden vector per time step
+    ggml_set_input(hs);
+
+    struct ggml_tensor *c_t = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hidden_dim); // cell state; its initial contents are not set here
+    ggml_set_input(c_t);
+    ggml_set_name(c_t, ct_name);
+
+    struct ggml_tensor *h_t = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, hidden_dim); // hidden state; its initial contents are not set here
+    ggml_set_input(h_t);
+    ggml_set_name(h_t, ht_name);
+
+    struct ggml_tensor *current = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); // contiguous transpose so each time step is one row
+
+    for (int t = 0; t < seq_length; t++) {
+        struct ggml_tensor *x_t = ggml_view_1d(ctx0, current, input_dim, t * current->nb[1]); // input vector of step t
+
+        struct ggml_tensor *inp_gates = ggml_mul_mat(ctx0, weight_ih, x_t);
+        inp_gates = ggml_add(ctx0, inp_gates, bias_ih);
+
+        struct ggml_tensor *hid_gates = ggml_mul_mat(ctx0, weight_hh, h_t);
+        hid_gates = ggml_add(ctx0, hid_gates, bias_hh);
+
+        struct ggml_tensor *out_gates = ggml_add(ctx0, inp_gates, hid_gates); // pre-activations of all four gates
+
+        struct ggml_tensor *i_t = ggml_sigmoid(ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 0 * sizeof(float) * hidden_dim));
+        struct ggml_tensor *f_t = ggml_sigmoid(ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 1 * sizeof(float) * hidden_dim));
+        struct ggml_tensor *g_t = ggml_tanh(ctx0   , ggml_view_1d(ctx0, out_gates, hidden_dim, 2 * sizeof(float) * hidden_dim));
+        struct ggml_tensor *o_t = ggml_sigmoid(ctx0, ggml_view_1d(ctx0, out_gates, hidden_dim, 3 * sizeof(float) * hidden_dim));
+
+        c_t = ggml_add(ctx0, ggml_mul(ctx0, f_t, c_t), ggml_mul(ctx0, i_t, g_t)); // c_t = f * c_{t-1} + i * g
+
+        h_t = ggml_mul(ctx0, o_t, ggml_tanh(ctx0, c_t)); // h_t = o * tanh(c_t)
+
+        hs = ggml_set_1d(ctx0, hs, h_t, t * hs->nb[1]); // store h_t as row t of hs
+    }
+
+    hs = ggml_cont(ctx0, ggml_transpose(ctx0, hs)); // back to the layout of inp: ne[0] = seq_length
+
+    return hs;
+}
--- a/vall_e.cpp/include/ops.h
+++ b/vall_e.cpp/include/ops.h
@ -0,0 +1,17 @@
+#pragma once
+
+#include "ggml.h"
+
+// Graph-building helpers shared by the encoder/decoder code. Each one takes the
+// ggml context to allocate nodes in plus an input tensor and returns a tensor.
+// Only the prototypes live here; the notes below are read off the names and
+// parameters and should be checked against the definitions.
+
+// Presumably pads `inp` with `padding_left` / `padding_right` elements along its
+// 1-D (time) axis — TODO confirm which axis and which fill mode against the definition.
+struct ggml_tensor *pad_1d(struct ggml_context *ctx0, struct ggml_tensor *inp,
+                           int padding_left, int padding_right);
+
+// Presumably the inverse of pad_1d: trims `padding_left` / `padding_right`
+// elements from the ends of `inp` — TODO confirm.
+struct ggml_tensor *unpad_1d(struct ggml_context *ctx0, struct ggml_tensor *inp,
+                             int padding_left, int padding_right);
+
+// 1-D convolution of `inp` with kernel `conv_w`, bias `conv_b` and the given `stride`.
+// NOTE(review): any implicit padding is decided by the definition, not visible here.
+struct ggml_tensor *strided_conv_1d(struct ggml_context *ctx0, struct ggml_tensor *inp,
+                                    struct ggml_tensor *conv_w, struct ggml_tensor *conv_b,
+                                    int stride);
+
+// Transposed (upsampling) counterpart of strided_conv_1d, taking the same
+// kernel / bias / stride arguments — presumably used by the decoder; confirm.
+struct ggml_tensor *strided_conv_transpose_1d(struct ggml_context *ctx0, struct ggml_tensor *inp,
+                                              struct ggml_tensor *conv_w, struct ggml_tensor *conv_b,
+                                              int stride);
--- a/vall_e.cpp/include/quantizer.h
+++ b/vall_e.cpp/include/quantizer.h
@ -0,0 +1,111 @@
+#pragma once
+
+#include <cassert>
+#include <vector>
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+#include "ggml-backend.h"
+
+#include "utils.h"
+
+// One stage of the residual vector quantizer.
+struct encodec_quant_block {
+    // Codebook for this stage: the quantizer multiplies it against the residual
+    // to score entries and looks rows up in it with ggml_get_rows().
+    struct ggml_tensor *embed;
+};
+
+// The full residual vector quantizer: an ordered list of codebook stages.
+struct encodec_quantizer {
+    // blocks[i] is the codebook used for quantizer index i; encode/decode walk
+    // the first n_q entries in order.
+    std::vector<encodec_quant_block> blocks;
+};
+
+// Builds the graph that residual-vector-quantizes `encoded_inp` into codebook indices.
+//
+// For each of the n_q codebooks in use, every frame of the current residual is
+// matched to its nearest codebook row (smallest squared Euclidean distance), the
+// chosen indices are written as row i of the result, and the chosen rows are
+// subtracted from the residual before the next codebook is applied.
+//
+// quantizer   - codebooks; at least n_q entries must be loaded
+// ctx0        - ggml context the graph nodes are created in
+// encoded_inp - encoder output; ne[0] is taken as the sequence length
+// n_bins, sr, bandwidth, hop_length - only used to derive n_q
+//
+// Returns an I32 tensor created as (seq_length, n_q), or NULL (after logging to
+// stderr) when the input is null or not enough codebooks are loaded.
+struct ggml_tensor *encodec_forward_quantizer_encode(
+    const struct encodec_quantizer *quantizer, struct ggml_context *ctx0,
+    struct ggml_tensor *encoded_inp, const int n_bins, const int sr, const int bandwidth,
+    const int hop_length) {
+
+    if (!encoded_inp) {
+        fprintf(stderr, "%s: null input tensor\n", __func__);
+        return NULL;
+    }
+
+    // divide in floating point: `sr / hop_length` on two ints truncates before
+    // ceilf() ever sees the value, so the ceil could never round up
+    const int frame_rate = (int)ceilf((float)sr / hop_length);
+    const int n_q = get_num_quantizers_for_bandwidth(n_bins, frame_rate, bandwidth);
+
+    // the loop below indexes blocks[0..n_q), so refuse to run past what was loaded
+    if (n_q > (int)quantizer->blocks.size()) {
+        fprintf(stderr, "%s: %d codebooks requested but only %d are loaded\n",
+                __func__, n_q, (int)quantizer->blocks.size());
+        return NULL;
+    }
+
+    const int seq_length = encoded_inp->ne[0];
+
+    struct ggml_tensor *codes = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, seq_length, n_q);
+    ggml_set_input(codes);
+
+    struct ggml_tensor *inpL = ggml_cont(ctx0, ggml_transpose(ctx0, encoded_inp));
+    struct ggml_tensor *residual = inpL;
+
+    for (int i = 0; i < n_q; i++) {
+        const encodec_quant_block &block = quantizer->blocks[i];
+
+        // compute distance as |x|^2 - 2 x.e + |e|^2
+        // [seq_length, n_bins]
+        struct ggml_tensor *dp = ggml_scale(
+            ctx0, ggml_mul_mat(ctx0, block.embed, residual), -2.0f);
+
+        // [n_bins]
+        struct ggml_tensor *sqr_embed = ggml_sqr(ctx0, block.embed);
+        struct ggml_tensor *sqr_embed_nrm = ggml_sum_rows(ctx0, sqr_embed);
+
+        // [seq_length]
+        struct ggml_tensor *sqr_inp = ggml_sqr(ctx0, residual);
+        struct ggml_tensor *sqr_inp_nrm = ggml_sum_rows(ctx0, sqr_inp);
+
+        // [seq_length, n_bins]
+        struct ggml_tensor *dist = ggml_add(ctx0, ggml_repeat(ctx0, sqr_inp_nrm, dp), dp);
+        dist = ggml_add(ctx0, ggml_repeat(ctx0, ggml_transpose(ctx0, sqr_embed_nrm), dist), dist);
+        // negate so that the smallest distance becomes the argmax
+        dist = ggml_neg(ctx0, dist);
+
+        // take the argmax over the column dimension
+        // [seq_length]
+        struct ggml_tensor *indices = ggml_argmax(ctx0, dist);
+
+        // look up in embedding table
+        struct ggml_tensor *quantized = ggml_get_rows(ctx0, block.embed, indices);
+
+        // the next codebook only has to explain what this one missed
+        residual = ggml_sub(ctx0, residual, quantized);
+
+        // store this codebook's indices as row i of the output
+        codes = ggml_set_1d(ctx0, codes, indices, i * codes->nb[1]);
+    }
+
+    return codes;
+}
+
+// Builds the graph that turns codebook indices back into a latent sequence.
+//
+// Row i of `codes` selects rows from codebook i; the selected rows of all n_q
+// codebooks are summed into one (hidden_dim, seq_length) tensor, which is then
+// transposed and made contiguous.
+//
+// quantizer  - codebooks; at least n_q entries must be loaded
+// ctx0       - ggml context the graph nodes are created in
+// codes      - index tensor; ne[0] is the sequence length, ne[1] must equal n_q
+// hidden_dim - size of one codebook row
+// n_bins, sr, bandwidth, hop_length - only used to derive n_q
+//
+// Returns the summed tensor, or NULL (after logging to stderr) when `codes` is
+// null or not enough codebooks are loaded.
+struct ggml_tensor *encodec_forward_quantizer_decode(
+    const struct encodec_quantizer *quantizer, struct ggml_context *ctx0,
+    struct ggml_tensor *codes, const int hidden_dim, const int n_bins, const int sr, const int bandwidth,
+    const int hop_length) {
+
+    if (!codes) {
+        fprintf(stderr, "%s: null input tensor\n", __func__);
+        return NULL;
+    }
+
+    const int seq_length = codes->ne[0];
+
+    // divide in floating point: `sr / hop_length` on two ints truncates before
+    // ceilf() ever sees the value, so the ceil could never round up
+    const int frame_rate = (int)ceilf((float)sr / hop_length);
+    const int n_q = get_num_quantizers_for_bandwidth(n_bins, frame_rate, bandwidth);
+
+    assert(n_q == codes->ne[1]);
+
+    // the loop below indexes blocks[0..n_q), so refuse to run past what was loaded
+    if (n_q > (int)quantizer->blocks.size()) {
+        fprintf(stderr, "%s: %d codebooks requested but only %d are loaded\n",
+                __func__, n_q, (int)quantizer->blocks.size());
+        return NULL;
+    }
+
+    // accumulator, registered as a named graph input
+    // NOTE(review): nothing here clears it, so it is presumably zero-filled by
+    // whoever populates the "quantized_out" input — confirm
+    struct ggml_tensor *quantized_out = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, hidden_dim, seq_length);
+    ggml_set_input(quantized_out);
+    ggml_set_name(quantized_out, "quantized_out");
+
+    for (int i = 0; i < n_q; i++) {
+        const encodec_quant_block &block = quantizer->blocks[i];
+
+        // row i of `codes` holds the indices for codebook i
+        struct ggml_tensor *indices = ggml_view_1d(ctx0, codes, seq_length, i * codes->nb[1]);
+        struct ggml_tensor *quantized = ggml_get_rows(ctx0, block.embed, indices);
+
+        quantized_out = ggml_add(ctx0, quantized_out, quantized);
+    }
+
+    quantized_out = ggml_cont(ctx0, ggml_transpose(ctx0, quantized_out));
+
+    return quantized_out;
+}
--- a/vall_e.cpp/include/utils.h
+++ b/vall_e.cpp/include/utils.h
@ -0,0 +1,30 @@
+#pragma once
+
+#include <cstddef>
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+const size_t MB = 1024 * 1024;
+
+// Fills `dest` with the next sizeof(T) raw bytes of `infile`.
+// The stream state is not inspected here; callers that care about short reads
+// have to check `infile` themselves.
+template <typename T>
+void read_safe(std::ifstream &infile, T &dest) {
+    char *raw_bytes = reinterpret_cast<char *>(&dest);
+    infile.read(raw_bytes, sizeof(dest));
+}
+
+// Number of codebooks needed to reach `bandwidth` (in kbps) at the frame rate
+// implied by `sample_rate` / `hop_length`.
+// `inline` because this is defined in a header: without it every translation
+// unit that includes the header emits its own definition and the link fails.
+inline int32_t get_num_codebooks(float bandwidth, int hop_length, float sample_rate) {
+    // The number of codebooks is determined by the bandwidth selected.
+    // Supported bandwidths are 1.5kbps (n_q = 2), 3 kbps (n_q = 4), 6 kbps (n_q = 8),
+    // 12 kbps (n_q = 16) and 24kbps (n_q = 32).
+    const float frame_rate = ceilf(sample_rate / hop_length);
+    // the factor 10 is presumably the bits per codebook per frame — TODO confirm
+    return (int32_t)ceilf(1000 * bandwidth / (frame_rate * 10));
+}
+
+// Bits per second contributed by a single codebook: log2(bins) bits per frame
+// times `frame_rate` frames per second.
+// `inline` because this is defined in a header: without it every translation
+// unit that includes the header emits its own definition and the link fails.
+inline int32_t get_bandwidth_per_quantizer(int bins, float frame_rate) {
+    // The return type is integral, so any fractional part is dropped here; the
+    // cast only makes the existing truncation explicit.
+    return (int32_t)(log2f((float)bins) * frame_rate);
+}
+
+// How many codebooks fit into `bandwidth` (in kbps) given `bins` entries per
+// codebook and `frame_rate` frames per second; never less than 1.
+// `inline` because this is defined in a header: without it every translation
+// unit that includes the header emits its own definition and the link fails.
+inline int32_t get_num_quantizers_for_bandwidth(int bins, float frame_rate, float bandwidth) {
+    const float bw_per_q = get_bandwidth_per_quantizer(bins, frame_rate);
+    // bandwidth is in kbps, bw_per_q in bps
+    const int32_t n_q = MAX(1, floorf(bandwidth * 1000 / bw_per_q));
+    return n_q;
+}
--- a/vall_e.cpp/vall_e.cpp
+++ b/vall_e.cpp/vall_e.cpp
@ -6,7 +6,10 @@
 #include <cstring>
 #include <iostream>
 #include <algorithm>
+#include <regex>
+#include <codecvt>

+// this technically can be used to initialize the map directly
 io_t io_ranges[] = {
 	{ "text", 0, 256, 9, }, 
 	{ "rvq_l", 256, 264, -1, }, 
@ -36,6 +39,18 @@ io_t io_ranges[] = {
 	{ "resps|NAR:0:0", 16677, 17702, 8 }, 
 };

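+// The vocab is stored here because phonemize() tokenizes by hand and applies the merges itself,
+// instead of going through the model's tokenizer (which is not cooperating yet).
+// Keys are std::u32string so that one phoneme is one code point regardless of its UTF-8 length.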
+std::unordered_map<std::u32string, token_t> vocab = {	
+	{U"<unk>",0},{U"<bos>",1},{U"</eos>",2},{U"<mask>",3},{U" ",4},{U"ᵝ",4},{U"!",5},{U"\"",6},{U"(",7},{U"{",7},{U"[",7},{U")",8},{U"}",8},{U"]",8},{U",",9},{U"-",10},{U".",11},{U"1",211},{U"—",10},{U"“",6},{U"”",81},{U"ˇ",6},{U"ˉ",12},{U"ˊ",79},{U"ˋ",80},{U"_",81},{U":",13},{U";",14},{U"?",15},{U"a",16},{U"ä",16},{U"ɒ",16},{U"b",17},{U"c",18},{U"d",19},{U"e",20},{U"f",21},{U"h",22},{U"i",23},{U"ĩ",23},{U"j",24},{U"k",25},{U"l",26},{U"m",27},{U"n",28},{U"ɴ",28},{U"ɲ",28},{U"o",29},{U"̞",29},{U"p",30},{U"ɸ",30},{U"q",31},{U"r",32},{U"ɽ",32},{U"ʁ",32},{U"s",33},{U"t",34},{U"u",35},{U"ø",35},{U"œ",35},{U"y",35},{U"ɣ",35},{U"ũ",35},{U"v",36},{U"w",37},{U"ʍ",37},{U"x",38},{U"z",39},{U"¡",40},{U"«",41},{U"»",42},{U"¿",43},{U"æ",44},{U"ç",45},{U"ð",46},{U"ŋ",47},{U"ɐ",48},{U"ɑ",49},{U"ɔ",50},{U"ɕ",51},{U"ə",52},{U"ɚ",53},{U"ɛ",54},{U"ɜ",55},{U"ɟ",56},{U"ɡ",57},{U"ɪ",58},{U"ɬ",59},{U"ɯ",60},{U"ɹ",61},{U"ɾ",62},{U"ʃ",63},{U"ʈ",64},{U"ʊ",65},{U"ʋ",66},{U"ʌ",67},{U"ʑ",68},{U"ʒ",69},{U"ʔ",70},{U"ʲ",71},{U"ˈ",72},{U"ˌ",73},{U"ː",74},{U"̃",75},{U"̩",76},{U"θ",77},{U"ᵻ",78},{U"…",82},{U"ˈɛ",83},{U"iː",84},{U"aɪ",85},{U"nd",86},{U"ˈɪ",87},{U"eɪ",88},{U"ˈæ",89},{U"ðə",90},{U"oʊ",91},{U"ɑː",92},{U"ˈeɪ",93},{U"ən",94},{U"uː",95},{U"ˈʌ",96},{U"ˈaɪ",97},{U"st",98},{U"ˈɔ",99},{U"ˈoʊ",100},{U"ˈiː",101},{U"ˈɑː",102},{U"ænd",103},{U"ːɹ",104},{U"ɪŋ",105},{U"ɜː",106},{U"ɪn",107},{U"tə",108},{U"ʌv",109},{U"aʊ",110},{U"əl",111},{U"ˈuː",112},{U"tʃ",113},{U"ɪz",114},{U"ˈɜː",115},{U"ˌʌ",116},{U"æt",117},{U"dʒ",118},{U"ˈɔː",119},{U"ɪt",120},{U"ˈaʊ",121},{U"ɚɹ",122},{U"ˈɛn",123},{U"wʌ",124},{U"li",125},{U"hiː",126},{U"ˌɛ",127},{U"wɪ",128},{U"wʌz",129},{U"ðæt",130},{U"juː",131},{U"oːɹ",132},{U"ðɪ",133},{U"sˈɛ",134},{U"ˌɪ",135},{U"ˈɑːɹ",136},{U"nt",137},{U"ˈʊ",138},{U"ənt",139},{U"hɪz",140},{U"ˌɑː",141},{U"hæ",142},{U"ɔːɹ",143},{U"ˈɛɹ",144},{U"wɪð",145},{U"ᵻd",146},{U"ˈoːɹ",147},{U"pɹ",148},{U"ˈɔːl",149},{U"mˌ",150},{U"ʃən",151},{U"kt",152},{U"ˌoʊ",153},{U"ˈɔːɹ",154},{U"fɹ",155},{U"æz",156},{U"ˌʌt",157
},{U"ʃiː",158},{U"ˈɛl",159},{U"ˌaʊ",160},{U"ˈʌn",161},{U"əs",162},{U"hɜː",163},{U"lˈaɪ",164},{U"ˈæn",165},{U"ˈɪɹ",166},{U"ʊd",167},{U"ɹᵻ",168},{U"ld",169},{U"bˌʌt",170},{U"ks",171},{U"nˈoʊ",172},{U"hæd",173},{U"ɾɚ",174},{U"ɛɹ",175},{U"ˈɪŋ",176},{U"ɡɹ",177},{U"nˌɑː",178},{U"ɔn",179},{U"vɚ",180},{U"maɪ",181},{U"fɔːɹ",182},{U"ðɚ",183},{U"tʊ",184},{U"ðɛɹ",185},{U"nˌɑːt",186},{U"ˈʌm",187},{U"tɹ",188},{U"sˈiː",189},{U"ʌvðə",190},{U"mˈɪ",191},{U"hˈæ",192},{U"ˌɪm",193},{U"lˈeɪ",194},{U"ɪk",195},{U"sp",196},{U"hˌɪm",197},{U"ɐn",198},{U"ðeɪ",199},{U"lˈɪ",200},{U"ɾi",201},{U"lˈɛ",202},{U"bɹ",203},{U"kɹ",204},{U"lˈæ",205},{U"ˈɪl",206},{U"jˈuː",207},{U"ʌm",208},{U"mˌiː",209},{U"bᵻ",210},{U"wˈʌn",211},{U"ˌɪn",212},{U"ˈɪn",213},{U"ˈoʊn",214},{U"sˈɛd",215},{U"biː",216},{U"ˈɛd",217},{U"ˈaɪt",218},{U"baɪ",219},{U"fɹʌm",220},{U"ɪs",221},{U"ɚz",222},{U"ðɪs",223},{U"əns",224},{U"bəl",225},{U"ɪf",226},{U"ɪnðə",227},{U"əm",228},{U"ᵻz",229},{U"ˌuː",230},{U"wˈeɪ",231},{U"ft",232},{U"wiː",233},{U"stɹ",234},{U"lˈiː",235},{U"iːz",236},{U"pt",237},{U"jʊ",238},{U"ɚd",239},{U"ˌaɪ",240},{U"kw",241},{U"ˌɔn",242},{U"ˈaɪd",243},{U"ɪm",244},{U"ˈʌst",245},{U"ˈoʊld",246},{U"ts",247},{U"ˌɪtʃ",248},{U"sˌoʊ",249},{U"dˈɪ",250},{U"ɑːɹ",251},{U"hɐ",252},{U"sˈeɪ",253},{U"ɾᵻd",254},{U"wˌɪtʃ",255},
+};
+
+std::vector<merge_entry_t> vocab_merges = {
+	{U"ˈ", U"ɛ"},{U"i", U"ː"},{U"a", U"ɪ"},{U"n", U"d"},{U"ˈ", U"ɪ"},{U"e", U"ɪ"},{U"ˈ", U"æ"},{U"ð", U"ə"},{U"o", U"ʊ"},{U"ɑ", U"ː"},{U"ˈ", U"eɪ"},{U"ə", U"n"},{U"u", U"ː"},{U"ˈ", U"ʌ"},{U"ˈ", U"aɪ"},{U"s", U"t"},{U"ˈ", U"ɔ"},{U"ˈ", U"oʊ"},{U"ˈ", U"iː"},{U"ˈ", U"ɑː"},{U"æ", U"nd"},{U"ː", U"ɹ"},{U"ɪ", U"ŋ"},{U"ɜ", U"ː"},{U"ɪ", U"n"},{U"t", U"ə"},{U"ʌ", U"v"},{U"a", U"ʊ"},{U"ə", U"l"},{U"ˈ", U"uː"},{U"t", U"ʃ"},{U"ɪ", U"z"},{U"ˈ", U"ɜː"},{U"ˌ", U"ʌ"},{U"æ", U"t"},{U"d", U"ʒ"},{U"ˈɔ", U"ː"},{U"ɪ", U"t"},{U"ˈ", U"aʊ"},{U"ɚ", U"ɹ"},{U"ˈɛ", U"n"},{U"w", U"ʌ"},{U"l", U"i"},{U"h", U"iː"},{U"ˌ", U"ɛ"},{U"w", U"ɪ"},{U"wʌ", U"z"},{U"ð", U"æt"},{U"j", U"uː"},{U"o", U"ːɹ"},{U"ð", U"ɪ"},{U"s", U"ˈɛ"},{U"ˌ", U"ɪ"},{U"ˈɑː", U"ɹ"},{U"n", U"t"},{U"ˈ", U"ʊ"},{U"ən", U"t"},{U"h", U"ɪz"},{U"ˌ", U"ɑː"},{U"h", U"æ"},{U"ɔ", U"ːɹ"},{U"ˈɛ", U"ɹ"},{U"wɪ", U"ð"},{U"ᵻ", U"d"},{U"ˈ", U"oːɹ"},{U"p", U"ɹ"},{U"ˈɔː", U"l"},{U"m", U"ˌ"},{U"ʃ", U"ən"},{U"k", U"t"},{U"ˌ", U"oʊ"},{U"ˈɔ", U"ːɹ"},{U"f", U"ɹ"},{U"æ", U"z"},{U"ˌʌ", U"t"},{U"ʃ", U"iː"},{U"ˈɛ", U"l"},{U"ˌ", U"aʊ"},{U"ˈʌ", U"n"},{U"ə", U"s"},{U"h", U"ɜː"},{U"l", U"ˈaɪ"},{U"ˈæ", U"n"},{U"ˈɪ", U"ɹ"},{U"ʊ", U"d"},{U"ɹ", U"ᵻ"},{U"l", U"d"},{U"b", U"ˌʌt"},{U"k", U"s"},{U"n", U"ˈoʊ"},{U"hæ", U"d"},{U"ɾ", U"ɚ"},{U"ɛ", U"ɹ"},{U"ˈɪ", U"ŋ"},{U"ɡ", U"ɹ"},{U"n", U"ˌɑː"},{U"ɔ", U"n"},{U"v", U"ɚ"},{U"m", U"aɪ"},{U"f", U"ɔːɹ"},{U"ð", U"ɚ"},{U"t", U"ʊ"},{U"ð", U"ɛɹ"},{U"nˌɑː", U"t"},{U"ˈʌ", U"m"},{U"t", U"ɹ"},{U"s", U"ˈiː"},{U"ʌv", U"ðə"},{U"m", U"ˈɪ"},{U"h", U"ˈæ"},{U"ˌɪ", U"m"},{U"l", U"ˈeɪ"},{U"ɪ", U"k"},{U"s", U"p"},{U"h", U"ˌɪm"},{U"ɐ", U"n"},{U"ð", U"eɪ"},{U"l", U"ˈɪ"},{U"ɾ", U"i"},{U"l", U"ˈɛ"},{U"b", U"ɹ"},{U"k", U"ɹ"},{U"l", U"ˈæ"},{U"ˈɪ", U"l"},{U"j", U"ˈuː"},{U"ʌ", U"m"},{U"mˌ", U"iː"},{U"b", U"ᵻ"},{U"w", U"ˈʌn"},{U"ˌ", U"ɪn"},{U"ˈɪ", U"n"},{U"ˈoʊ", U"n"},{U"sˈɛ", U"d"},{U"b", U"iː"},{U"ˈɛ", U"d"},{U"ˈaɪ", U"t"},{U"b", U"aɪ"},{U"fɹ", U"ʌm"},{U"ɪ", U"s"},{U"ɚ", U"z"},{U"ðɪ", U"s"},{U"ən", U"s"},{U"b", U"əl"},{U"ɪ", U"f"},{U"ɪn", U"ðə"},{U"ə", 
U"m"},{U"ᵻ", U"z"},{U"ˌ", U"uː"},{U"w", U"ˈeɪ"},{U"f", U"t"},{U"w", U"iː"},{U"st", U"ɹ"},{U"l", U"ˈiː"},{U"iː", U"z"},{U"p", U"t"},{U"j", U"ʊ"},{U"ɚ", U"d"},{U"ˌ", U"aɪ"},{U"k", U"w"},{U"ˌ", U"ɔn"},{U"ˈaɪ", U"d"},{U"ɪ", U"m"},{U"ˈʌ", U"st"},{U"ˈoʊ", U"ld"},{U"t", U"s"},{U"ˌɪ", U"tʃ"},{U"s", U"ˌoʊ"},{U"d", U"ˈɪ"},{U"ɑː", U"ɹ"},{U"h", U"ɐ"},{U"s", U"ˈeɪ"},{U"ɾ", U"ᵻd"},{U"w", U"ˌɪtʃ"},
+};
+std::unordered_map<std::string, merge_entry_t> vocab_merge_map = {};
+
 std::vector<float> VALL_E_API read_2d_tensor( struct ggml_tensor* tensor ) {
 	size_t size = tensor->ne[0] * tensor->ne[1];
 	std::vector<float> res( size );
@ -109,11 +124,11 @@ void VALL_E_API vall_e_inputs_map_init( io_map_t& io_map, llama_model* model ) {

 	int32_t ctx_size = 24 * 2 * ggml_tensor_overhead(); // 24 embeddings + 24 output heads (generous) (should only really need to do this for output heads since we manually handle embeddings)
 	struct ggml_init_params params = {
-        /*.mem_size   =*/ ctx_size,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true,
-    };
-    io_map.ctx = ggml_init(params);
+		/*.mem_size   =*/ ctx_size,
+		/*.mem_buffer =*/ NULL,
+		/*.no_alloc   =*/ true,
+	};
+	io_map.ctx = ggml_init(params);

 // to-do: figure a nicer way to do this
 #if LLAMA_CPP_USE_VALL_E_ARCH
@ -207,72 +222,72 @@ void VALL_E_API batch_add( llama_batch& batch, token_t id, int n_embd, const flo
 std::vector<float> VALL_E_API read_audio_from_disk( const std::string& path ) {
+	// Decodes the WAV file at `path` to 32-bit float samples.
+	// Returns an empty vector (after logging to stderr) when the file cannot be read or is not 24 kHz.
+	// NOTE(review): only `total_frame_count` floats are copied out, so this presumably expects mono input — confirm.
 	std::vector<float> res;

-    uint32_t channels;
-    uint32_t sample_rate;
-    drwav_uint64 total_frame_count;
+	uint32_t channels;
+	uint32_t sample_rate;
+	drwav_uint64 total_frame_count;

-    float * raw_audio = drwav_open_file_and_read_pcm_frames_f32(path.c_str(), &channels, &sample_rate, &total_frame_count, NULL);
+	float * raw_audio = drwav_open_file_and_read_pcm_frames_f32(path.c_str(), &channels, &sample_rate, &total_frame_count, NULL);

-    if (raw_audio == NULL) {
-        fprintf(stderr, "%s: could not read wav file\n", __func__);
-        return res;
-    }
+	if (raw_audio == NULL) {
+		fprintf(stderr, "%s: could not read wav file\n", __func__);
+		return res;
+	}

-    if (sample_rate != 24000) {
-        fprintf(stderr, "%s: wav file is wrong sample rate\n", __func__);
-        return res;
-    }
+	if (sample_rate != 24000) {
+		fprintf(stderr, "%s: wav file is wrong sample rate\n", __func__);
+		// the decoded buffer is ours from here on; release it on this early exit as well
+		drwav_free(raw_audio, NULL);
+		return res;
+	}

-    fprintf(stderr, "\n%s: Number of frames read = %lld.\n", __func__, total_frame_count);
+	fprintf(stderr, "\n%s: Number of frames read = %lld.\n", __func__, total_frame_count);

-    res.resize(total_frame_count);
-    memcpy(res.data(), raw_audio, total_frame_count * sizeof(float));
+	res.resize(total_frame_count);
+	memcpy(res.data(), raw_audio, total_frame_count * sizeof(float));

-    drwav_free(raw_audio, NULL);
+	drwav_free(raw_audio, NULL);

-    return res;
+	return res;
 }
 // writes a waveform to disk
 void VALL_E_API write_audio_to_disk( const std::vector<float>& wavform, const std::string& path ) {
-    drwav_data_format format;
-    format.bitsPerSample = 32;
-    format.sampleRate = 24000;
-    format.container = drwav_container_riff;
-    format.channels = 1;
-    format.format = DR_WAVE_FORMAT_IEEE_FLOAT;
+	// Writes `wavform` to `path` as a mono, 24 kHz, 32-bit float RIFF WAV.
+	// Logs to stderr and returns without writing if the file cannot be opened.
+	drwav_data_format format;
+	format.bitsPerSample = 32;
+	format.sampleRate = 24000;
+	format.container = drwav_container_riff;
+	format.channels = 1;
+	format.format = DR_WAVE_FORMAT_IEEE_FLOAT;

-    drwav wav;
-    drwav_init_file_write(&wav, path.c_str(), &format, NULL);
-    drwav_uint64 frames = drwav_write_pcm_frames(&wav, wavform.size(), wavform.data());
-    drwav_uninit(&wav);
+	drwav wav;
+	// a failed open leaves `wav` uninitialised, so it must not be written to or uninit'd
+	if (!drwav_init_file_write(&wav, path.c_str(), &format, NULL)) {
+		fprintf(stderr, "%s: could not open wav file for writing\n", __func__);
+		return;
+	}
+	drwav_uint64 frames = drwav_write_pcm_frames(&wav, wavform.size(), wavform.data());
+	drwav_uninit(&wav);

-    fprintf(stderr, "%s: Number of frames written = %lld.\n", __func__, frames);
+	fprintf(stderr, "%s: Number of frames written = %lld.\n", __func__, frames);
 }
 // encodes an in-memory waveform into one code sequence per codebook
 std::vector<std::vector<int32_t>> VALL_E_API encode_audio( struct encodec_context* ectx, const std::vector<float>& wavform ) {
+	// Compresses `wavform` with the encodec context and returns the codes split into
+	// one vector per codebook; returns an empty result if compression fails.
-    // compress audio
-    if (!encodec_compress_audio(ectx, wavform.data(), wavform.size(), 1)) {
-        fprintf(stderr, "%s: error during compression \n", __func__);
-        return {};
-    }
+	// compress audio
+	// NOTE(review): the trailing 1 is presumably the thread count — confirm against encodec.h
+	if (!encodec_compress_audio(ectx, wavform.data(), wavform.size(), 1)) {
+		fprintf(stderr, "%s: error during compression \n", __func__);
+		return {};
+	}

-    int32_t* codes_data = encodec_get_codes( ectx );
-    int n_codes = encodec_get_codes_size( ectx );
-    int n_codebooks = 8;
-    int n_frames = n_codes / n_codebooks;
-    
-    std::vector<std::vector<int32_t>> res(n_codebooks);
+	// the flat code buffer is treated as n_codebooks consecutive runs of n_frames codes each
+	// n_codebooks is fixed at 8 here rather than queried from the context
+	int32_t* codes_data = encodec_get_codes( ectx );
+	int n_codes = encodec_get_codes_size( ectx );
+	int n_codebooks = 8;
+	int n_frames = n_codes / n_codebooks;
+	
+	std::vector<std::vector<int32_t>> res(n_codebooks);

-    for ( auto l = 0; l < n_codebooks; ++l ) {
-    	res[l].insert( res[l].end(), codes_data + (l * n_frames), codes_data + ((l+1) * n_frames) );
-    }
+	// copy run l of the flat buffer into res[l]
+	for ( auto l = 0; l < n_codebooks; ++l ) {
+		res[l].insert( res[l].end(), codes_data + (l * n_frames), codes_data + ((l+1) * n_frames) );
+	}

-    return res;
+	return res;
 }
 // decodes a 2D codebook into a waveform
 std::vector<float> VALL_E_API decode_audio( struct encodec_context* ectx, const std::vector<std::vector<int32_t>>& codes ) {
-    int n_codebooks = codes.size();
-    int n_frames = codes[0].size();
+	int n_codebooks = codes.size();
+	int n_frames = codes[0].size();
 	

 	std::vector<int32_t> res;
@ -282,16 +297,16 @@ std::vector<float> VALL_E_API decode_audio( struct encodec_context* ectx, const
 		res.insert( res.end(), codes[l].begin(), codes[l].end() );
 	}

-    // decompress audio
-    if (!encodec_decompress_audio(ectx, res.data(), res.size(), N_THREADS)) {
-        fprintf(stderr, "%s: error during decompression\n", __func__);
-        return {};
-    }
+	// decompress audio
+	if (!encodec_decompress_audio(ectx, res.data(), res.size(), N_THREADS)) {
+		fprintf(stderr, "%s: error during decompression\n", __func__);
+		return {};
+	}

-    // write reconstructed audio on disk
-    const float* audio_data = encodec_get_audio(ectx);
-    const int audio_size = encodec_get_audio_size(ectx);
-    return std::vector<float>(audio_data, audio_data + audio_size);
+	// write reconstructed audio on disk
+	const float* audio_data = encodec_get_audio(ectx);
+	const int audio_size = encodec_get_audio_size(ectx);
+	return std::vector<float>(audio_data, audio_data + audio_size);
 }

 // sums embeddings over a 2D "tensor"
@ -484,7 +499,7 @@ std::vector<token_t> VALL_E_API generate( vall_e_context_t* ctx, vall_e_inputs_t
 	// to-do: figure this out......
 	{
 		llama_set_causal_attn( ctx->llama.ctx, causal ); // to-do: fix GGML_ASSERT(mask->ne[0] == a->ne[0])
-    //	*const_cast<bool*>(&model->hparams.causal_attn) = true; // force set this
+	//	*const_cast<bool*>(&model->hparams.causal_attn) = true; // force set this
 	}

 	std::vector<token_t> output_tokens;
@ -702,20 +717,78 @@ std::vector<token_t> VALL_E_API generate( vall_e_context_t* ctx, vall_e_inputs_t
 	return output_tokens;
 }

-std::vector<token_t> VALL_E_API phonemize( vall_e_context_t* ctx, const std::string& text, const std::string& language ) {
-	return {1,22,111,100,4,37,115,169,11,2}; // <bos>hˈɛloː ʋˈɔrlt</eos>
-/*
-	const int n_prompt = -llama_tokenize(model, inputs.phonemes.c_str(), inputs.phonemes.size(), NULL, 0, true, true);
-	// allocate space for the tokens and tokenize the inputs.phonemes
-	inputs.phn.resize(n_prompt);
-	if (llama_tokenize(model, inputs.phonemes.c_str(), inputs.phonemes.size(), inputs.phn.data(), inputs.phn.size(), true, true) < 0) {
-	    fprintf(stderr, "%s: error: failed to tokenize: %s\n", __func__, inputs.phonemes.c_str());
-	    return 1;
+// Returns a copy of `string` with every occurrence of `search` replaced by `replace`.
+// Matching runs left to right and never rescans text that was just inserted, so a
+// `replace` that itself contains `search` is handled. An empty `search` matches
+// nothing and the input is returned unchanged.
+std::string string_replace( const std::string& string, const std::string& search, const std::string& replace ) {
+	std::string res = string;
+	// an empty needle is found at every offset, which would never terminate
+	if ( search.empty() ) return res;
+	size_t start_pos = 0;
+	while ( (start_pos = res.find(search, start_pos)) != std::string::npos ) {
+		res.replace(start_pos, search.length(), replace);
+		// resume after the inserted text instead of from the start of the string
+		start_pos += replace.length();
 	}
+	return res;
+}

-	for ( auto& token : inputs.phn ) printf("%i ", token );
-	printf("\n");
-*/
+// Converts `text` into the model's phoneme token sequence.
+//
+// The text is run through espeak-ng to get IPA phonemes, each code point is
+// mapped through `vocab` (unknown code points become 0, i.e. <unk>), adjacent
+// tokens are then collapsed wherever `vocab_merge_map` has an entry for the pair,
+// and the result is wrapped in <bos> (1) ... </eos> (2).
+//
+// ctx      - currently unused (kept for the llama_tokenize path below)
+// text     - UTF-8 input text
+// language - "en", "fr" or "zh"; anything else falls back to the espeak "en" voice
+//
+// Requires espeak to be initialised and `vocab_merge_map` to be populated, both
+// of which happen in vall_e_load().
+std::vector<token_t> VALL_E_API phonemize( vall_e_context_t* ctx, const std::string& text, const std::string& language ) {
+	std::vector<token_t> tokens;
+
+	// phonemize text
+	std::string espeak_language = "en";
+	if ( language == "en" ) espeak_language = "en-us";
+	else if ( language == "fr" ) espeak_language = "fr-fr";
+	else if ( language == "zh" ) espeak_language = "cmn-latn-pinyin";
+	espeak_SetVoiceByName(espeak_language.c_str());
+
+	// espeak translates one clause per call and advances the text pointer, setting it
+	// to NULL once the input is exhausted, so keep calling until then; a single call
+	// silently drops everything after the first clause.
+	// The returned string has to be copied out before the next call.
+	// NOTE(review): clauses are joined with a single space; confirm this matches how
+	// the training data was phonemized.
+	std::string phonemes;
+	const char* text_c_str = text.c_str();
+	while ( text_c_str != NULL ) {
+		const char* clause = espeak_TextToPhonemes((const void**) &text_c_str, espeakCHARS_UTF8, espeakPHONEMES_IPA);
+		if ( clause == NULL ) break;
+		if ( *clause == '\0' ) continue;
+		if ( !phonemes.empty() ) phonemes += ' ';
+		phonemes += clause;
+	}
+
+	// decode to UTF-32 so that one element is one phoneme code point
+	std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv_utf8_utf32;
+	std::u32string unicode_phonemes = conv_utf8_utf32.from_bytes(phonemes);
+
+	// manual tokenization because llama tokenizer isn't cooperating
+	tokens.reserve( unicode_phonemes.size() + 2 );
+	tokens.emplace_back(1);
+	for ( const char32_t phone : unicode_phonemes ) {
+		const auto found = vocab.find( std::u32string(1, phone) );
+		// <unk> unless the code point is in the vocab
+		tokens.emplace_back( found != vocab.end() ? found->second : 0 );
+	}
+	
+	// handle merges (skip <bos>)
+	// after a merge the index is not advanced, so the merged token is tested
+	// against its new right-hand neighbour on the next pass
+	for ( size_t i = 1; i + 1 < tokens.size(); ) {
+		const std::string key = std::to_string(tokens[i]) + ":" + std::to_string(tokens[i+1]);
+		const auto merge = vocab_merge_map.find(key);
+		// not a merge
+		if ( merge == vocab_merge_map.end() ) {
+			++i;
+			continue;
+		}
+		// update with merged token
+		tokens[i] = merge->second.resolved_token;
+		// erase at next token
+		tokens.erase(tokens.begin() + i + 1);
+	}
+	tokens.emplace_back(2);
+
+
+	/*
+	// to-do: fix terminate called after throwing an instance of 'std::out_of_range'
+	// deduce token count
+	const int n_tokens = -llama_tokenize(ctx->llama.model, phonemes.c_str(), phonemes.size(), NULL, 0, true, true);
+	tokens.resize(n_tokens);
+	// tokenize
+	if ( llama_tokenize(ctx->llama.model, phonemes.c_str(), phonemes.size(), tokens.data(), tokens.size(), true, true) < 0 ) {
+		fprintf(stderr, "%s: error: failed to tokenize: %s\n", __func__, phonemes.c_str());
+		return tokens;
+	}
+	*/
+	return tokens;
 }

 vall_e_context_t* VALL_E_API vall_e_load( const vall_e_context_params_t& params ) {
@ -750,8 +823,8 @@ vall_e_context_t* VALL_E_API vall_e_load( const vall_e_context_params_t& params
 		fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__);
 		return ctx;
 	}
-    
-    // setup encodec.cpp
+	
+	// setup encodec.cpp
 	ctx->encodec.ctx = encodec_load_model(params.encodec_path.c_str(), 0, params.gpu_layers);
 	if ( !ctx->encodec.ctx ) {
 		fprintf(stderr, "%s: error during loading model\n", __func__);
@ -760,9 +833,24 @@ vall_e_context_t* VALL_E_API vall_e_load( const vall_e_context_params_t& params
 	encodec_set_target_bandwidth(ctx->encodec.ctx, 6);
 	encodec_set_sample_rate(ctx->encodec.ctx, 24000);

+	// setup espeak
+	espeak_Initialize(AUDIO_OUTPUT_SYNCHRONOUS, 0, NULL, 0);
+
 	// setup vall_e.cpp
 	vall_e_inputs_map_init( ctx->io_map, ctx->llama.model );

+	// setup vocab things
+	for ( auto& entry : vocab_merges ) {
+		entry.resolved = entry.pre+entry.post;
+
+		entry.pre_token = vocab[entry.pre];
+		entry.post_token = vocab[entry.post];
+		entry.resolved_token = vocab[entry.resolved];
+
+		std::string key = std::to_string(entry.pre_token) + ":" + std::to_string(entry.post_token);	
+		vocab_merge_map[key] = entry;
+	}
+
 	return ctx;
 }
 vall_e_inputs_t vall_e_prepare_inputs( vall_e_context_t* ctx, const std::string& text, const std::string& prompt_path, const std::string& language ) {
@ -785,7 +873,7 @@ vall_e_audio_codes_t vall_e_generate( vall_e_context_t* ctx, vall_e_inputs_t& in
 	std::vector<token_t> output_tokens;
 	if ( modality == MODALITY_NAR_LEN ) {
 		// inference len
-		int len = 75;
+		int len = 0;
 		if ( !len ) {
 			inputs.task = "len";
 			output_tokens = generate( ctx, inputs, 5, INFERENCE_MODE_LEN );
@ -826,6 +914,7 @@ vall_e_audio_codes_t vall_e_generate( vall_e_context_t* ctx, vall_e_inputs_t& in
 	return inputs.resp;
 }
 void VALL_E_API vall_e_free( vall_e_context_t* ctx ) {
+	espeak_Terminate();
 	encodec_free(ctx->encodec.ctx);
 	llama_free(ctx->llama.ctx);
 	llama_free_model(ctx->llama.model);
@ -843,12 +932,13 @@ int main( int argc, char** argv ) {
 	params.cpu_threads = N_THREADS;
 	vall_e_context_t* ctx = vall_e_load( params );

+	std::string text = "Hello world.";
 	std::string prompt_path = "./data/prom.wav";
 	std::string output_path = "./data/resp.wav";
 	std::string language = "en";
 	int modality = MODALITY_NAR_LEN;

-	auto inputs = vall_e_prepare_inputs( ctx, "Hello world.", prompt_path, language );
+	auto inputs = vall_e_prepare_inputs( ctx, text, prompt_path, language );
 	auto output_audio_codes = vall_e_generate( ctx, inputs, modality );
 	write_audio_to_disk( decode_audio( ctx->encodec.ctx, output_audio_codes ), output_path );

--- a/vall_e.cpp/vall_e.h
+++ b/vall_e.cpp/vall_e.h
@ -1,14 +1,16 @@
 #pragma once

-#include "llama.h"
-#include "encodec.h"
-
-#include "dr_wav.h"
-
+// C++ deps
 #include <string>
 #include <vector>
 #include <unordered_map>

+// external deps
+#include <llama.h>
+#include <encodec.h>
+#include <dr_wav.h>
+#include <espeak-ng/speak_lib.h>
+
 // to-do: copy over import/export stuff from engine project (because I don't remember how I set it up in <uf/config.h>)
 #define VALL_E_API

@ -73,6 +75,16 @@ struct score_t {
 	bool operator<( const score_t& that ) const { return this->value < that.value; }
 };

+// One pairwise merge rule of the hand-rolled phoneme tokenizer.
+// The merge table only spells out `pre` and `post`; `resolved` and the three
+// token ids are derived from them (via the vocab) when the context is loaded.
+struct merge_entry_t {
+	std::u32string pre;      // left-hand piece
+	std::u32string post;     // right-hand piece
+	std::u32string resolved; // pre + post
+
+	// ids default to 0 (<unk>) so a default-constructed entry never carries
+	// indeterminate values
+	token_t pre_token = 0;
+	token_t post_token = 0;
+	token_t resolved_token = 0;
+};
+
 struct vall_e_context_params_t {
 	std::string model_path;
 	std::string encodec_path;