vall_e.cpp phonemizing and tokenizing

This commit is contained in:
mrq 2024-12-24 22:39:32 -06:00
parent 8516bab15c
commit 6bf59bbd8b
36 changed files with 16032 additions and 86 deletions

View File

@ -3,7 +3,7 @@ CXX = g++
INCS += -I./include
LIBS += -L./libs
LINKS += -lggml -lggml-base -lllama -lencodec
LINKS += -lggml -lggml-base -lllama -lencodec -lespeak-ng
FLAGS += -march=native -O3
SRCS := $(shell find ./ -name "*.cpp")

View File

@ -8,7 +8,7 @@ At the moment it's ***very*** much a work in progress.
Populate `./include/` with the `ggml`, `llama.cpp`, and `encodec.cpp` headers.
Populate `./libs/` with the compiled libraries of `llama.cpp` and `encodec.cpp`.
Populate `./libs/` with the compiled libraries of `llama.cpp`, `encodec.cpp`, and `espeak-ng`.
Run `make`.
@ -26,10 +26,10 @@ Run `make`.
* [x] load the quantized model
* [x] orchestrate the required embeddings
* [x] juggle the output head / classifier properly
* [ ] phonemize text
* [x] phonemize text
* with the help of espeak-ng
* [ ] tokenize phonemes
* the tokenizer is being a huge thorn on actual sequences
* [x] tokenize phonemes
* tokenize with `llama_tokenize` instead of a homebrewed method because the tokenizer is being a huge thorn
* [x] load audio from disk
* [x] encode audio
* [x] sum embeddings for the `prom` and prior `resp`s

View File

@ -0,0 +1,113 @@
#pragma once
#include <vector>
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "lstm.h"
#include "utils.h"
// Weights for one decoder stage, as consumed by the per-layer loop in
// encodec_forward_decoder: an upsampling transposed convolution followed by a
// residual unit (conv1 -> conv2) whose result is added to a convolutional shortcut.
// Each `_w` / `_b` pair is the weight and bias tensor of one convolution.
struct encodec_decoder_block {
// upsampling layer (passed to strided_conv_transpose_1d)
struct ggml_tensor *us_conv_w;
struct ggml_tensor *us_conv_b;
// conv1: first convolution of the residual branch
struct ggml_tensor *conv_1_w;
struct ggml_tensor *conv_1_b;
// conv2: second convolution of the residual branch
struct ggml_tensor *conv_2_w;
struct ggml_tensor *conv_2_b;
// shortcut: convolution applied to the upsampled input and added to the conv2 output
struct ggml_tensor *conv_sc_w;
struct ggml_tensor *conv_sc_b;
};
// Complete set of decoder weights: initial convolution, LSTM, per-stage blocks
// and final convolution.
struct encodec_decoder {
// first convolution, applied directly to the decoder input
struct ggml_tensor *init_conv_w;
struct ggml_tensor *init_conv_b;
// LSTM weights; encodec_forward_decoder runs the l0 and l1 layers from this struct
encodec_lstm lstm;
// last convolution, producing the tensor returned by encodec_forward_decoder
struct ggml_tensor *final_conv_w;
struct ggml_tensor *final_conv_b;
// per-stage weights; encodec_forward_decoder reads blocks[0] through blocks[3]
std::vector<encodec_decoder_block> blocks;
};
struct ggml_tensor *encodec_forward_decoder(
const struct encodec_decoder *decoder, struct ggml_context *ctx0,
struct ggml_tensor *quantized_out, const int *ratios, const int kernel_size, const int res_kernel_size,
const int stride) {
if (!quantized_out) {
fprintf(stderr, "%s: null input tensor\n", __func__);
return NULL;
}
struct ggml_tensor *inpL = strided_conv_1d(
ctx0, quantized_out, decoder->init_conv_w, decoder->init_conv_b, stride);
// lstm
{
struct ggml_tensor *cur = inpL;
const encodec_lstm lstm = decoder->lstm;
// first lstm layer
char l0_prefix[7] = "dec_l0";
struct ggml_tensor *hs1 = forward_pass_lstm_unilayer(
ctx0, cur, lstm.l0_ih_w, lstm.l0_hh_w, lstm.l0_ih_b, lstm.l0_hh_b, l0_prefix);
// second lstm layer
char l1_prefix[7] = "dec_l1";
struct ggml_tensor *out = forward_pass_lstm_unilayer(
ctx0, hs1, lstm.l1_ih_w, lstm.l1_hh_w, lstm.l1_ih_b, lstm.l1_hh_b, l1_prefix);
inpL = ggml_add(ctx0, inpL, out);
}
for (int layer_ix = 0; layer_ix < 4; layer_ix++) {
encodec_decoder_block block = decoder->blocks[layer_ix];
// upsampling layers
inpL = ggml_elu(ctx0, inpL);
inpL = strided_conv_transpose_1d(
ctx0, inpL, block.us_conv_w, block.us_conv_b, ratios[layer_ix]);
struct ggml_tensor *current = inpL;
// shortcut
struct ggml_tensor *shortcut = strided_conv_1d(
ctx0, inpL, block.conv_sc_w, block.conv_sc_b, stride);
// conv1
current = ggml_elu(ctx0, current);
current = strided_conv_1d(
ctx0, current, block.conv_1_w, block.conv_1_b, stride);
// conv2
current = ggml_elu(ctx0, current);
current = strided_conv_1d(
ctx0, current, block.conv_2_w, block.conv_2_b, stride);
// residual connection
inpL = ggml_add(ctx0, current, shortcut);
}
// final conv
inpL = ggml_elu(ctx0, inpL);
struct ggml_tensor *decoded_inp = strided_conv_1d(
ctx0, inpL, decoder->final_conv_w, decoder->final_conv_b, stride);
return decoded_inp;
}

6434
vall_e.cpp/include/dr_wav.h Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,184 @@
/*
Copyright 2024 Pierre-Antoine Bannier
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
/*
* This file contains the declarations of the structs and functions used in the encodec library.
* The library provides functionality for audio compression and decompression using a custom model.
* The model consists of an encoder, a quantizer and a decoder, each with their own set of parameters.
* The library also provides functions for loading and freeing the model, as well as compressing and decompressing audio data.
*
*/
#pragma once
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml.h"
#ifdef __cplusplus
extern "C" {
#endif
struct encodec_context;
// Timing counters for an encodec context; retrieved with
// encodec_get_statistics() and reset with encodec_reset_statistics().
struct encodec_statistics {
// The time taken to load the model (presumably microseconds, going by the `_us` suffix -- confirm).
int64_t t_load_us;
// The time taken to compute the model (presumably microseconds, going by the `_us` suffix -- confirm).
int64_t t_compute_us;
};
/**
* Loads an encodec model from the specified file path.
*
* The returned context is released with encodec_free().
* NOTE(review): the failure behaviour (e.g. a NULL return) is not visible from
* this header -- confirm against the implementation before relying on it.
*
* @param model_path The file path to the encodec model.
* @param offset The offset (in bytes) to the start of the model in the file.
* @param n_gpu_layers The number of GPU layers to use.
* @return A pointer to the encodec context struct.
*/
struct encodec_context *encodec_load_model(
const char *model_path,
const int offset,
int n_gpu_layers);
/**
* Sets the target bandwidth for the given encodec context.
*
* NOTE(review): verify the unit against the implementation; EnCodec bandwidths
* are commonly quoted in kbps (e.g. 6), which would contradict "bits per second".
*
* @param ectx The encodec context to set the target bandwidth for.
* @param bandwidth The target bandwidth to set, in bits per second.
*/
void encodec_set_target_bandwidth(
struct encodec_context *ectx,
int bandwidth);
/**
* Sets the sample rate for the given encodec context.
*
* @param ectx The encodec context to set the sample rate for.
* @param sample_rate The sample rate to set (presumably in Hz -- confirm).
*/
void encodec_set_sample_rate(
struct encodec_context *ectx,
int sample_rate);
/**
* Reconstructs audio from raw audio data using the specified encodec context.
*
* NOTE(review): the result is presumably read back through encodec_get_audio()
* and encodec_get_audio_size() -- confirm against the implementation.
*
* @param ectx The encodec context to use for reconstruction.
* @param raw_audio The raw audio data to reconstruct.
* @param n_samples The number of samples in the raw audio buffer.
* @param n_threads The number of threads to use for reconstruction.
* @return True if the reconstruction was successful, false otherwise.
*/
bool encodec_reconstruct_audio(
struct encodec_context *ectx,
const float *raw_audio,
const int n_samples,
int n_threads);
/**
* Compresses audio data using the specified encodec context.
*
* NOTE(review): the resulting codes are presumably read back through
* encodec_get_codes() and encodec_get_codes_size() -- confirm against the implementation.
*
* @param ectx The encodec context to use for compression.
* @param raw_audio The raw audio data to compress.
* @param n_samples The number of samples in the raw audio buffer.
* @param n_threads The number of threads to use for compression.
* @return True if the compression was successful, false otherwise.
*/
bool encodec_compress_audio(
struct encodec_context *ectx,
const float *raw_audio,
const int n_samples,
int n_threads);
/**
* Decompresses audio data using the specified encodec context.
*
* NOTE(review): the decoded samples are presumably read back through
* encodec_get_audio() and encodec_get_audio_size() -- confirm against the implementation.
*
* @param ectx The encodec context to use for decompression.
* @param codes The compressed audio data to decompress.
* @param n_codes The number of codes in the codes buffer.
* @param n_threads The number of threads to use for decompression.
* @return True if the audio data was successfully decompressed, false otherwise.
*/
bool encodec_decompress_audio(
struct encodec_context *ectx,
const int32_t *codes,
const int n_codes,
int n_threads);
/**
* Gets the audio data from the given encodec context.
*
* NOTE(review): ownership and lifetime of the returned buffer are not stated in
* this header; presumably it belongs to `ectx` and must not be freed by the
* caller -- confirm.
*
* @param ectx The encodec context to get the audio data from.
* @return A pointer to the audio data.
*/
float * encodec_get_audio(
struct encodec_context *ectx);
/**
* Gets the size of the audio data from the given encodec context.
*
* @param ectx The encodec context to get the audio size from.
* @return The size of the audio data (presumably a count of float samples, not bytes -- confirm).
*/
int encodec_get_audio_size(
struct encodec_context *ectx);
/**
* Gets the code data from the given encodec context.
*
* NOTE(review): ownership and lifetime of the returned buffer are not stated in
* this header; presumably it belongs to `ectx` and must not be freed by the
* caller -- confirm.
*
* @param ectx The encodec context to get the code data from.
* @return A pointer to the code data.
*/
int32_t * encodec_get_codes(
struct encodec_context *ectx);
/**
* Gets the size of the code data from the given encodec context.
*
* @param ectx The encodec context to get the code size from.
* @return The size of the code data (presumably a count of int32_t codes, not bytes -- confirm).
*/
int encodec_get_codes_size(
struct encodec_context *ectx);
/**
* Gets the statistics for the given encodec context.
*
* The result is read-only (pointer to const); see struct encodec_statistics
* for the available counters.
*
* @param ectx The encodec context to get the statistics for.
* @return A pointer to the statistics struct.
*/
const struct encodec_statistics* encodec_get_statistics(
struct encodec_context *ectx);
/**
* Resets the statistics for the given encodec context.
*
* @param ectx The encodec context to reset the statistics for.
*/
void encodec_reset_statistics(
struct encodec_context *ectx);
/**
* @brief Frees the memory allocated for an encodec context.
*
* NOTE(review): whether a NULL `ectx` is accepted is not visible from this
* header -- confirm before passing one.
*
* @param ectx The encodec context to free.
*/
void encodec_free(
struct encodec_context *ectx);
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,109 @@
#pragma once
#include <vector>
#include "ggml.h"
#include "lstm.h"
// Weights for one encoder stage: a residual unit followed by a downsampling
// convolution ("res + downsample block at some ratio").
// Each `_w` / `_b` pair is the weight and bias tensor of one convolution.
struct encodec_encoder_block {
// conv1: first convolution of the residual branch (applied after an ELU in encodec_forward_encoder)
struct ggml_tensor *conv_1_w;
struct ggml_tensor *conv_1_b;
// conv2: second convolution of the residual branch
struct ggml_tensor *conv_2_w;
struct ggml_tensor *conv_2_b;
// shortcut: convolution applied to the stage input in encodec_forward_encoder
struct ggml_tensor *conv_sc_w;
struct ggml_tensor *conv_sc_b;
// downsampling layers
struct ggml_tensor *ds_conv_w;
struct ggml_tensor *ds_conv_b;
};
// Complete set of encoder weights: initial convolution, LSTM, per-stage blocks
// and final convolution.
struct encodec_encoder {
// first convolution, applied directly to the encoder input
struct ggml_tensor *init_conv_w;
struct ggml_tensor *init_conv_b;
// LSTM weights
encodec_lstm lstm;
// last convolution (presumably applied after the LSTM -- confirm against encodec_forward_encoder)
struct ggml_tensor *final_conv_w;
struct ggml_tensor *final_conv_b;
// per-stage weights; encodec_forward_encoder loops over blocks[0] through blocks[3]
std::vector<encodec_encoder_block> blocks;
};
struct ggml_tensor *encodec_forward_encoder(
const struct encodec_encoder *encoder, struct ggml_context *ctx0,
struct ggml_tensor *inp, const int * ratios, const int kernel_size, const int res_kernel_size,
const int stride) {
if (!inp) {
fprintf(stderr, "%s: null input tensor\n", __func__);
return NULL;
}
struct ggml_tensor *inpL = strided_conv_1d(
ctx0, inp, encoder->init_conv_w, encoder->init_conv_b, stride);
for (int layer_ix = 0; layer_ix < 4; layer_ix++) {
encodec_encoder_block block = encoder->blocks[layer_ix];
struct ggml_tensor *current = inpL;
// shortcut
struct ggml_tensor *shortcut = strided_conv_1d(
ctx0, inpL, block.conv_sc_w, block.conv_sc_b, stride);
// conv1
current = ggml_elu(ctx0, current);
current = strided_conv_1d(
ctx0, current, block.conv_1_w, block.conv_1_b, stride);