vall_e.cpp phonemizing and tokenizing
This commit is contained in:
parent
8516bab15c
commit
6bf59bbd8b
|
@ -3,7 +3,7 @@ CXX = g++
|
|||
INCS += -I./include
|
||||
LIBS += -L./libs
|
||||
|
||||
LINKS += -lggml -lggml-base -lllama -lencodec
|
||||
LINKS += -lggml -lggml-base -lllama -lencodec -lespeak-ng
|
||||
FLAGS += -march=native -O3
|
||||
|
||||
SRCS := $(shell find ./ -name "*.cpp")
|
||||
|
|
|
@ -8,7 +8,7 @@ At the moment it's ***very*** work in progress.
|
|||
|
||||
Populate `./include/` with the `ggml`, `llama.cpp`, and `encodec.cpp` headers.
|
||||
|
||||
Populate `./libs/` with the compiled libraries of `llama.cpp` and `encodec.cpp`.
|
||||
Populate `./libs/` with the compiled libraries of `llama.cpp`, `encodec.cpp`, and `espeak-ng`.
|
||||
|
||||
Run `make`.
|
||||
|
||||
|
@ -26,10 +26,10 @@ Run `make`.
|
|||
* [x] load the quantized model
|
||||
* [x] orchestrate the required embeddings
|
||||
* [x] juggle the output head / classifier properly
|
||||
* [ ] phonemize text
|
||||
* [x] phonemize text
|
||||
* with the help of espeak-ng
|
||||
* [ ] tokenize phonemes
|
||||
* the tokenizer is being a huge thorn on actual sequences
|
||||
* [x] tokenize phonemes
|
||||
* tokenize with `llama_tokenize` instead of a homebrewed method because the tokenizer is being a huge thorn
|
||||
* [x] load audio from disk
|
||||
* [x] encode audio
|
||||
* [x] sum embeddings for the `prom` and prior `resp`s
|
||||
|
|
113
vall_e.cpp/include/decoder.h
Normal file
113
vall_e.cpp/include/decoder.h
Normal file
|
@ -0,0 +1,113 @@
|
|||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#include "lstm.h"
|
||||
#include "utils.h"
|
||||
|
||||
|
||||
// Weight/bias tensors for one decoder stage. encodec_forward_decoder() applies
// the upsampling convolution first, then a residual unit built from conv1,
// conv2 and a convolutional shortcut.
struct encodec_decoder_block {
    // upsampling convolution (applied via strided_conv_transpose_1d)
    struct ggml_tensor *us_conv_w;  // weight
    struct ggml_tensor *us_conv_b;  // bias

    // first convolution of the residual unit
    struct ggml_tensor *conv_1_w;   // weight
    struct ggml_tensor *conv_1_b;   // bias

    // second convolution of the residual unit
    struct ggml_tensor *conv_2_w;   // weight
    struct ggml_tensor *conv_2_b;   // bias

    // shortcut convolution, summed with the conv2 output
    struct ggml_tensor *conv_sc_w;  // weight
    struct ggml_tensor *conv_sc_b;  // bias
};
|
||||
|
||||
struct encodec_decoder {
|
||||
struct ggml_tensor *init_conv_w;
|
||||
struct ggml_tensor *init_conv_b;
|
||||
|
||||
encodec_lstm lstm;
|
||||
|
||||
struct ggml_tensor *final_conv_w;
|
||||
struct ggml_tensor *final_conv_b;
|
||||
|
||||
std::vector<encodec_decoder_block> blocks;
|
||||
};
|
||||
|
||||
struct ggml_tensor *encodec_forward_decoder(
|
||||
const struct encodec_decoder *decoder, struct ggml_context *ctx0,
|
||||
struct ggml_tensor *quantized_out, const int *ratios, const int kernel_size, const int res_kernel_size,
|
||||
const int stride) {
|
||||
|
||||
if (!quantized_out) {
|
||||
fprintf(stderr, "%s: null input tensor\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct ggml_tensor *inpL = strided_conv_1d(
|
||||
ctx0, quantized_out, decoder->init_conv_w, decoder->init_conv_b, stride);
|
||||
|
||||
// lstm
|
||||
{
|
||||
struct ggml_tensor *cur = inpL;
|
||||
|
||||
const encodec_lstm lstm = decoder->lstm;
|
||||
|
||||
// first lstm layer
|
||||
char l0_prefix[7] = "dec_l0";
|
||||
struct ggml_tensor *hs1 = forward_pass_lstm_unilayer(
|
||||
ctx0, cur, lstm.l0_ih_w, lstm.l0_hh_w, lstm.l0_ih_b, lstm.l0_hh_b, l0_prefix);
|
||||
|
||||
// second lstm layer
|
||||
char l1_prefix[7] = "dec_l1";
|
||||
struct ggml_tensor *out = forward_pass_lstm_unilayer(
|
||||
ctx0, hs1, lstm.l1_ih_w, lstm.l1_hh_w, lstm.l1_ih_b, lstm.l1_hh_b, l1_prefix);
|
||||
|
||||
inpL = ggml_add(ctx0, inpL, out);
|
||||
}
|
||||
|
||||
for (int layer_ix = 0; layer_ix < 4; layer_ix++) {
|
||||
encodec_decoder_block block = decoder->blocks[layer_ix];
|
||||
|
||||
// upsampling layers
|
||||
inpL = ggml_elu(ctx0, inpL);
|
||||
|
||||
inpL = strided_conv_transpose_1d(
|
||||
ctx0, inpL, block.us_conv_w, block.us_conv_b, ratios[layer_ix]);
|
||||
|
||||
struct ggml_tensor *current = inpL;
|
||||
|
||||
// shortcut
|
||||
struct ggml_tensor *shortcut = strided_conv_1d(
|
||||
ctx0, inpL, block.conv_sc_w, block.conv_sc_b, stride);
|
||||
|
||||
// conv1
|
||||
current = ggml_elu(ctx0, current);
|
||||
|
||||
current = strided_conv_1d(
|
||||
ctx0, current, block.conv_1_w, block.conv_1_b, stride);
|
||||
|
||||
// conv2
|
||||
current = ggml_elu(ctx0, current);
|
||||
|
||||
current = strided_conv_1d(
|
||||
ctx0, current, block.conv_2_w, block.conv_2_b, stride);
|
||||
|
||||
// residual connection
|
||||
inpL = ggml_add(ctx0, current, shortcut);
|
||||
}
|
||||
|
||||
// final conv
|
||||
inpL = ggml_elu(ctx0, inpL);
|
||||
|
||||
struct ggml_tensor *decoded_inp = strided_conv_1d(
|
||||
ctx0, inpL, decoder->final_conv_w, decoder->final_conv_b, stride);
|
||||
|
||||
return decoded_inp;
|
||||
}
|
6434
vall_e.cpp/include/dr_wav.h
Normal file
6434
vall_e.cpp/include/dr_wav.h
Normal file
File diff suppressed because it is too large
Load Diff
184
vall_e.cpp/include/encodec.h
Normal file
184
vall_e.cpp/include/encodec.h
Normal file
|
@ -0,0 +1,184 @@
|
|||
/*
|
||||
╞══════════════════════════════════════════════════════════════════════════════╡
|
||||
│ Copyright 2024 Pierre-Antoine Bannier │
|
||||
│ │
|
||||
│ Permission to use, copy, modify, and/or distribute this software for │
|
||||
│ any purpose with or without fee is hereby granted, provided that the │
|
||||
│ above copyright notice and this permission notice appear in all copies. │
|
||||
│ │
|
||||
│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL │
|
||||
│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED │
|
||||
│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE │
|
||||
│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL │
|
||||
│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR │
|
||||
│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER │
|
||||
│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR │
|
||||
│ PERFORMANCE OF THIS SOFTWARE. │
|
||||
╚─────────────────────────────────────────────────────────────────────────────*/
|
||||
/*
|
||||
* This file contains the declarations of the structs and functions used in the encodec library.
|
||||
* The library provides functionality for audio compression and decompression using a custom model.
|
||||
* The model consists of an encoder, a quantizer and a decoder, each with their own set of parameters.
|
||||
* The library also provides functions for loading and freeing the model, as well as compressing and decompressing audio data.
|
||||
*
|
||||
*/
|
||||
#pragma once
|
||||
|
||||
#include "ggml-alloc.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
struct encodec_context;
|
||||
|
||||
/* Timing counters reported for an encodec context. */
struct encodec_statistics {
    int64_t t_load_us;    /* time taken to load the model (presumably microseconds, per the `_us` suffix) */
    int64_t t_compute_us; /* time taken to compute the model (presumably microseconds, per the `_us` suffix) */
};
|
||||
|
||||
/**
|
||||
* Loads an encodec model from the specified file path.
|
||||
*
|
||||
* @param model_path The file path to the encodec model.
|
||||
* @param offset The offset (in bytes) to the start of the model in the file.
|
||||
* @param n_gpu_layers The number of GPU layers to use.
|
||||
* @return A pointer to the encodec context struct.
|
||||
*/
|
||||
struct encodec_context *encodec_load_model(
|
||||
const char *model_path,
|
||||
const int offset,
|
||||
int n_gpu_layers);
|
||||
|
||||
/**
|
||||
* Sets the target bandwidth for the given encodec context.
|
||||
*
|
||||
* @param ectx The encodec context to set the target bandwidth for.
|
||||
* @param bandwidth The target bandwidth to set, in bits per second.
|
||||
*/
|
||||
void encodec_set_target_bandwidth(
|
||||
struct encodec_context *ectx,
|
||||
int bandwidth);
|
||||
|
||||
/**
|
||||
* Sets the sample rate for the given encodec context.
|
||||
*
|
||||
* @param ectx The encodec context to set the sample rate for.
|
||||
* @param sample_rate The sample rate to set.
|
||||
*/
|
||||
void encodec_set_sample_rate(
|
||||
struct encodec_context *ectx,
|
||||
int sample_rate);
|
||||
|
||||
/**
|
||||
* Reconstructs audio from raw audio data using the specified encodec context.
|
||||
*
|
||||
* @param ectx The encodec context to use for reconstruction.
|
||||
* @param raw_audio The raw audio data to reconstruct.
|
||||
* @param n_samples The number of samples in the raw audio buffer.
|
||||
* @param n_threads The number of threads to use for reconstruction.
|
||||
* @return True if the reconstruction was successful, false otherwise.
|
||||
*/
|
||||
bool encodec_reconstruct_audio(
|
||||
struct encodec_context *ectx,
|
||||
const float *raw_audio,
|
||||
const int n_samples,
|
||||
int n_threads);
|
||||
|
||||
/**
|
||||
* Compresses audio data using the specified encodec context.
|
||||
*
|
||||
* @param ectx The encodec context to use for compression.
|
||||
* @param raw_audio The raw audio data to compress.
|
||||
* @param n_samples The number of samples in the raw audio buffer.
|
||||
* @param n_threads The number of threads to use for compression.
|
||||
* @return True if the compression was successful, false otherwise.
|
||||
*/
|
||||
bool encodec_compress_audio(
|
||||
struct encodec_context *ectx,
|
||||
const float *raw_audio,
|
||||
const int n_samples,
|
||||
int n_threads);
|
||||
|
||||
/**
|
||||
* Decompresses audio data using the specified encodec context.
|
||||
*
|
||||
* @param ectx The encodec context to use for decompression.
|
||||
* @param codes The compressed audio data to decompress.
|
||||
* @param n_codes The number of codes in the codes buffer.
|
||||
* @param n_threads The number of threads to use for decompression.
|
||||
* @return True if the audio data was successfully decompressed, false otherwise.
|
||||
*/
|
||||
bool encodec_decompress_audio(
|
||||
struct encodec_context *ectx,
|
||||
const int32_t *codes,
|
||||
const int n_codes,
|
||||
int n_threads);
|
||||
|
||||
/**
|
||||
* Gets the audio data from the given encodec context.
|
||||
*
|
||||
* @param ectx The encodec context to get the audio data from.
|
||||
* @return A pointer to the audio data.
|
||||
*/
|
||||
float * encodec_get_audio(
|
||||
struct encodec_context *ectx);
|
||||
|
||||
/**
|
||||
* Gets the size of the audio data from the given encodec context.
|
||||
*
|
||||
* @param ectx The encodec context to get the audio size from.
|
||||
* @return The size of the audio data.
|
||||
*/
|
||||
int encodec_get_audio_size(
|
||||
struct encodec_context *ectx);
|
||||
|
||||
/**
|
||||
* Gets the code data from the given encodec context.
|
||||
*
|
||||
* @param ectx The encodec context to get the code data from.
|
||||
* @return A pointer to the code data.
|
||||
*/
|
||||
int32_t * encodec_get_codes(
|
||||
struct encodec_context *ectx);
|
||||
|
||||
/**
|
||||
* Gets the size of the code data from the given encodec context.
|
||||
*
|
||||
* @param ectx The encodec context to get the code size from.
|
||||
* @return The size of the code data.
|
||||
*/
|
||||
int encodec_get_codes_size(
|
||||
struct encodec_context *ectx);
|
||||
|
||||
/**
|
||||
* Gets the statistics for the given encodec context.
|
||||
*
|
||||
* @param ectx The encodec context to get the statistics for.
|
||||
* @return A pointer to the statistics struct.
|
||||
*/
|
||||
const struct encodec_statistics* encodec_get_statistics(
|
||||
struct encodec_context *ectx);
|
||||
|
||||
/**
|
||||
* Reset the statistics for the given encodec context.
|
||||
*
|
||||
* @param ectx The encodec context to reset the statistics for.
|
||||
*/
|
||||
void encodec_reset_statistics(
|
||||
struct encodec_context *ectx);
|
||||
|
||||
/**
|
||||
* @brief Frees the memory allocated for an encodec context.
|
||||
*
|
||||
* @param ectx The encodec context to free.
|
||||
*/
|
||||
void encodec_free(
|
||||
struct encodec_context *ectx);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
109
vall_e.cpp/include/encoder.h
Normal file
109
vall_e.cpp/include/encoder.h
Normal file
|
@ -0,0 +1,109 @@
|
|||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ggml.h"
|
||||
#include "lstm.h"
|
||||
|
||||
// Weight/bias tensors for one encoder stage: a residual unit (conv1, conv2 and
// a convolutional shortcut) plus a downsampling convolution at some ratio.
struct encodec_encoder_block {
    // first convolution of the residual unit
    struct ggml_tensor *conv_1_w;   // weight
    struct ggml_tensor *conv_1_b;   // bias

    // second convolution of the residual unit
    struct ggml_tensor *conv_2_w;   // weight
    struct ggml_tensor *conv_2_b;   // bias

    // shortcut convolution of the residual unit
    struct ggml_tensor *conv_sc_w;  // weight
    struct ggml_tensor *conv_sc_b;  // bias

    // downsampling convolution
    struct ggml_tensor *ds_conv_w;  // weight
    struct ggml_tensor *ds_conv_b;  // bias
};
|
||||
|
||||
struct encodec_encoder {
|
||||
struct ggml_tensor *init_conv_w;
|
||||
struct ggml_tensor *init_conv_b;
|
||||
|
||||
encodec_lstm lstm;
|
||||
|
||||
struct ggml_tensor *final_conv_w;
|
||||
struct ggml_tensor *final_conv_b;
|
||||
|
||||
std::vector<encodec_encoder_block> blocks;
|
||||
};
|
||||
|
||||
struct ggml_tensor *encodec_forward_encoder(
|
||||
const struct encodec_encoder *encoder, struct ggml_context *ctx0,
|
||||
struct ggml_tensor *inp, const int * ratios, const int kernel_size, const int res_kernel_size,
|
||||
const int stride) {
|
||||
|
||||
if (!inp) {
|
||||
fprintf(stderr, "%s: null input tensor\n", __func__);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct ggml_tensor *inpL = strided_conv_1d(
|
||||
ctx0, inp, encoder->init_conv_w, encoder->init_conv_b, stride);
|
||||
|
||||
for (int layer_ix = 0; layer_ix < 4; layer_ix++) {
|
||||
encodec_encoder_block block = encoder->blocks[layer_ix];
|
||||
|
||||
struct ggml_tensor *current = inpL;
|
||||
|
||||
// shortcut
|
||||
struct ggml_tensor *shortcut = strided_conv_1d(
|
||||
ctx0, inpL, block.conv_sc_w, block.conv_sc_b, stride);
|
||||
|
||||
// conv1
|
||||
current = ggml_elu(ctx0, current);
|
||||
|
||||
current = strided_conv_1d(
|
||||
ctx0, current, block.conv_1_w, block.conv_1_b, stride);
|
||||