waveform from vall-e is directly converted to 16-bit pcm

This commit is contained in:
ecker 2025-08-03 19:55:35 -05:00
parent 81da764d6b
commit 4299be4646
7 changed files with 25 additions and 24 deletions

View File

@ -11,5 +11,8 @@ namespace ext {
void UF_API stream( uf::Audio::Metadata& );
void UF_API update( uf::Audio::Metadata& );
void UF_API close( uf::Audio::Metadata& );
// Converts a floating-point waveform into signed 16-bit PCM samples; each sample is clamped to [-1, 1] and scaled by 32767.
uf::stl::vector<int16_t> UF_API convertTo16bit( const uf::stl::vector<float>& );
// Same conversion for a raw buffer: pointer to the first float sample, then the number of samples to read.
uf::stl::vector<int16_t> UF_API convertTo16bit( const float*, size_t );
}
}

View File

@ -9,9 +9,11 @@
#include <uf/ext/oal/buffer.h>
#include <uf/utils/time/time.h>
// shoved here because dependencies
namespace pod {
// This could instead be a template, or keep the samples in a uint8_t buffer alongside a bit depth / format enum, but only 16-bit PCM needs to be supported for now.
// Plain container for raw audio samples plus the format information ext::pcm::open copies into the audio metadata.
struct UF_API PCM {
// NOTE(review): this diff view lists both the old and the new sample field; per the commit title the float waveform is replaced by the 16-bit samples below — confirm only one of the two remains in the real header.
uf::stl::vector<float> waveform;
// Signed 16-bit PCM samples; ext::pcm::open computes the duration as samples.size() / channels / sampleRate and copies this buffer into the stream handle.
uf::stl::vector<int16_t> samples;
// Sample rate, copied into metadata.info.frequency by ext::pcm::open; defaults to 24000.
uint16_t sampleRate = 24000;
// Channel count, copied into metadata.info.channels by ext::pcm::open; defaults to 1.
uint16_t channels = 1;
};

View File

@ -14,8 +14,8 @@ void ext::pcm::open( uf::Audio::Metadata& metadata, const pod::PCM& pcm ) {
metadata.info.channels = pcm.channels;
metadata.info.bitDepth = 16;
metadata.info.frequency = pcm.sampleRate;
metadata.info.duration = double(pcm.waveform.size()) / pcm.channels / pcm.sampleRate;
metadata.info.size = pcm.waveform.size() * sizeof(int16_t);
metadata.info.duration = double(pcm.samples.size()) / pcm.channels / pcm.sampleRate;
metadata.info.size = pcm.samples.size() * sizeof(int16_t);
// Determine OpenAL format
@ -28,15 +28,9 @@ void ext::pcm::open( uf::Audio::Metadata& metadata, const pod::PCM& pcm ) {
return;
}
metadata.stream.handle = malloc( metadata.info.size );
metadata.stream.consumed = 0;
// Convert float waveform to int16_t PCM
int16_t* pcm16 = (int16_t*) metadata.stream.handle;
for (size_t i = 0; i < pcm.waveform.size(); ++i) {
float sample = std::clamp(pcm.waveform[i], -1.0f, 1.0f);
pcm16[i] = static_cast<int16_t>(sample * 32767.0f);
}
metadata.stream.handle = malloc( metadata.info.size ); // to-do: use builtin memory pools i cant be assed
memcpy( metadata.stream.handle, pcm.samples.data(), metadata.info.size );
// choose load or stream
return metadata.settings.streamed ? ext::pcm::stream(metadata) : ext::pcm::load(metadata);
@ -146,4 +140,16 @@ void ext::pcm::close(uf::Audio::Metadata& metadata) {
}
}
// Vector overload: hands the waveform's contiguous storage and element count to the pointer/length overload.
uf::stl::vector<int16_t> ext::pcm::convertTo16bit( const uf::stl::vector<float>& waveform ) {
	const float* first = waveform.data();
	const size_t count = waveform.size();
	return ext::pcm::convertTo16bit( first, count );
}
// Converts `len` floating-point samples starting at `data` into signed 16-bit PCM.
//  - each sample is clamped to [-1.0, 1.0] and scaled by 32767 (the cast truncates toward zero)
//  - NaN samples are written as 0: std::clamp hands NaN back unchanged, and casting NaN to an integer type is undefined behaviour
//  - a null `data` pointer yields an empty vector instead of being dereferenced
// Returns a vector holding exactly `len` converted samples (empty when `data` is null or `len` is 0).
uf::stl::vector<int16_t> ext::pcm::convertTo16bit( const float* data, size_t len ) {
	if ( data == nullptr ) return uf::stl::vector<int16_t>();

	uf::stl::vector<int16_t> samples( len );
	for ( size_t i = 0; i < len; ++i ) {
		const float raw = data[i];
		// NaN is the only value that does not compare equal to itself
		const float sample = ( raw == raw ) ? std::clamp( raw, -1.0f, 1.0f ) : 0.0f;
		samples[i] = static_cast<int16_t>( sample * 32767.0f );
	}
	return samples;
}
#endif

View File

@ -1,5 +1,6 @@
#include <uf/ext/vall_e/vall_e.h>
#include <uf/utils/time/time.h>
#include <uf/ext/audio/pcm.h>
#if UF_USE_VALL_E
namespace {
@ -32,14 +33,13 @@ pod::PCM ext::vall_e::generate( const std::string& text, const std::string& prom
args.language = lang == "" ? "en" : lang;
args.task = "tts";
args.modality = MODALITY_NAR_LEN;
args.max_steps = 30;
args.max_steps = 15;
args.max_duration = MAX_DURATION;
auto inputs = vall_e_prepare_inputs( ::ctx, args.text, args.prompt_path, args.language );
auto output_audio_codes = vall_e_generate( ::ctx, inputs, args.max_steps, args.max_duration, args.modality );
auto waveform = decode_audio( ::ctx->encodec.ctx, output_audio_codes );
pcm.waveform.insert( pcm.waveform.end(), waveform.begin(), waveform.end() ); // because technically im using different vector classes
pcm.samples = ext::pcm::convertTo16bit( waveform.data(), waveform.size() ); // need to cringily pass it this way because theyre different vector classes
pcm.sampleRate = 24000; // should deduce from the backend in the event I ever get around to porting the other models
pcm.channels = 1;

View File

@ -2,9 +2,6 @@
#include <uf/utils/string/ext.h>
#if UF_USE_OPENAL
#include <uf/ext/audio/vorbis.h>
#include <uf/ext/audio/wav.h>
#include <uf/ext/audio/pcm.h>
#include <uf/ext/oal/oal.h>
#endif

View File

@ -1,11 +1,6 @@
#include <uf/utils/audio/audio.h>
#include <uf/utils/string/ext.h>
#if defined(UF_USE_OPENAL)
#include <uf/ext/vorbis/vorbis.h>
#include <uf/ext/oal/oal.h>
#endif
uf::SoundEmitter::~SoundEmitter() {
this->cleanup(true);
}

View File

@ -739,13 +739,11 @@ void EXT_API ext::initialize() {
uf::thread::queue( uf::thread::asyncThreadName, [=](){
auto waveform = ext::vall_e::generate( text, prom );
if ( callback != "" ) {
UF_MSG_DEBUG("Calling hook: {}", callback);
uf::hooks.call( callback, waveform );
}
if ( play ) {
uf::Audio audio;
audio.load( waveform );
audio.setVolume( 4.0f );
audio.play();
}
});