Waveform from VALL-E is now converted directly to 16-bit PCM
This commit is contained in:
parent
81da764d6b
commit
4299be4646
@ -11,5 +11,8 @@ namespace ext {
|
||||
void UF_API stream( uf::Audio::Metadata& );
|
||||
void UF_API update( uf::Audio::Metadata& );
|
||||
void UF_API close( uf::Audio::Metadata& );
|
||||
|
||||
// Converts a float waveform to 16-bit PCM: each sample is clamped to [-1, 1] and scaled by 32767.
// Convenience overload that forwards the vector's data pointer and size to the pointer/length overload.
uf::stl::vector<int16_t> UF_API convertTo16bit( const uf::stl::vector<float>& );
|
||||
// Converts the given number of float samples, read from the pointer, to 16-bit PCM.
// Each sample is clamped to [-1, 1], multiplied by 32767 and cast to int16_t.
uf::stl::vector<int16_t> UF_API convertTo16bit( const float*, size_t );
|
||||
}
|
||||
}
|
||||
@ -9,9 +9,11 @@
|
||||
#include <uf/ext/oal/buffer.h>
|
||||
#include <uf/utils/time/time.h>
|
||||
|
||||
// shoved here because dependencies
|
||||
namespace pod {
|
||||
// this technically could either be a template or have the samples buffer be uint8_t and store the bit depth / an enum for the format but I only really care about supporting 16-bit PCMs
|
||||
// Plain container for an in-memory PCM clip plus the format fields needed to play it.
struct UF_API PCM {
|
||||
// Float waveform.
// NOTE(review): this view merges removed and added diff lines; `waveform` looks like the pre-change member that `samples` replaces — confirm against the real header.
uf::stl::vector<float> waveform;
|
||||
// 16-bit PCM samples; ext::pcm::open derives the byte size and duration from samples.size().
uf::stl::vector<int16_t> samples;
|
||||
// Sample rate; defaults to 24000. Stored as uint16_t, so values above 65535 cannot be represented.
uint16_t sampleRate = 24000;
|
||||
// Channel count; defaults to 1.
uint16_t channels = 1;
|
||||
};
|
||||
|
||||
@ -14,8 +14,8 @@ void ext::pcm::open( uf::Audio::Metadata& metadata, const pod::PCM& pcm ) {
|
||||
metadata.info.channels = pcm.channels;
|
||||
metadata.info.bitDepth = 16;
|
||||
metadata.info.frequency = pcm.sampleRate;
|
||||
metadata.info.duration = double(pcm.waveform.size()) / pcm.channels / pcm.sampleRate;
|
||||
metadata.info.size = pcm.waveform.size() * sizeof(int16_t);
|
||||
metadata.info.duration = double(pcm.samples.size()) / pcm.channels / pcm.sampleRate;
|
||||
metadata.info.size = pcm.samples.size() * sizeof(int16_t);
|
||||
|
||||
|
||||
// Determine OpenAL format
|
||||
@ -28,15 +28,9 @@ void ext::pcm::open( uf::Audio::Metadata& metadata, const pod::PCM& pcm ) {
|
||||
return;
|
||||
}
|
||||
|
||||
metadata.stream.handle = malloc( metadata.info.size );
|
||||
metadata.stream.consumed = 0;
|
||||
|
||||
// Convert float waveform to int16_t PCM
|
||||
int16_t* pcm16 = (int16_t*) metadata.stream.handle;
|
||||
for (size_t i = 0; i < pcm.waveform.size(); ++i) {
|
||||
float sample = std::clamp(pcm.waveform[i], -1.0f, 1.0f);
|
||||
pcm16[i] = static_cast<int16_t>(sample * 32767.0f);
|
||||
}
|
||||
metadata.stream.handle = malloc( metadata.info.size ); // to-do: use builtin memory pools i cant be assed
|
||||
memcpy( metadata.stream.handle, pcm.samples.data(), metadata.info.size );
|
||||
|
||||
// choose load or stream
|
||||
return metadata.settings.streamed ? ext::pcm::stream(metadata) : ext::pcm::load(metadata);
|
||||
@ -146,4 +140,16 @@ void ext::pcm::close(uf::Audio::Metadata& metadata) {
|
||||
}
|
||||
}
|
||||
|
||||
// Vector overload: hands the waveform's storage and element count to the pointer/length overload.
uf::stl::vector<int16_t> ext::pcm::convertTo16bit( const uf::stl::vector<float>& waveform ) {
	const float* first = waveform.data();
	const size_t count = waveform.size();
	return ext::pcm::convertTo16bit( first, count );
}
|
||||
// Converts `len` float samples starting at `data` into signed 16-bit PCM.
// Each sample is clamped to [-1.0, 1.0], scaled by 32767 and truncated toward zero.
// NaN samples are written as 0 (silence): std::clamp would pass a NaN through unchanged,
// and converting NaN to an integer type is undefined behaviour.
// A null `data` pointer yields an empty vector instead of being dereferenced.
uf::stl::vector<int16_t> ext::pcm::convertTo16bit( const float* data, size_t len ) {
	uf::stl::vector<int16_t> samples( data != nullptr ? len : 0 );

	for ( size_t i = 0; i < samples.size(); ++i ) {
		float sample = data[i];
		if ( sample != sample ) sample = 0.0f; // NaN is the only value that compares unequal to itself
		sample = std::clamp( sample, -1.0f, 1.0f );
		samples[i] = static_cast<int16_t>( sample * 32767.0f );
	}

	return samples;
}
|
||||
|
||||
#endif
|
||||
@ -1,5 +1,6 @@
|
||||
#include <uf/ext/vall_e/vall_e.h>
|
||||
#include <uf/utils/time/time.h>
|
||||
#include <uf/ext/audio/pcm.h>
|
||||
|
||||
#if UF_USE_VALL_E
|
||||
namespace {
|
||||
@ -32,14 +33,13 @@ pod::PCM ext::vall_e::generate( const std::string& text, const std::string& prom
|
||||
args.language = lang == "" ? "en" : lang;
|
||||
args.task = "tts";
|
||||
args.modality = MODALITY_NAR_LEN;
|
||||
args.max_steps = 30;
|
||||
args.max_steps = 15;
|
||||
args.max_duration = MAX_DURATION;
|
||||
|
||||
auto inputs = vall_e_prepare_inputs( ::ctx, args.text, args.prompt_path, args.language );
|
||||
auto output_audio_codes = vall_e_generate( ::ctx, inputs, args.max_steps, args.max_duration, args.modality );
|
||||
auto waveform = decode_audio( ::ctx->encodec.ctx, output_audio_codes );
|
||||
|
||||
pcm.waveform.insert( pcm.waveform.end(), waveform.begin(), waveform.end() ); // because technically im using different vector classes
|
||||
pcm.samples = ext::pcm::convertTo16bit( waveform.data(), waveform.size() ); // need to cringily pass it this way because theyre different vector classes
|
||||
pcm.sampleRate = 24000; // should deduce from the backend in the event I ever get around to porting the other models
|
||||
pcm.channels = 1;
|
||||
|
||||
|
||||
@ -2,9 +2,6 @@
|
||||
#include <uf/utils/string/ext.h>
|
||||
|
||||
#if UF_USE_OPENAL
|
||||
#include <uf/ext/audio/vorbis.h>
|
||||
#include <uf/ext/audio/wav.h>
|
||||
#include <uf/ext/audio/pcm.h>
|
||||
#include <uf/ext/oal/oal.h>
|
||||
#endif
|
||||
|
||||
|
||||
@ -1,11 +1,6 @@
|
||||
#include <uf/utils/audio/audio.h>
|
||||
#include <uf/utils/string/ext.h>
|
||||
|
||||
#if defined(UF_USE_OPENAL)
|
||||
#include <uf/ext/vorbis/vorbis.h>
|
||||
#include <uf/ext/oal/oal.h>
|
||||
#endif
|
||||
|
||||
// Destructor: delegates teardown to cleanup(), passing `true`.
// NOTE(review): the meaning of the flag is defined by cleanup(), which is not visible here.
uf::SoundEmitter::~SoundEmitter() {
	cleanup( true );
}
|
||||
|
||||
@ -739,13 +739,11 @@ void EXT_API ext::initialize() {
|
||||
uf::thread::queue( uf::thread::asyncThreadName, [=](){
|
||||
auto waveform = ext::vall_e::generate( text, prom );
|
||||
if ( callback != "" ) {
|
||||
UF_MSG_DEBUG("Calling hook: {}", callback);
|
||||
uf::hooks.call( callback, waveform );
|
||||
}
|
||||
if ( play ) {
|
||||
uf::Audio audio;
|
||||
audio.load( waveform );
|
||||
audio.setVolume( 4.0f );
|
||||
audio.play();
|
||||
}
|
||||
});
|
||||
|
||||
Loading…
Reference in New Issue
Block a user