2023-02-02 21:13:28 +00:00
import os
import argparse
import gradio as gr
2023-02-04 01:50:57 +00:00
import torch
2023-02-02 21:13:28 +00:00
import torchaudio
import time
2023-02-06 14:17:41 +00:00
import json
2023-02-06 16:32:09 +00:00
import base64
2023-02-04 01:50:57 +00:00
2023-02-02 21:13:28 +00:00
from datetime import datetime
from tortoise . api import TextToSpeech
from tortoise . utils . audio import load_audio , load_voice , load_voices
2023-02-05 06:17:51 +00:00
from tortoise . utils . text import split_and_recombine_text
2023-02-02 21:13:28 +00:00
2023-02-06 14:17:41 +00:00
import music_tag
2023-02-05 23:25:41 +00:00
def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, breathing_room, experimentals, progress=gr.Progress()):
    """Synthesize `text` with the selected voice and save the results to disk.

    The text is split into segments (by `delimiter` when it occurs in the
    text, otherwise with `split_and_recombine_text`), each segment is passed
    to `tts.tts`, and every generated clip is written under
    `./results/{voice}/{timestamp}/`. When more than one segment was
    generated, a `combined_{candidate}.wav` is written per candidate. The
    settings used are dumped to `input.txt` and embedded as JSON in the
    `lyrics` tag of every clip.

    Relies on the module-level `tts` and `args` objects.

    Args:
        text: Text to speak.
        delimiter: Segment separator; the two-character string `\\n` is
            treated as a newline. Empty means automatic splitting.
        emotion: Selected emotion; "None" adds no prefix, "Custom" uses `prompt`.
        prompt: Custom prefix used when `emotion` is "Custom".
        voice: Voice folder name, or "microphone" to use `mic_audio`.
        mic_audio: Path to the recorded audio (only used for "microphone").
        preset: Selected preset name; only recorded in the saved settings.
        seed: Seed to use; 0 means no fixed seed.
        candidates: Number of candidates requested from the model (`k`).
        num_autoregressive_samples, diffusion_iterations, temperature,
        diffusion_sampler, breathing_room: Forwarded to `tts.tts`.
        experimentals: Collection of enabled experimental flag labels.
        progress: Gradio progress tracker, forwarded to the model calls.

    Returns:
        A tuple `(sample_voice, output_voice, seed)`: the reference sample as
        `(22050, ndarray)` or None, the generated audio as `(24000, ndarray)`,
        and the first element of the extra values returned by the last
        `tts.tts` call.

    Raises:
        gr.Error: If "microphone" is selected without a recording, or if
            there is no non-blank text to generate.
    """
    is_microphone = voice == "microphone"
    latents_path = os.path.join(f'./tortoise/voices/{voice}/', 'cond_latents.pth')

    # Split the text first so an empty request fails before any model work.
    if delimiter == "\\n":
        delimiter = "\n"

    if delimiter != "" and delimiter in text:
        # Drop blank segments (e.g. from a trailing delimiter) instead of
        # sending empty lines to the model.
        texts = [segment for segment in text.split(delimiter) if segment.strip()]
    else:
        texts = split_and_recombine_text(text)

    if not texts:
        # Without this guard the code below would hit an unbound `gen`.
        raise gr.Error("Please provide some text to generate")

    if is_microphone:
        if mic_audio is None:
            raise gr.Error("Please provide audio from mic when choosing `microphone` as a voice input")
        mic = load_audio(mic_audio, 22050)
        voice_samples, conditioning_latents = [mic], None
    else:
        progress(0, desc="Loading voice...")
        voice_samples, conditioning_latents = load_voice(voice)

    if voice_samples is not None:
        sample_voice = voice_samples[0]
        conditioning_latents = tts.get_conditioning_latents(voice_samples, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size)
        # Only named voices have a folder to store the computed latents in.
        if not is_microphone:
            torch.save(conditioning_latents, latents_path)
        voice_samples = None
    else:
        sample_voice = None

    if seed == 0:
        seed = None

    start_time = time.time()

    settings = {
        'temperature': temperature, 'length_penalty': 1.0, 'repetition_penalty': 2.0,
        'top_p': .8,
        'cond_free_k': 2.0, 'diffusion_temperature': 1.0,

        'num_autoregressive_samples': num_autoregressive_samples,
        'diffusion_iterations': diffusion_iterations,

        'voice_samples': voice_samples,
        'conditioning_latents': conditioning_latents,
        'use_deterministic_seed': seed,
        'return_deterministic_state': True,
        'k': candidates,
        'diffusion_sampler': diffusion_sampler,
        'breathing_room': breathing_room,
        'progress': progress,
        'half_p': "Half Precision" in experimentals,
        'cond_free': "Conditioning-Free" in experimentals,
    }

    timestamp = int(time.time())
    outdir = f"./results/{voice}/{timestamp}/"
    os.makedirs(outdir, exist_ok=True)

    # Maps each clip's path (relative to outdir) to its audio and source text.
    audio_cache = {}
    for line, cut_text in enumerate(texts):
        if emotion == "Custom":
            # A blank custom prompt means "no prefix", not "I am really custom".
            if prompt.strip() != "":
                cut_text = f"[{prompt},] {cut_text}"
        elif emotion != "None":
            cut_text = f"[I am really {emotion.lower()},] {cut_text}"

        print(f"[{line + 1}/{len(texts)}] Generating line: {cut_text}")

        gen, additionals = tts.tts(cut_text, **settings)
        seed = additionals[0]

        if isinstance(gen, list):
            clips = [(f"candidate_{j}/result_{line}.wav", g) for j, g in enumerate(gen)]
        else:
            clips = [(f"result_{line}.wav", gen)]

        for name, clip in clips:
            audio = clip.squeeze(0).cpu()
            audio_cache[name] = {
                'audio': audio,
                'text': cut_text,
            }
            clip_path = os.path.join(outdir, name)
            os.makedirs(os.path.dirname(clip_path), exist_ok=True)
            torchaudio.save(clip_path, audio, 24000)

    # `gen` still holds the result of the last segment here.
    has_candidates = isinstance(gen, list)

    output_voice = None
    if len(texts) > 1:
        for candidate in range(candidates):
            if has_candidates:
                names = [f'candidate_{candidate}/result_{line}.wav' for line in range(len(texts))]
            else:
                names = [f'result_{line}.wav' for line in range(len(texts))]

            combined = torch.cat([audio_cache[name]['audio'] for name in names], dim=-1)
            torchaudio.save(os.path.join(outdir, f'combined_{candidate}.wav'), combined, 24000)

            # The UI previews the first combined candidate.
            if output_voice is None:
                output_voice = (24000, combined.squeeze().cpu().numpy())
    else:
        first = gen[0] if has_candidates else gen
        output_voice = (24000, first.squeeze().cpu().numpy())

    info = {
        'text': text,
        'delimiter': '\\n' if delimiter == "\n" else delimiter,
        'emotion': emotion,
        'prompt': prompt,
        'voice': voice,
        'mic_audio': mic_audio,
        'preset': preset,
        'seed': seed,
        'candidates': candidates,
        'num_autoregressive_samples': num_autoregressive_samples,
        'diffusion_iterations': diffusion_iterations,
        'temperature': temperature,
        'diffusion_sampler': diffusion_sampler,
        'breathing_room': breathing_room,
        'experimentals': experimentals,
        'time': time.time() - start_time,
    }

    # input.txt is written before the latents are attached to `info`.
    with open(os.path.join(outdir, 'input.txt'), 'w', encoding="utf-8") as f:
        f.write(json.dumps(info, indent='\t'))

    # The latents file only exists for named voices; the microphone path
    # never writes one, so opening it unconditionally would raise.
    if voice is not None and conditioning_latents is not None and not is_microphone and os.path.isfile(latents_path):
        with open(latents_path, 'rb') as f:
            info['latents'] = base64.b64encode(f.read()).decode("ascii")

    print(f"Saved to '{outdir}'")

    # Embed the settings in each clip, with that clip's own text.
    for path in audio_cache:
        info['text'] = audio_cache[path]['text']

        metadata = music_tag.load_file(os.path.join(outdir, path))
        metadata['lyrics'] = json.dumps(info)
        metadata.save()

    if sample_voice is not None:
        sample_voice = (22050, sample_voice.squeeze().cpu().numpy())

    return (
        sample_voice,
        output_voice,
        seed
    )
2023-02-05 03:53:46 +00:00
def update_presets(value):
    """Translate a preset name into updates for the two quality sliders.

    Returns a pair of Gradio updates, in the order (autoregressive samples,
    diffusion iterations). A name that is not a known preset yields two
    empty updates.
    """
    PRESETS = {
        'Ultra Fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30, 'cond_free': False},
        'Fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 80},
        'Standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200},
        'High Quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400},
    }

    chosen = PRESETS.get(value)
    if chosen is None:
        return (gr.update(), gr.update())

    samples_update = gr.update(value=chosen['num_autoregressive_samples'])
    iterations_update = gr.update(value=chosen['diffusion_iterations'])
    return (samples_update, iterations_update)
2023-02-02 21:13:28 +00:00
2023-02-06 16:32:09 +00:00
def read_metadata(file, save_latents=True):
    """Read the generation settings embedded in an audio file's `lyrics` tag.

    Args:
        file: Uploaded file object exposing a `.name` path, or None.
        save_latents: When True, a base64 `latents` entry is removed from the
            settings, decoded and written to `cond_latents.pth`.

    Returns:
        A tuple `(settings, latents_path)`. `settings` is the decoded JSON,
        or None when there is no file, no lyrics tag, or the tag is not
        valid JSON. `latents_path` is the path of the written latents file,
        or None when nothing was written.
    """
    j = None
    latents = None

    if file is None:
        return (j, latents)

    metadata = music_tag.load_file(file.name)
    if 'lyrics' in metadata:
        try:
            j = json.loads(str(metadata['lyrics']))
        except json.JSONDecodeError:
            # Empty or foreign lyrics tag: the file carries no settings.
            j = None

    # Only a JSON object can carry a 'latents' entry.
    if save_latents and isinstance(j, dict) and 'latents' in j:
        encoded = j.pop('latents')
        try:
            latents = base64.b64decode(encoded)
        except (ValueError, TypeError):
            # Malformed payload: keep the settings, skip the latents.
            latents = None

    if latents:
        # NOTE(review): this is an absolute path at the filesystem root,
        # unlike the './tortoise/voices/' paths used elsewhere - confirm
        # whether a relative location was intended.
        outdir = '/voices/.temp/'
        os.makedirs(outdir, exist_ok=True)
        target = os.path.join(outdir, 'cond_latents.pth')
        with open(target, 'wb') as f:
            f.write(latents)
        latents = target
    else:
        latents = None

    return (
        j,
        latents
    )
2023-02-06 16:00:44 +00:00
def copy_settings(file):
    """Return the settings stored in an audio file, one value per input widget.

    The values are returned in the same order as the inputs of `generate`.
    When the file carries no usable settings, or a field is missing, a no-op
    `gr.update()` is returned for that position so the current widget value
    is left alone. Returning a bare None would not match the number of
    outputs this handler is wired to.

    Args:
        file: Uploaded file object, or None.

    Returns:
        A 15-element tuple of setting values and/or no-op updates.
    """
    keys = (
        'text',
        'delimiter',
        'emotion',
        'prompt',
        'voice',
        'mic_audio',
        'preset',
        'seed',
        'candidates',
        'num_autoregressive_samples',
        'diffusion_iterations',
        'temperature',
        'diffusion_sampler',
        'breathing_room',
        'experimentals',
    )

    metadata, latents = read_metadata(file, save_latents=False)

    if not isinstance(metadata, dict):
        return tuple(gr.update() for _ in keys)

    return tuple(
        metadata[key] if key in metadata else gr.update()
        for key in keys
    )
2023-02-05 17:59:13 +00:00
def update_voices():
    """Re-read the voices folder and return a dropdown update with its entries.

    The choices are the entries of `tortoise/voices` followed by the
    "microphone" pseudo-voice.
    """
    voices_dir = os.path.join("tortoise", "voices")
    available = os.listdir(voices_dir)
    available.append("microphone")
    return gr.Dropdown.update(choices=available)
2023-02-05 03:53:46 +00:00
def main():
    """Build the two-tab Gradio interface and launch it.

    The "Generate" tab collects the inputs of `generate` and shows its three
    outputs; the "Utilities" tab reads settings back out of a generated audio
    file. Uses the module-level `args` for the `share` launch option.
    """

    def select_custom_emotion(value):
        # Any change to the custom prompt box selects the "Custom" emotion.
        return gr.update(value="Custom")

    def select_microphone_voice(value):
        # Any change to the microphone recording selects the "microphone" voice.
        return gr.update(value="microphone")

    with gr.Blocks() as webui:
        with gr.Tab("Generate"):
            with gr.Row():
                # Left column: what to say and in which voice.
                with gr.Column():
                    text = gr.Textbox(lines=4, label="Prompt")
                    delimiter = gr.Textbox(lines=1, label="Line Delimiter", placeholder="\\n")

                    emotion_choices = ["None", "Happy", "Sad", "Angry", "Disgusted", "Arrogant", "Custom"]
                    emotion = gr.Radio(
                        emotion_choices,
                        value="None",
                        label="Emotion",
                        type="value",
                        interactive=True
                    )
                    prompt = gr.Textbox(lines=1, label="Custom Emotion + Prompt (if selected)")

                    voice_choices = os.listdir(os.path.join("tortoise", "voices")) + ["microphone"]
                    voice = gr.Dropdown(
                        voice_choices,
                        label="Voice",
                        type="value",
                    )
                    mic_audio = gr.Audio(
                        label="Microphone Source",
                        source="microphone",
                        type="filepath",
                    )
                    refresh_voices = gr.Button(value="Refresh Voice List")

                    refresh_voices.click(
                        update_voices,
                        inputs=None,
                        outputs=voice
                    )
                    prompt.change(
                        fn=select_custom_emotion,
                        inputs=prompt,
                        outputs=emotion
                    )
                    mic_audio.change(
                        fn=select_microphone_voice,
                        inputs=mic_audio,
                        outputs=voice
                    )

                # Middle column: generation parameters.
                with gr.Column():
                    candidates = gr.Slider(value=1, minimum=1, maximum=6, step=1, label="Candidates")
                    seed = gr.Number(value=0, precision=0, label="Seed")

                    preset_choices = ["Ultra Fast", "Fast", "Standard", "High Quality", "None"]
                    preset = gr.Radio(
                        preset_choices,
                        value="None",
                        label="Preset",
                        type="value",
                    )
                    num_autoregressive_samples = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Samples")
                    diffusion_iterations = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Iterations")
                    temperature = gr.Slider(value=0.2, minimum=0, maximum=1, step=0.1, label="Temperature")
                    breathing_room = gr.Slider(value=12, minimum=1, maximum=32, step=1, label="Pause Size")

                    sampler_choices = ["P", "DDIM"]  # + ["K_Euler_A", "DPM++2M"]
                    diffusion_sampler = gr.Radio(
                        sampler_choices,
                        value="P",
                        label="Diffusion Samplers",
                        type="value",
                    )
                    experimentals = gr.CheckboxGroup(
                        ["Half Precision", "Conditioning-Free"],
                        value=["Conditioning-Free"],
                        label="Experimental Flags",
                    )

                    # Picking a preset drives the two quality sliders.
                    preset.change(
                        fn=update_presets,
                        inputs=preset,
                        outputs=[num_autoregressive_samples, diffusion_iterations],
                    )

                # Right column: results.
                with gr.Column():
                    selected_voice = gr.Audio(label="Source Sample")
                    output_audio = gr.Audio(label="Output")
                    used_seed = gr.Textbox(label="Seed", placeholder="0", interactive=False)

                    submit = gr.Button(value="Generate")
                    #stop = gr.Button(value="Stop")

            # Order must match the positional parameters of `generate`.
            input_settings = [
                text,
                delimiter,
                emotion,
                prompt,
                voice,
                mic_audio,
                preset,
                seed,
                candidates,
                num_autoregressive_samples,
                diffusion_iterations,
                temperature,
                diffusion_sampler,
                breathing_room,
                experimentals,
            ]

            submit_event = submit.click(
                generate,
                inputs=input_settings,
                outputs=[selected_voice, output_audio, used_seed],
            )
            #stop.click(fn=None, inputs=None, outputs=None, cancels=[submit_event])

        with gr.Tab("Utilities"):
            with gr.Row():
                with gr.Column():
                    audio_in = gr.File(type="file", label="Audio Input", file_types=["audio"])
                    copy_button = gr.Button(value="Copy Settings")
                with gr.Column():
                    metadata_out = gr.JSON(label="Audio Metadata")
                    latents_out = gr.File(type="binary", label="Voice Latents")

            audio_in.upload(
                fn=read_metadata,
                inputs=audio_in,
                outputs=[metadata_out, latents_out]
            )

            copy_button.click(
                copy_settings,
                inputs=audio_in,  # JSON elements cannot be used as inputs
                outputs=input_settings
            )

    webui.queue().launch(share=args.share)
2023-02-02 21:13:28 +00:00
if __name__ == "__main__":
    # Script entry point: parse the command line, load the model, start the UI.
    # `args` and `tts` are module globals read by generate() and main().
    parser = argparse.ArgumentParser()

    cli_options = (
        ("--share", {'action': 'store_true', 'help': "Lets Gradio return a public URL to use anywhere"}),
        ("--low-vram", {'action': 'store_true', 'help': "Disables some optimizations that increases VRAM usage"}),
        ("--cond-latent-max-chunk-size", {'type': int, 'default': 1000000, 'help': "Sets an upper limit to audio chunk size when computing conditioning latents"}),
    )
    for flag, options in cli_options:
        parser.add_argument(flag, **options)

    args = parser.parse_args()

    print("Initializating TorToiSe...")
    # --low-vram turns the model's minor optimizations off.
    tts = TextToSpeech(minor_optimizations=not args.low_vram)

    main()