2023-02-02 21:13:28 +00:00
import os
import argparse
import time
2023-02-06 14:17:41 +00:00
import json
2023-02-06 16:32:09 +00:00
import base64
2023-02-06 21:43:01 +00:00
import re
import urllib . request
import torch
import torchaudio
import music_tag
import gradio as gr
2023-02-07 21:44:16 +00:00
import gradio . utils
2023-02-04 01:50:57 +00:00
2023-02-02 21:13:28 +00:00
from datetime import datetime
2023-02-06 21:43:01 +00:00
2023-02-09 20:42:38 +00:00
from fastapi import FastAPI
2023-02-02 21:13:28 +00:00
from tortoise . api import TextToSpeech
from tortoise . utils . audio import load_audio , load_voice , load_voices
2023-02-05 06:17:51 +00:00
from tortoise . utils . text import split_and_recombine_text
2023-02-02 21:13:28 +00:00
2023-02-07 20:55:56 +00:00
def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, breathing_room, cvvp_weight, experimentals, progress=gr.Progress(track_tqdm=True)):
    """Run TorToiSe over `text` and write the results under ./results/<voice>/<timestamp>/.

    Returns a (sample_voice, output_voice, seed) tuple for the Gradio outputs.
    Relies on the module-level globals `tts` (the engine) and `args` (exec settings).
    """
    # The engine is loaded after the UI launches; refuse to run until it exists.
    try:
        tts
    except NameError:
        raise gr.Error("TTS is still initializing...")

    if voice != "microphone":
        voices = [voice]
    else:
        voices = []

    # Resolve the voice into (samples, latents).
    if voice == "microphone":
        if mic_audio is None:
            raise gr.Error("Please provide audio from mic when choosing `microphone` as a voice input")
        mic = load_audio(mic_audio, tts.input_sample_rate)
        voice_samples, conditioning_latents = [mic], None
    else:
        progress(0, desc="Loading voice...")
        voice_samples, conditioning_latents = load_voice(voice)

    if voice_samples is not None:
        sample_voice = voice_samples[0].squeeze().cpu()

        conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size)
        if len(conditioning_latents) == 4:
            # Drop the fourth (mel) entry; downstream only needs the first three.
            conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None)

        if voice != "microphone":
            # Cache the computed latents next to the voice so future runs skip this step.
            torch.save(conditioning_latents, f'./tortoise/voices/{voice}/cond_latents.pth')
        voice_samples = None
    else:
        sample_voice = None

    # Seed 0 means "random" in the UI.
    if seed == 0:
        seed = None

    if conditioning_latents is not None and len(conditioning_latents) == 2 and cvvp_weight > 0:
        print("Requesting weighing against CVVP weight, but voice latents are missing some extra data. Please regenerate your voice latents.")
        cvvp_weight = 0

    settings = {
        'temperature': temperature, 'length_penalty': 1.0, 'repetition_penalty': 2.0,
        'top_p': .8,
        'cond_free_k': 2.0, 'diffusion_temperature': 1.0,

        'num_autoregressive_samples': num_autoregressive_samples,
        'sample_batch_size': args.sample_batch_size,
        'diffusion_iterations': diffusion_iterations,

        'voice_samples': voice_samples,
        'conditioning_latents': conditioning_latents,
        'use_deterministic_seed': seed,
        'return_deterministic_state': True,
        'k': candidates,
        'diffusion_sampler': diffusion_sampler,
        'breathing_room': breathing_room,
        'progress': progress,
        'half_p': "Half Precision" in experimentals,
        'cond_free': "Conditioning-Free" in experimentals,
        'cvvp_amount': cvvp_weight,
    }

    # The UI lets the user type a literal "\n" to mean newline-delimited lines.
    if delimiter == "\\n":
        delimiter = "\n"

    if delimiter != "" and delimiter in text:
        texts = text.split(delimiter)
    else:
        texts = split_and_recombine_text(text)

    start_time = time.time()

    outdir = f"./results/{voice}/{int(start_time)}/"
    os.makedirs(outdir, exist_ok=True)

    audio_cache = {}

    # Only resample / adjust volume when the exec settings actually ask for it.
    resampler = torchaudio.transforms.Resample(
        tts.output_sample_rate,
        args.output_sample_rate,
        lowpass_filter_width=16,
        rolloff=0.85,
        resampling_method="kaiser_window",
        beta=8.555504641634386,
    ) if tts.output_sample_rate != args.output_sample_rate else None
    volume_adjust = torchaudio.transforms.Vol(gain=args.output_volume, gain_type="amplitude") if args.output_volume != 1 else None

    for line, cut_text in enumerate(texts):
        # Prefix the emotion prompt TorToiSe uses to steer delivery.
        if emotion == "Custom":
            if prompt.strip() != "":
                cut_text = f"[{prompt},] {cut_text}"
        else:
            cut_text = f"[I am really {emotion.lower()},] {cut_text}"

        print(f"[{str(line+1)}/{str(len(texts))}] Generating line: {cut_text}")

        gen, additionals = tts.tts(cut_text, **settings)
        seed = additionals[0]

        if isinstance(gen, list):
            # Multiple candidates: store each under its own subdirectory.
            for j, g in enumerate(gen):
                os.makedirs(f'{outdir}/candidate_{j}', exist_ok=True)
                audio_cache[f"candidate_{j}/result_{line}.wav"] = {
                    'audio': g,
                    'text': cut_text,
                }
        else:
            audio_cache[f"result_{line}.wav"] = {
                'audio': gen,
                'text': cut_text,
            }

    # Post-process (resample / volume) and write every generated clip to disk.
    for k in audio_cache:
        audio = audio_cache[k]['audio'].squeeze(0).cpu()
        if resampler is not None:
            audio = resampler(audio)
        if volume_adjust is not None:
            audio = volume_adjust(audio)

        audio_cache[k]['audio'] = audio
        torchaudio.save(f'{outdir}/{k}', audio, args.output_sample_rate)

    # Stitch multi-line runs into one combined clip per candidate.
    output_voice = None
    if len(texts) > 1:
        for candidate in range(candidates):
            audio_clips = []
            for line in range(len(texts)):
                if isinstance(gen, list):
                    audio = audio_cache[f'candidate_{candidate}/result_{line}.wav']['audio']
                else:
                    audio = audio_cache[f'result_{line}.wav']['audio']
                audio_clips.append(audio)

            audio = torch.cat(audio_clips, dim=-1)
            torchaudio.save(f'{outdir}/combined_{candidate}.wav', audio, args.output_sample_rate)

            audio = audio.squeeze(0).cpu()
            audio_cache[f'combined_{candidate}.wav'] = {
                'audio': audio,
                'text': cut_text,
            }

            if output_voice is None:
                output_voice = f'{outdir}/combined_{candidate}.wav'
                # output_voice = audio
    else:
        if isinstance(gen, list):
            output_voice = f'{outdir}/candidate_0/result_0.wav'
            #output_voice = gen[0]
        else:
            output_voice = f'{outdir}/result_0.wav'
            #output_voice = gen

    # Snapshot of every knob used for this run; embedded into outputs and saved to disk.
    info = {
        'text': text,
        'delimiter': '\\n' if delimiter == "\n" else delimiter,
        'emotion': emotion,
        'prompt': prompt,
        'voice': voice,
        'mic_audio': mic_audio,
        'seed': seed,
        'candidates': candidates,
        'num_autoregressive_samples': num_autoregressive_samples,
        'diffusion_iterations': diffusion_iterations,
        'temperature': temperature,
        'diffusion_sampler': diffusion_sampler,
        'breathing_room': breathing_room,
        'cvvp_weight': cvvp_weight,
        'experimentals': experimentals,
        'time': time.time() - start_time,
    }

    with open(f'{outdir}/input.json', 'w', encoding="utf-8") as f:
        f.write(json.dumps(info, indent='\t'))

    # Embed the cached voice latents, if any were saved for this voice.
    # Fixed: the "microphone" voice never writes cond_latents.pth (see the guard
    # above), so check the file actually exists before opening it.
    latents_path = f'./tortoise/voices/{voice}/cond_latents.pth'
    if voice is not None and conditioning_latents is not None and os.path.exists(latents_path):
        with open(latents_path, 'rb') as f:
            info['latents'] = base64.b64encode(f.read()).decode("ascii")

    if args.embed_output_metadata:
        for path in audio_cache:
            info['text'] = audio_cache[path]['text']

            metadata = music_tag.load_file(f"{outdir}/{path}")
            metadata['lyrics'] = json.dumps(info)
            metadata.save()

    #if output_voice is not None:
    #    output_voice = (args.output_sample_rate, output_voice.numpy())

    if sample_voice is not None:
        sample_voice = (tts.input_sample_rate, sample_voice.numpy())

    print(f"Generation took {info['time']} seconds, saved to '{outdir}'\n")

    info['seed'] = settings['use_deterministic_seed']
    # Fixed: 'latents' is only present when the file existed above; pop() avoids a KeyError.
    info.pop('latents', None)
    with open(f'./config/generate.json', 'w', encoding="utf-8") as f:
        f.write(json.dumps(info, indent='\t'))

    return (
        sample_voice,
        output_voice,
        seed
    )
2023-02-05 03:53:46 +00:00
def update_presets(value):
    """Map a quality-preset name to Gradio updates for the sample/iteration sliders."""
    presets = {
        'Ultra Fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30, 'cond_free': False},
        'Fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 80},
        'Standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200},
        'High Quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400},
    }
    chosen = presets.get(value)
    if chosen is None:
        # Unknown preset: leave both sliders untouched.
        return (gr.update(), gr.update())
    return (
        gr.update(value=chosen['num_autoregressive_samples']),
        gr.update(value=chosen['diffusion_iterations']),
    )
2023-02-02 21:13:28 +00:00
2023-02-06 21:43:01 +00:00
def read_generate_settings(file, save_latents=True):
    """Extract generation settings (and optionally voice latents) from an output file.

    `file` is either a Gradio upload object (settings embedded as JSON in the
    audio's 'lyrics' tag) or a path string ending in ".json".
    Returns a (settings_dict_or_None, latents_path_or_None) tuple.
    """
    j = None
    latents = None
    if file is not None:
        if hasattr(file, 'name'):
            # Uploaded audio: settings live in the embedded lyrics metadata tag.
            metadata = music_tag.load_file(file.name)
            if 'lyrics' in metadata:
                j = json.loads(str(metadata['lyrics']))
        elif file[-5:] == ".json":
            with open(file, 'r') as f:
                j = json.load(f)

    # Fixed: j may still be None (e.g. a path that is neither an upload nor a
    # .json file); probing `'latents' in j` would raise TypeError.
    if j is not None and 'latents' in j and save_latents:
        latents = base64.b64decode(j['latents'])
        del j['latents']

    if latents and save_latents:
        # Materialize the decoded latents as a temp voice file for the engine.
        outdir = './voices/.temp/'
        os.makedirs(outdir, exist_ok=True)
        with open(f'{outdir}/cond_latents.pth', 'wb') as f:
            f.write(latents)
        latents = f'{outdir}/cond_latents.pth'

    return (
        j,
        latents
    )
2023-02-06 16:00:44 +00:00
2023-02-06 21:43:01 +00:00
def import_generate_settings(file="./config/generate.json"):
    """Load saved generation settings and map them onto the UI input fields, in order."""
    settings, _ = read_generate_settings(file, save_latents=False)
    if settings is None:
        return None

    # Order must match the `input_settings` list wired up in setup_gradio().
    fields = (
        'text',
        'delimiter',
        'emotion',
        'prompt',
        'voice',
        'mic_audio',
        'seed',
        'candidates',
        'num_autoregressive_samples',
        'diffusion_iterations',
        'temperature',
        'diffusion_sampler',
        'breathing_room',
        'cvvp_weight',
        'experimentals',
    )
    # Missing keys become None so every UI element still receives a value.
    return tuple(settings.get(field) for field in fields)
2023-02-06 21:43:01 +00:00
def curl(url):
    """Fetch `url` and return its parsed JSON payload, or None on any failure."""
    try:
        req = urllib.request.Request(url, headers={'User-Agent': 'Python'})
        # Fixed: use the response as a context manager so the connection is
        # closed even when decoding or JSON parsing raises.
        with urllib.request.urlopen(req) as conn:
            data = conn.read()
        return json.loads(data.decode())
    except Exception as e:
        # Best-effort fetch: callers treat None as "could not fetch".
        print(e)
        return None
def check_for_updates():
    """Compare the local git commit against the remote branch tip (gitea API only)."""
    fetch_head = './.git/FETCH_HEAD'
    if not os.path.isfile(fetch_head):
        print("Cannot check for updates: not from a git repo")
        return False

    with open(fetch_head, 'r', encoding="utf-8") as f:
        head = f.read()

    matches = re.findall(r"^([a-f0-9]+).+?https:\/\/(.+?)\/(.+?)\/(.+?)\n", head)
    if matches is None or len(matches) == 0:
        print("Cannot check for updates: cannot parse FETCH_HEAD")
        return False

    local, host, owner, repo = matches[0]

    # This branches endpoint only exists on gitea instances.
    res = curl(f"https://{host}/api/v1/repos/{owner}/{repo}/branches/")
    if res is None or len(res) == 0:
        print("Cannot check for updates: cannot fetch from remote")
        return False

    remote = res[0]["commit"]["id"]
    if remote != local:
        print(f"New version found: {local[:8]} => {remote[:8]}")
        return True

    return False
2023-02-10 03:02:09 +00:00
def reload_tts():
    """Re-initialize the TTS engine (e.g. after toggling Low VRAM in settings)."""
    # Fixed: without `global`, the fresh instance was bound to a local name and
    # immediately discarded, so the button had no effect.
    global tts
    tts = setup_tortoise()
2023-02-05 17:59:13 +00:00
def update_voices():
    """Refresh the voice dropdown from the voices folder, keeping the mic option last."""
    choices = sorted(os.listdir("./tortoise/voices"))
    choices = choices + ["microphone"]
    return gr.Dropdown.update(choices=choices)
2023-02-06 21:43:01 +00:00
2023-02-10 03:02:09 +00:00
def export_exec_settings(share, listen, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, cond_latent_max_chunk_size, sample_batch_size, concurrency_count, output_sample_rate, output_volume):
    """Persist runtime launch settings onto the global `args` and into ./config/exec.json."""
    # Insertion order matters: it determines the key order of the saved JSON.
    values = {
        'share': share,
        'listen': listen,
        'low_vram': low_vram,
        'check_for_updates': check_for_updates,
        'models_from_local_only': models_from_local_only,
        'cond_latent_max_chunk_size': cond_latent_max_chunk_size,
        'sample_batch_size': sample_batch_size,
        'embed_output_metadata': embed_output_metadata,
        'latents_lean_and_mean': latents_lean_and_mean,
        'concurrency_count': concurrency_count,
        'output_sample_rate': output_sample_rate,
        'output_volume': output_volume,
    }

    # Mirror every value onto the global args object used elsewhere.
    for name, value in values.items():
        setattr(args, name, value)

    # The on-disk config uses dashed keys to mirror the command-line flag names.
    settings = {name.replace('_', '-'): value for name, value in values.items()}
    with open(f'./config/exec.json', 'w', encoding="utf-8") as f:
        f.write(json.dumps(settings, indent='\t'))
2023-02-09 20:42:38 +00:00
def setup_args():
    """Parse command-line arguments, layered on hard-coded defaults and any
    saved overrides in ./config/exec.json (written by export_exec_settings).

    Also derives `embed_output_metadata` from the negative flag and splits
    `--listen` ("host:port/path", all parts optional) into listen_host,
    listen_port and listen_path. Returns the populated argparse namespace.
    """
    default_arguments = {
        'share': False,
        'listen': None,
        'check-for-updates': False,
        'models-from-local-only': False,
        'low-vram': False,
        'sample-batch-size': None,
        'embed-output-metadata': True,
        'latents-lean-and-mean': True,
        'cond-latent-max-chunk-size': 1000000,
        'concurrency-count': 2,
        'output-sample-rate': 44100,
        'output-volume': 1,
    }

    # Saved exec settings override the hard-coded defaults (but not CLI flags).
    if os.path.isfile('./config/exec.json'):
        with open(f'./config/exec.json', 'r', encoding="utf-8") as f:
            overrides = json.load(f)
            for k in overrides:
                default_arguments[k] = overrides[k]

    parser = argparse.ArgumentParser()
    parser.add_argument("--share", action='store_true', default=default_arguments['share'], help="Lets Gradio return a public URL to use anywhere")
    parser.add_argument("--listen", default=default_arguments['listen'], help="Path for Gradio to listen on")
    parser.add_argument("--check-for-updates", action='store_true', default=default_arguments['check-for-updates'], help="Checks for update on startup")
    parser.add_argument("--models-from-local-only", action='store_true', default=default_arguments['models-from-local-only'], help="Only loads models from disk, does not check for updates for models")
    parser.add_argument("--low-vram", action='store_true', default=default_arguments['low-vram'], help="Disables some optimizations that increases VRAM usage")
    parser.add_argument("--no-embed-output-metadata", action='store_false', default=not default_arguments['embed-output-metadata'], help="Disables embedding output metadata into resulting WAV files for easily fetching its settings used with the web UI (data is stored in the lyrics metadata tag)")
    parser.add_argument("--latents-lean-and-mean", action='store_true', default=default_arguments['latents-lean-and-mean'], help="Exports the bare essentials for latents.")
    parser.add_argument("--cond-latent-max-chunk-size", default=default_arguments['cond-latent-max-chunk-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents")
    # Fixed: the help text was a copy-paste of the chunk-size flag's description.
    parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets how many autoregressive samples to batch per step")
    parser.add_argument("--concurrency-count", type=int, default=default_arguments['concurrency-count'], help="How many Gradio events to process at once")
    parser.add_argument("--output-sample-rate", type=int, default=default_arguments['output-sample-rate'], help="Sample rate to resample the output to (from 24KHz)")
    parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output")
    args = parser.parse_args()

    # Flip the negative store_false flag into the positive name callers read.
    args.embed_output_metadata = not args.no_embed_output_metadata

    args.listen_host = None
    args.listen_port = None
    args.listen_path = None
    if args.listen is not None:
        match = re.findall(r"^(?:(.+?):(\d+))?(\/.+?)?$", args.listen)[0]
        args.listen_host = match[0] if match[0] != "" else "127.0.0.1"
        args.listen_port = match[1] if match[1] != "" else None
        args.listen_path = match[2] if match[2] != "" else "/"

    if args.listen_port is not None:
        args.listen_port = int(args.listen_port)

    return args
def setup_tortoise():
    """Construct and return the TorToiSe engine; Low-VRAM mode disables its optimizations."""
    print("Initializating TorToiSe...")
    instance = TextToSpeech(minor_optimizations=not args.low_vram)
    print("TorToiSe initialized, ready for generation.")
    return instance
def setup_gradio():
    """Build and return the Gradio Blocks web UI (Generate / Utilities / Settings tabs)."""
    # Without --share, stub out gradio's phone-home hooks (version check + analytics).
    if not args.share:
        def noop(function, return_value=None):
            # Returns a stand-in that ignores all arguments and yields `return_value`.
            def wrapped(*args, **kwargs):
                return return_value
            return wrapped
        gradio.utils.version_check = noop(gradio.utils.version_check)
        gradio.utils.initiated_analytics = noop(gradio.utils.initiated_analytics)
        gradio.utils.launch_analytics = noop(gradio.utils.launch_analytics)
        gradio.utils.integration_analytics = noop(gradio.utils.integration_analytics)
        gradio.utils.error_analytics = noop(gradio.utils.error_analytics)
        gradio.utils.log_feature_analytics = noop(gradio.utils.log_feature_analytics)
        #gradio.utils.get_local_ip_address = noop(gradio.utils.get_local_ip_address, 'localhost')

    # Prevent huggingface/transformers from hitting the network for model files.
    if args.models_from_local_only:
        os.environ['TRANSFORMERS_OFFLINE'] = '1'

    with gr.Blocks() as webui:
        with gr.Tab("Generate"):
            with gr.Row():
                with gr.Column():
                    # Text + voice selection inputs for generate().
                    text = gr.Textbox(lines=4, label="Prompt")
                    delimiter = gr.Textbox(lines=1, label="Line Delimiter", placeholder="\\n")
                    emotion = gr.Radio(
                        ["Happy", "Sad", "Angry", "Disgusted", "Arrogant", "Custom"],
                        value="Custom",
                        label="Emotion",
                        type="value",
                        interactive=True
                    )
                    prompt = gr.Textbox(lines=1, label="Custom Emotion + Prompt (if selected)")
                    voice = gr.Dropdown(
                        sorted(os.listdir("./tortoise/voices")) + ["microphone"],
                        label="Voice",
                        type="value",
                    )
                    mic_audio = gr.Audio(
                        label="Microphone Source",
                        source="microphone",
                        type="filepath",
                    )
                    refresh_voices = gr.Button(value="Refresh Voice List")
                    refresh_voices.click(update_voices,
                        inputs=None,
                        outputs=voice
                    )
                    # Typing a custom prompt implies the "Custom" emotion.
                    prompt.change(fn=lambda value: gr.update(value="Custom"),
                        inputs=prompt,
                        outputs=emotion
                    )
                    # Recording mic audio implies the "microphone" voice.
                    mic_audio.change(fn=lambda value: gr.update(value="microphone"),
                        inputs=mic_audio,
                        outputs=voice
                    )
                with gr.Column():
                    # Sampling / quality knobs for generate().
                    candidates = gr.Slider(value=1, minimum=1, maximum=6, step=1, label="Candidates")
                    seed = gr.Number(value=0, precision=0, label="Seed")
                    preset = gr.Radio(
                        ["Ultra Fast", "Fast", "Standard", "High Quality"],
                        label="Preset",
                        type="value",
                    )
                    num_autoregressive_samples = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Samples")
                    diffusion_iterations = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Iterations")
                    temperature = gr.Slider(value=0.2, minimum=0, maximum=1, step=0.1, label="Temperature")
                    breathing_room = gr.Slider(value=8, minimum=1, maximum=32, step=1, label="Pause Size")
                    diffusion_sampler = gr.Radio(
                        ["P", "DDIM"], # + ["K_Euler_A", "DPM++2M"],
                        value="P",
                        label="Diffusion Samplers",
                        type="value",
                    )
                    # Choosing a preset overwrites the two sliders above.
                    preset.change(fn=update_presets,
                        inputs=preset,
                        outputs=[
                            num_autoregressive_samples,
                            diffusion_iterations,
                        ],
                    )
                with gr.Column():
                    # Outputs: the source sample, the generated audio, and the seed used.
                    selected_voice = gr.Audio(label="Source Sample")
                    output_audio = gr.Audio(label="Output")
                    usedSeed = gr.Textbox(label="Seed", placeholder="0", interactive=False)
                    submit = gr.Button(value="Generate")
                    #stop = gr.Button(value="Stop")
        with gr.Tab("Utilities"):
            with gr.Row():
                with gr.Column():
                    audio_in = gr.File(type="file", label="Audio Input", file_types=["audio"])
                    copy_button = gr.Button(value="Copy Settings")
                with gr.Column():
                    metadata_out = gr.JSON(label="Audio Metadata")
                    latents_out = gr.File(type="binary", label="Voice Latents")
                # Uploading a generated WAV surfaces its embedded settings + latents.
                audio_in.upload(
                    fn=read_generate_settings,
                    inputs=audio_in,
                    outputs=[
                        metadata_out,
                        latents_out
                    ]
                )
        with gr.Tab("Settings"):
            with gr.Row():
                exec_inputs = []
                with gr.Column():
                    exec_inputs = exec_inputs + [
                        gr.Textbox(label="Listen", value=args.listen, placeholder="127.0.0.1:7860/"),
                        gr.Checkbox(label="Public Share Gradio", value=args.share),
                        gr.Checkbox(label="Check For Updates", value=args.check_for_updates),
                        gr.Checkbox(label="Only Load Models Locally", value=args.models_from_local_only),
                        gr.Checkbox(label="Low VRAM", value=args.low_vram),
                        gr.Checkbox(label="Embed Output Metadata", value=args.embed_output_metadata),
                        gr.Checkbox(label="Slimmer Computed Latents", value=args.latents_lean_and_mean),
                    ]
                    gr.Button(value="Check for Updates").click(check_for_updates)
                with gr.Column():
                    exec_inputs = exec_inputs + [
                        gr.Number(label="Voice Latents Max Chunk Size", precision=0, value=args.cond_latent_max_chunk_size),
                        gr.Number(label="Sample Batch Size", precision=0, value=args.sample_batch_size),
                        gr.Number(label="Concurrency Count", precision=0, value=args.concurrency_count),
                        gr.Number(label="Ouptut Sample Rate", precision=0, value=args.output_sample_rate),
                        gr.Slider(label="Ouptut Volume", minimum=0, maximum=2, value=args.output_volume),
                    ]
                # Any change to any exec setting re-saves the whole set. NOTE(review):
                # export_exec_settings' parameter order must match exec_inputs' order.
                for i in exec_inputs:
                    i.change(
                        fn=export_exec_settings,
                        inputs=exec_inputs
                    )
                with gr.Column():
                    experimentals = gr.CheckboxGroup(["Half Precision", "Conditioning-Free"], value=["Conditioning-Free"], label="Experimental Flags")
                    cvvp_weight = gr.Slider(value=0, minimum=0, maximum=1, label="CVVP Weight")

        gr.Button(value="Reload TTS").click(reload_tts)

        # Order must match both generate()'s signature and import_generate_settings()'s
        # returned tuple.
        input_settings = [
            text,
            delimiter,
            emotion,
            prompt,
            voice,
            mic_audio,
            seed,
            candidates,
            num_autoregressive_samples,
            diffusion_iterations,
            temperature,
            diffusion_sampler,
            breathing_room,
            cvvp_weight,
            experimentals,
        ]
        submit_event = submit.click(generate,
            inputs=input_settings,
            outputs=[selected_voice, output_audio, usedSeed],
        )
        copy_button.click(import_generate_settings,
            inputs=audio_in, # JSON elements cannot be used as inputs
            outputs=input_settings
        )
        # Restore the last run's settings on page load, if any were saved.
        if os.path.isfile('./config/generate.json'):
            webui.load(import_generate_settings, inputs=None, outputs=input_settings)
        if args.check_for_updates:
            webui.load(check_for_updates)
        #stop.click(fn=None, inputs=None, outputs=None, cancels=[submit_event])

    webui.queue(concurrency_count=args.concurrency_count)
    return webui
2023-02-02 21:13:28 +00:00
if __name__ == "__main__":
    args = setup_args()

    if args.listen_path is not None and args.listen_path != "/":
        # Serving under a sub-path: re-launch through uvicorn, which imports this
        # module as "app" (handled by the elif branch below).
        import uvicorn
        # Fixed: `args.listen_port if not None else 8000` always took the left
        # branch (`not None` is always True), making the 8000 fallback dead code.
        uvicorn.run("app:app", host=args.listen_host, port=args.listen_port if args.listen_port is not None else 8000)
    else:
        webui = setup_gradio()
        webui.launch(share=args.share, prevent_thread_lock=True, server_name=args.listen_host, server_port=args.listen_port)

        # Initialize the engine after the UI is up so the page loads immediately.
        tts = setup_tortoise()

        webui.block_thread()
elif __name__ == "app":
    import sys
    from fastapi import FastAPI

    # Strip uvicorn's argv so setup_args() doesn't choke on unknown flags.
    sys.argv = [sys.argv[0]]

    app = FastAPI()
    args = setup_args()
    webui = setup_gradio()
    # Mount the Gradio app under the configured sub-path.
    app = gr.mount_gradio_app(app, webui, path=args.listen_path)

    tts = setup_tortoise()