2023-02-02 21:13:28 +00:00
import os
import argparse
import time
2023-02-06 14:17:41 +00:00
import json
2023-02-06 16:32:09 +00:00
import base64
2023-02-06 21:43:01 +00:00
import re
import urllib . request
import torch
import torchaudio
import music_tag
import gradio as gr
2023-02-07 21:44:16 +00:00
import gradio . utils
2023-02-04 01:50:57 +00:00
2023-02-02 21:13:28 +00:00
from datetime import datetime
2023-02-06 21:43:01 +00:00
2023-02-10 22:12:37 +00:00
import tortoise . api
2023-02-02 21:13:28 +00:00
from tortoise . api import TextToSpeech
2023-02-10 20:11:56 +00:00
from tortoise . utils . audio import load_audio , load_voice , load_voices , get_voice_dir
2023-02-05 06:17:51 +00:00
from tortoise . utils . text import split_and_recombine_text
2023-02-02 21:13:28 +00:00
2023-02-11 15:02:11 +00:00
# Module-level handle for the optional VoiceFixer post-processor.
# Stays None unless setup_tortoise() manages to construct one; generate()
# only runs the restoration pass when this is truthy.
voicefixer = None
2023-02-10 16:47:57 +00:00
def generate(
    text,
    delimiter,
    emotion,
    prompt,
    voice,
    mic_audio,
    seed,
    candidates,
    num_autoregressive_samples,
    diffusion_iterations,
    temperature,
    diffusion_sampler,
    breathing_room,
    cvvp_weight,
    top_p,
    diffusion_temperature,
    length_penalty,
    repetition_penalty,
    cond_free_k,
    experimental_checkboxes,
    progress=gr.Progress(track_tqdm=True)
):
    """Synthesize `text` with the selected voice and write the results to disk.

    The text is split into pieces (by `delimiter` when it occurs in the text,
    otherwise by `split_and_recombine_text`), each piece is passed to
    `tts.tts`, and every clip is saved as a WAV under `./results/<voice>/`.
    When there is more than one piece, a combined WAV is also written per
    candidate. The settings used are saved next to the audio as
    `input_<idx>.json` and as `./config/generate.json`.

    Returns:
        A tuple `(sample_voice, output_voice, results)`:
        - sample_voice: `(tts.input_sample_rate, ndarray)` of the first voice
          sample, or None when no raw voice samples were loaded.
        - output_voice: path of the first output WAV (the first combined file
          when the text was split into several pieces).
        - results: `[[seed, formatted_time]]` rows for the results table.

    Raises:
        gr.Error: if the TTS engine is not initialised yet, or if the
            `microphone` voice is selected without recorded audio.
    """
    global args
    global tts

    # `tts` is created elsewhere at start-up; referencing it before that
    # raises NameError, which is turned into a user-facing error.
    try:
        tts
    except NameError:
        raise gr.Error("TTS is still initializing...")

    if voice == "microphone":
        if mic_audio is None:
            raise gr.Error("Please provide audio from mic when choosing `microphone` as a voice input")
        mic = load_audio(mic_audio, tts.input_sample_rate)
        voice_samples, conditioning_latents = [mic], None
    else:
        progress(0, desc="Loading voice...")
        voice_samples, conditioning_latents = load_voice(voice)

    if voice_samples is not None:
        sample_voice = voice_samples[0].squeeze().cpu()

        conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, max_chunk_size=args.cond_latent_max_chunk_size)
        if len(conditioning_latents) == 4:
            # Drop the fourth element before the latents are saved and used.
            conditioning_latents = (conditioning_latents[0], conditioning_latents[1], conditioning_latents[2], None)

        if voice != "microphone":
            torch.save(conditioning_latents, f'{get_voice_dir()}/{voice}/cond_latents.pth')
        # From here on only the computed latents are passed to the engine.
        voice_samples = None
    else:
        sample_voice = None

    # A seed of 0 from the UI means "no fixed seed".
    if seed == 0:
        seed = None

    if conditioning_latents is not None and len(conditioning_latents) == 2 and cvvp_weight > 0:
        print("Requesting weighing against CVVP weight, but voice latents are missing some extra data. Please regenerate your voice latents.")
        cvvp_weight = 0

    settings = {
        'temperature': float(temperature),
        'top_p': float(top_p),
        'diffusion_temperature': float(diffusion_temperature),
        'length_penalty': float(length_penalty),
        'repetition_penalty': float(repetition_penalty),
        'cond_free_k': float(cond_free_k),

        'num_autoregressive_samples': num_autoregressive_samples,
        'sample_batch_size': args.sample_batch_size,
        'diffusion_iterations': diffusion_iterations,

        'voice_samples': voice_samples,
        'conditioning_latents': conditioning_latents,
        'use_deterministic_seed': seed,
        'return_deterministic_state': True,
        'k': candidates,
        'diffusion_sampler': diffusion_sampler,
        'breathing_room': breathing_room,
        'progress': progress,
        'half_p': "Half Precision" in experimental_checkboxes,
        'cond_free': "Conditioning-Free" in experimental_checkboxes,
        'cvvp_amount': cvvp_weight,
    }

    # The UI passes a newline delimiter as the two characters backslash + n.
    if delimiter == "\\n":
        delimiter = "\n"

    if delimiter != "" and delimiter in text:
        texts = text.split(delimiter)
    else:
        texts = split_and_recombine_text(text)

    full_start_time = time.time()

    outdir = f"./results/{voice}/"
    os.makedirs(outdir, exist_ok=True)

    audio_cache = {}

    # Fix: this was initialised as `resample`, so the later `resampler`
    # check raised NameError whenever the sample rates already matched.
    resampler = None
    # Kept as an explicit if-block rather than a ternary so that a different
    # resampler (e.g. librosa's interpolator) can be swapped in later.
    if tts.output_sample_rate != args.output_sample_rate:
        resampler = torchaudio.transforms.Resample(
            tts.output_sample_rate,
            args.output_sample_rate,
            lowpass_filter_width=16,
            rolloff=0.85,
            resampling_method="kaiser_window",
            beta=8.555504641634386,
        )

    volume_adjust = torchaudio.transforms.Vol(gain=args.output_volume, gain_type="amplitude") if args.output_volume != 1 else None

    # The run index is derived from how many settings JSON files already
    # exist in the output directory.
    idx = sum(1 for file in os.listdir(outdir) if file[-5:] == ".json")
    if idx:
        idx = idx + 1

    def get_name(line=0, candidate=0, combined=False):
        """Build the per-clip file stem: <idx>[_combined|_<line>][_<candidate>]."""
        name = f"{idx}"
        if combined:
            name = f"{name}_combined"
        elif len(texts) > 1:
            name = f"{name}_{line}"
        if candidates > 1:
            name = f"{name}_{candidate}"
        return name

    for line, cut_text in enumerate(texts):
        # Prefix the piece with the emotion prompt understood by the engine.
        if emotion == "Custom":
            if prompt.strip() != "":
                cut_text = f"[{prompt},] {cut_text}"
        else:
            cut_text = f"[I am really {emotion.lower()},] {cut_text}"

        progress.msg_prefix = f'[{str(line+1)}/{str(len(texts))}]'
        print(f"{progress.msg_prefix} Generating line: {cut_text}")

        start_time = time.time()
        gen, additionals = tts.tts(cut_text, **settings)
        # The engine reports the seed it actually used as the first extra.
        seed = additionals[0]
        run_time = time.time() - start_time
        print(f"Generating line took {run_time} seconds")

        if isinstance(gen, list):
            for j, g in enumerate(gen):
                name = get_name(line=line, candidate=j)
                audio_cache[name] = {
                    'audio': g,
                    'text': cut_text,
                    'time': run_time
                }
        else:
            name = get_name(line=line)
            audio_cache[name] = {
                'audio': gen,
                'text': cut_text,
                'time': run_time,
            }

    # Post-process every clip once and write it out.
    for k in audio_cache:
        audio = audio_cache[k]['audio'].squeeze(0).cpu()
        if resampler is not None:
            audio = resampler(audio)
        if volume_adjust is not None:
            audio = volume_adjust(audio)

        audio_cache[k]['audio'] = audio
        torchaudio.save(f'{outdir}/{voice}_{k}.wav', audio, args.output_sample_rate)

    output_voice = None
    output_voices = []
    if len(texts) > 1:
        # Stitch the per-line clips of each candidate into one combined file.
        for candidate in range(candidates):
            audio_clips = []
            for line in range(len(texts)):
                name = get_name(line=line, candidate=candidate)
                audio = audio_cache[name]['audio']
                audio_clips.append(audio)

            name = get_name(candidate=candidate, combined=True)
            audio = torch.cat(audio_clips, dim=-1)
            torchaudio.save(f'{outdir}/{voice}_{name}.wav', audio, args.output_sample_rate)

            audio = audio.squeeze(0).cpu()
            audio_cache[name] = {
                'audio': audio,
                'text': text,
                'time': time.time() - full_start_time
            }

            output_voices.append(f'{outdir}/{voice}_{name}.wav')
            if output_voice is None:
                output_voice = f'{outdir}/{voice}_{name}.wav'
    else:
        for candidate in range(candidates):
            name = get_name(candidate=candidate)
            output_voices.append(f'{outdir}/{voice}_{name}.wav')

    info = {
        'text': text,
        'delimiter': '\\n' if delimiter == "\n" else delimiter,
        'emotion': emotion,
        'prompt': prompt,
        'voice': voice,
        'mic_audio': mic_audio,
        'seed': seed,
        'candidates': candidates,
        'num_autoregressive_samples': num_autoregressive_samples,
        'diffusion_iterations': diffusion_iterations,
        'temperature': temperature,
        'diffusion_sampler': diffusion_sampler,
        'breathing_room': breathing_room,
        'cvvp_weight': cvvp_weight,
        'top_p': top_p,
        'diffusion_temperature': diffusion_temperature,
        'length_penalty': length_penalty,
        'repetition_penalty': repetition_penalty,
        'cond_free_k': cond_free_k,
        'experimentals': experimental_checkboxes,
        'time': time.time() - full_start_time,
    }

    with open(f'{outdir}/input_{idx}.json', 'w', encoding="utf-8") as f:
        f.write(json.dumps(info, indent='\t'))

    # Attach the saved latents (base64) so they travel with embedded metadata.
    # Fix: the file is only written for non-microphone voices above, so guard
    # against it being absent instead of crashing on open().
    latents_path = f'{get_voice_dir()}/{voice}/cond_latents.pth'
    if voice is not None and conditioning_latents is not None and os.path.isfile(latents_path):
        with open(latents_path, 'rb') as f:
            info['latents'] = base64.b64encode(f.read()).decode("ascii")

    if voicefixer:
        # This could run on the pieces before they are stitched to save some
        # compute, but stitching would then have to read back from disk,
        # defeating the point of caching the waveforms.
        for path in progress.tqdm(audio_cache, desc="Running voicefix..."):
            print("VoiceFix starting")
            # Fix: previously used the stale loop variable `k`, so only the
            # last post-processed clip was restored (once per cache entry).
            voicefixer.restore(
                input=f'{outdir}/{voice}_{path}.wav',
                output=f'{outdir}/{voice}_{path}.wav',
                #cuda=False,
                #mode=mode,
            )
            print("VoiceFix finished")

    if args.embed_output_metadata:
        for path in progress.tqdm(audio_cache, desc="Embedding metadata..."):
            # NOTE(review): this overwrites the shared `info` dict, so the
            # 'text'/'time' printed and saved below are those of the last
            # cache entry — confirm that is intended.
            info['text'] = audio_cache[path]['text']
            info['time'] = audio_cache[path]['time']

            metadata = music_tag.load_file(f"{outdir}/{voice}_{path}.wav")
            metadata['lyrics'] = json.dumps(info)
            metadata.save()

    if sample_voice is not None:
        sample_voice = (tts.input_sample_rate, sample_voice.numpy())

    if output_voice is None and len(output_voices):
        output_voice = output_voices[0]

    print(f"Generation took {info['time']} seconds, saved to '{output_voice}'\n")

    # Persist the seed that was requested (not the one the engine picked).
    info['seed'] = settings['use_deterministic_seed']
    # Fix: 'latents' is absent when no latents file was attached above.
    info.pop('latents', None)
    with open('./config/generate.json', 'w', encoding="utf-8") as f:
        f.write(json.dumps(info, indent='\t'))

    results = [
        [seed, "{:.3f}".format(info['time'])]
    ]

    return (
        sample_voice,
        output_voice,
        results,
    )
2023-02-05 03:53:46 +00:00
def update_presets(value):
    """Translate a quality preset name into updates for the two sliders.

    Returns a pair of `gr.update` objects for (samples, iterations). An
    unrecognised preset yields two empty updates, leaving both untouched.
    """
    presets = {
        'Ultra Fast': {'num_autoregressive_samples': 16, 'diffusion_iterations': 30, 'cond_free': False},
        'Fast': {'num_autoregressive_samples': 96, 'diffusion_iterations': 80},
        'Standard': {'num_autoregressive_samples': 256, 'diffusion_iterations': 200},
        'High Quality': {'num_autoregressive_samples': 256, 'diffusion_iterations': 400},
    }

    chosen = presets.get(value)
    if chosen is None:
        return (gr.update(), gr.update())

    samples = gr.update(value=chosen['num_autoregressive_samples'])
    iterations = gr.update(value=chosen['diffusion_iterations'])
    return (samples, iterations)
2023-02-02 21:13:28 +00:00
2023-02-10 19:55:33 +00:00
def read_generate_settings(file, save_latents=True, save_as_temp=True):
    """Load generation settings from a WAV's embedded metadata or a JSON file.

    Args:
        file: a path string, or an object exposing the path as `.name`.
            Files ending in `.wav` are read through `music_tag` (settings
            live in the `lyrics` tag); files ending in `.json` are parsed
            directly. Anything else yields no settings.
        save_latents: when true and the settings carry base64 `latents`,
            decode them, remove them from the settings and write them to a
            `cond_latents.pth` file.
        save_as_temp: write the latents under the `.temp` voice directory
            instead of the directory named by the settings' `voice` entry.

    Returns:
        `(settings, latents_path)`. `settings` is the parsed dict with `time`
        formatted to three decimals, or None when nothing could be read.
        `latents_path` is the path of the written latents file, or None.
    """
    j = None
    latents = None

    if file is not None:
        if hasattr(file, 'name'):
            file = file.name

        if file[-4:] == ".wav":
            metadata = music_tag.load_file(file)
            if 'lyrics' in metadata:
                j = json.loads(str(metadata['lyrics']))
        elif file[-5:] == ".json":
            # The settings files are written as utf-8, so read them back the same way.
            with open(file, 'r', encoding="utf-8") as f:
                j = json.load(f)

    # Fix: without this guard the membership tests below raised TypeError
    # for a missing file, an unsupported extension, or a WAV without
    # metadata; callers already check for a None result.
    if j is None:
        return (None, None)

    if 'latents' in j and save_latents:
        latents = base64.b64decode(j['latents'])
        del j['latents']

    if latents and save_latents:
        # NOTE(review): j["voice"] comes from the loaded file and is used to
        # build a directory path; validate it if files can be untrusted.
        outdir = f'{get_voice_dir()}/{".temp" if save_as_temp else j["voice"]}/'
        os.makedirs(outdir, exist_ok=True)
        with open(f'{outdir}/cond_latents.pth', 'wb') as f:
            f.write(latents)
        latents = f'{outdir}/cond_latents.pth'

    if "time" in j:
        j["time"] = "{:.3f}".format(j["time"])

    return (
        j,
        latents
    )
2023-02-10 19:55:33 +00:00
def save_latents(file):
    """Import the voice latents embedded in `file` into that voice's own directory.

    Delegates to `read_generate_settings` with temp storage disabled; the
    parsed settings it returns are discarded.
    """
    keep_in_temp_dir = False
    read_generate_settings(file, save_as_temp=keep_in_temp_dir, save_latents=True)
2023-02-06 16:00:44 +00:00
2023-02-06 21:43:01 +00:00
def import_generate_settings(file="./config/generate.json"):
    """Read saved generation settings and return them as an ordered tuple.

    The tuple follows the fixed field order listed below; a field missing
    from the file is replaced by the fallback paired with it. Returns None
    when no settings could be read.
    """
    loaded, _ = read_generate_settings(file, save_latents=False)
    if loaded is None:
        return None

    # (settings key, value used when the key is absent), in output order.
    fields = (
        ('text', None),
        ('delimiter', None),
        ('emotion', None),
        ('prompt', None),
        ('voice', None),
        ('mic_audio', None),
        ('seed', None),
        ('candidates', None),
        ('num_autoregressive_samples', None),
        ('diffusion_iterations', None),
        ('temperature', 0.8),
        ('diffusion_sampler', "DDIM"),
        ('breathing_room', 8),
        ('cvvp_weight', 0.0),
        ('top_p', 0.8),
        ('diffusion_temperature', 1.0),
        ('length_penalty', 1.0),
        ('repetition_penalty', 2.0),
        ('cond_free_k', 2.0),
        ('experimentals', None),
    )
    return tuple(
        loaded[key] if key in loaded else fallback
        for key, fallback in fields
    )
2023-02-06 21:43:01 +00:00
def curl(url):
    """Fetch `url` and return its body parsed as JSON.

    Any failure (bad URL, network error, undecodable or non-JSON body) is
    printed and reported as None rather than raised.
    """
    try:
        req = urllib.request.Request(url, headers={'User-Agent': 'Python'})
        # The context manager closes the connection even when reading,
        # decoding or parsing fails; previously it leaked on those paths.
        with urllib.request.urlopen(req) as conn:
            data = conn.read()
        return json.loads(data.decode())
    except Exception as e:
        print(e)
        return None
def check_for_updates():
    """Report whether the remote repository has a commit other than the local one.

    The local commit and the remote location are parsed from
    `./.git/FETCH_HEAD`; the remote commit is the first entry returned by
    the host's branches API. Returns True only when the two ids differ, and
    False when they match or when any step cannot be completed.
    """
    fetch_head_path = './.git/FETCH_HEAD'
    if not os.path.isfile(fetch_head_path):
        print("Cannot check for updates: not from a git repo")
        return False

    with open(fetch_head_path, 'r', encoding="utf-8") as fh:
        contents = fh.read()

    found = re.search(r"^([a-f0-9]+).+?https:\/\/(.+?)\/(.+?)\/(.+?)\n", contents)
    if found is None:
        print("Cannot check for updates: cannot parse FETCH_HEAD")
        return False
    local, host, owner, repo = found.groups()

    # This endpoint only exists on Gitea instances.
    branches = curl(f"https://{host}/api/v1/repos/{owner}/{repo}/branches/")
    if branches is None or len(branches) == 0:
        print("Cannot check for updates: cannot fetch from remote")
        return False

    remote = branches[0]["commit"]["id"]
    if remote == local:
        return False

    print(f"New version found: {local[:8]} => {remote[:8]}")
    return True
2023-02-10 03:02:09 +00:00
def reload_tts():
    """Re-create the TTS engine and install it as the module-level `tts`."""
    # Fix: without the global declaration the new engine was bound to a
    # local name and thrown away, so generate() kept using the old one.
    global tts
    tts = setup_tortoise()
2023-02-10 22:12:37 +00:00
def cancel_generate():
    """Request cancellation by raising the `STOP_SIGNAL` flag on `tortoise.api`."""
    # NOTE(review): presumably the engine polls this flag during generation — confirm in tortoise.api.
    tortoise.api.STOP_SIGNAL = True
2023-02-05 17:59:13 +00:00
def update_voices():
    """Build a dropdown update listing the voice directory entries plus `microphone`."""
    available = sorted(os.listdir(get_voice_dir()))
    available = available + ["microphone"]
    return gr.Dropdown.update(choices=available)
2023-02-06 21:43:01 +00:00
2023-02-11 15:02:11 +00:00
def export_exec_settings(share, listen, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, voice_fixer, cond_latent_max_chunk_size, sample_batch_size, concurrency_count, output_sample_rate, output_volume):
    """Apply the given execution settings to `args` and save them to `./config/exec.json`.

    Each value is stored on the module-level `args` under its underscore
    name, then all of them are written out as JSON under their dashed names.
    """
    # Dashed config key -> new value, in the order the keys are written out.
    incoming = (
        ('share', share),
        ('listen', listen),
        ('low-vram', low_vram),
        ('check-for-updates', check_for_updates),
        ('models-from-local-only', models_from_local_only),
        ('cond-latent-max-chunk-size', cond_latent_max_chunk_size),
        ('sample-batch-size', sample_batch_size),
        ('embed-output-metadata', embed_output_metadata),
        ('latents-lean-and-mean', latents_lean_and_mean),
        ('voice-fixer', voice_fixer),
        ('concurrency-count', concurrency_count),
        ('output-sample-rate', output_sample_rate),
        ('output-volume', output_volume),
    )

    for option, value in incoming:
        setattr(args, option.replace('-', '_'), value)

    exported = {
        option: getattr(args, option.replace('-', '_'))
        for option, _ in incoming
    }

    with open('./config/exec.json', 'w', encoding="utf-8") as out:
        out.write(json.dumps(exported, indent='\t'))
2023-02-09 20:42:38 +00:00
def setup_args():
    """Parse command-line options, using `./config/exec.json` as saved defaults.

    Built-in defaults are overridden by the config file when it exists, and
    the result is used as the argparse defaults so explicit command-line
    options take precedence.

    Returns:
        The parsed namespace, extended with `embed_output_metadata` and with
        `listen_host`, `listen_port` (int or None) and `listen_path`, which
        are split out of `--listen` (all None when `--listen` is not set).
    """
    default_arguments = {
        'share': False,
        'listen': None,
        'check-for-updates': False,
        'models-from-local-only': False,
        'low-vram': False,
        'sample-batch-size': None,
        'embed-output-metadata': True,
        'latents-lean-and-mean': True,
        'voice-fixer': True,
        'cond-latent-max-chunk-size': 1000000,
        'concurrency-count': 2,
        'output-sample-rate': 44100,
        'output-volume': 1,
    }

    if os.path.isfile('./config/exec.json'):
        with open('./config/exec.json', 'r', encoding="utf-8") as f:
            default_arguments.update(json.load(f))

    parser = argparse.ArgumentParser()
    parser.add_argument("--share", action='store_true', default=default_arguments['share'], help="Lets Gradio return a public URL to use anywhere")
    parser.add_argument("--listen", default=default_arguments['listen'], help="Path for Gradio to listen on")
    parser.add_argument("--check-for-updates", action='store_true', default=default_arguments['check-for-updates'], help="Checks for update on startup")
    parser.add_argument("--models-from-local-only", action='store_true', default=default_arguments['models-from-local-only'], help="Only loads models from disk, does not check for updates for models")
    parser.add_argument("--low-vram", action='store_true', default=default_arguments['low-vram'], help="Disables some optimizations that increases VRAM usage")
    # Fix: this was `store_false`, so passing the flag stored False and the
    # negation below turned embedding back on; the flag could never disable it.
    parser.add_argument("--no-embed-output-metadata", action='store_true', default=not default_arguments['embed-output-metadata'], help="Disables embedding output metadata into resulting WAV files for easily fetching its settings used with the web UI (data is stored in the lyrics metadata tag)")
    parser.add_argument("--latents-lean-and-mean", action='store_true', default=default_arguments['latents-lean-and-mean'], help="Exports the bare essentials for latents.")
    # NOTE(review): store_true with a True default means these two cannot be
    # switched off from the command line, only via the config file.
    parser.add_argument("--voice-fixer", action='store_true', default=default_arguments['voice-fixer'], help="Uses python module 'voicefixer' to improve audio quality, if available.")
    parser.add_argument("--cond-latent-max-chunk-size", default=default_arguments['cond-latent-max-chunk-size'], type=int, help="Sets an upper limit to audio chunk size when computing conditioning latents")
    # Fix: the help text was a copy of the chunk-size option's description.
    parser.add_argument("--sample-batch-size", default=default_arguments['sample-batch-size'], type=int, help="Sets the batch size used for the autoregressive samples pass")
    parser.add_argument("--concurrency-count", type=int, default=default_arguments['concurrency-count'], help="How many Gradio events to process at once")
    parser.add_argument("--output-sample-rate", type=int, default=default_arguments['output-sample-rate'], help="Sample rate to resample the output to (from 24KHz)")
    parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output")
    args = parser.parse_args()

    args.embed_output_metadata = not args.no_embed_output_metadata

    args.listen_host = None
    args.listen_port = None
    args.listen_path = None
    if args.listen:
        # Accepts "host:port", "/path" or "host:port/path". Fix: the path
        # part now also matches a bare "/", so "127.0.0.1:7860/" parses.
        match = re.search(r"^(?:(.+?):(\d+))?(\/.*)?$", args.listen)
        if match is None:
            # Previously an unparsable value crashed with IndexError.
            parser.error(f"cannot parse --listen value '{args.listen}', expected [host:port][/path]")
        host, port, path = match.groups(default="")

        args.listen_host = host if host != "" else "127.0.0.1"
        args.listen_port = int(port) if port != "" else None
        args.listen_path = path if path != "" else "/"

    return args
def setup_tortoise():
    """Create the TextToSpeech engine and, when enabled, the VoiceFixer helper.

    When `args.voice_fixer` is set, tries to import and construct
    `VoiceFixer` and stores it in the module-level `voicefixer`. A failure
    there is reported and otherwise ignored, so the engine still starts.

    Returns:
        The new `TextToSpeech` instance; the caller is responsible for
        storing it.
    """
    global args
    global voicefixer

    if args.voice_fixer:
        try:
            # Imported lazily so the package remains an optional dependency.
            from voicefixer import VoiceFixer
            print("Initializating voice-fixer")
            voicefixer = VoiceFixer()
            print("initialized voice-fixer")
        except Exception as e:
            # Still best-effort, but say why instead of failing silently.
            print(f"Error occurred while trying to initialize voice-fixer: {e}")

    print("Initializating TorToiSe...")
    tts = TextToSpeech(minor_optimizations=not args.low_vram)
    print("TorToiSe initialized, ready for generation.")
    return tts
def setup_gradio ( ) :
2023-02-11 12:39:16 +00:00
global args
2023-02-09 20:42:38 +00:00
if not args . share :
def noop ( function , return_value = None ) :
def wrapped ( * args , * * kwargs ) :
return return_value
return wrapped
gradio . utils . version_check = noop ( gradio . utils . version_check )
gradio . utils . initiated_analytics = noop ( gradio . utils . initiated_analytics )
gradio . utils . launch_analytics = noop ( gradio . utils . launch_analytics )
gradio . utils . integration_analytics = noop ( gradio . utils . integration_analytics )
gradio . utils . error_analytics = noop ( gradio . utils . error_analytics )
gradio . utils . log_feature_analytics = noop ( gradio . utils . log_feature_analytics )
#gradio.utils.get_local_ip_address = noop(gradio.utils.get_local_ip_address, 'localhost')
2023-02-05 17:59:13 +00:00
2023-02-09 22:06:55 +00:00
if args . models_from_local_only :
os . environ [ ' TRANSFORMERS_OFFLINE ' ] = ' 1 '
2023-02-06 16:00:44 +00:00
with gr . Blocks ( ) as webui :
with gr . Tab ( " Generate " ) :
with gr . Row ( ) :
with gr . Column ( ) :
text = gr . Textbox ( lines = 4 , label = " Prompt " )
delimiter = gr . Textbox ( lines = 1 , label = " Line Delimiter " , placeholder = " \\ n " )
emotion = gr . Radio (
2023-02-06 21:43:01 +00:00
[ " Happy " , " Sad " , " Angry " , " Disgusted " , " Arrogant " , " Custom " ] ,
value = " Custom " ,
2023-02-06 16:00:44 +00:00
label = " Emotion " ,
type = " value " ,
interactive = True
)
prompt = gr . Textbox ( lines = 1 , label = " Custom Emotion + Prompt (if selected) " )
voice = gr . Dropdown (
2023-02-10 20:11:56 +00:00
sorted ( os . listdir ( get_voice_dir ( ) ) ) + [ " microphone " ] ,
2023-02-06 16:00:44 +00:00
label = " Voice " ,
type = " value " ,
)
mic_audio = gr . Audio (
label = " Microphone Source " ,
source = " microphone " ,
type = " filepath " ,
)
refresh_voices = gr . Button ( value = " Refresh Voice List " )
refresh_voices . click ( update_voices ,
inputs = None ,
outputs = voice
)
prompt . change ( fn = lambda value : gr . update ( value = " Custom " ) ,
inputs = prompt ,
outputs = emotion
)
mic_audio . change ( fn = lambda value : gr . update ( value = " microphone " ) ,
inputs = mic_audio ,
outputs = voice
)
with gr . Column ( ) :
candidates = gr . Slider ( value = 1 , minimum = 1 , maximum = 6 , step = 1 , label = " Candidates " )
seed = gr . Number ( value = 0 , precision = 0 , label = " Seed " )
preset = gr . Radio (
2023-02-06 21:43:01 +00:00
[ " Ultra Fast " , " Fast " , " Standard " , " High Quality " ] ,
2023-02-06 16:00:44 +00:00
label = " Preset " ,
type = " value " ,
)
num_autoregressive_samples = gr . Slider ( value = 128 , minimum = 0 , maximum = 512 , step = 1 , label = " Samples " )
diffusion_iterations = gr . Slider ( value = 128 , minimum = 0 , maximum = 512 , step = 1 , label = " Iterations " )
temperature = gr . Slider ( value = 0.2 , minimum = 0 , maximum = 1 , step = 0.1 , label = " Temperature " )
2023-02-07 00:26:22 +00:00
breathing_room = gr . Slider ( value = 8 , minimum = 1 , maximum = 32 , step = 1 , label = " Pause Size " )
2023-02-06 16:00:44 +00:00
diffusion_sampler = gr . Radio (
[ " P " , " DDIM " ] , # + ["K_Euler_A", "DPM++2M"],
value = " P " ,
label = " Diffusion Samplers " ,
type = " value " ,
)
preset . change ( fn = update_presets ,
inputs = preset ,
outputs = [
num_autoregressive_samples ,
diffusion_iterations ,
] ,
)
with gr . Column ( ) :
selected_voice = gr . Audio ( label = " Source Sample " )
output_audio = gr . Audio ( label = " Output " )
2023-02-11 01:45:25 +00:00
generation_results = gr . Dataframe ( label = " Results " , headers = [ " Seed " , " Time " ] )
2023-02-06 16:00:44 +00:00
submit = gr . Button ( value = " Generate " )
2023-02-10 22:12:37 +00:00
stop = gr . Button ( value = " Stop " )
2023-02-11 01:45:25 +00:00
with gr . Tab ( " History " ) :
with gr . Row ( ) :
with gr . Column ( ) :
headers = {
" Name " : " " ,
" Samples " : " num_autoregressive_samples " ,
" Iterations " : " diffusion_iterations " ,
" Temp. " : " temperature " ,
" Sampler " : " diffusion_sampler " ,
" CVVP " : " cvvp_weight " ,
" Top P " : " top_p " ,
" Diff. Temp. " : " diffusion_temperature " ,
" Len Pen " : " length_penalty " ,
" Rep Pen " : " repetition_penalty " ,
" Cond-Free K " : " cond_free_k " ,
" Time " : " time " ,
}
history_info = gr . Dataframe ( label = " Results " , headers = list ( headers . keys ( ) ) )
with gr . Row ( ) :
with gr . Column ( ) :
history_voices = gr . Dropdown (
sorted ( os . listdir ( get_voice_dir ( ) ) ) + [ " microphone " ] ,
label = " Voice " ,
type = " value " ,
)
history_view_results_button = gr . Button ( value = " View Files " )
with gr . Column ( ) :
history_results_list = gr . Dropdown ( label = " Results " , type = " value " , interactive = True )
history_view_result_button = gr . Button ( value = " View File " )
with gr . Column ( ) :
history_audio = gr . Audio ( )
history_copy_settings_button = gr . Button ( value = " Copy Settings " )
def history_view_results ( voice ) :
results = [ ]
files = [ ]
outdir = f " ./results/ { voice } / "
for i , file in enumerate ( os . listdir ( outdir ) ) :
if file [ - 4 : ] != " .wav " :
continue
metadata , _ = read_generate_settings ( f " { outdir } / { file } " , save_latents = False )
if metadata is None :
continue
values = [ ]
for k in headers :
v = file
if k != " Name " :
v = metadata [ headers [ k ] ]
values . append ( v )
files . append ( file )
results . append ( values )
return (
results ,
gr . Dropdown . update ( choices = sorted ( files ) )
)
history_view_results_button . click (
fn = history_view_results ,
inputs = history_voices ,
outputs = [
history_info ,
history_results_list ,
]
)
history_view_result_button . click (
fn = lambda voice , file : f " ./results/ { voice } / { file } " ,
inputs = [
history_voices ,
history_results_list ,
] ,
outputs = history_audio
)
2023-02-06 16:00:44 +00:00
with gr . Tab ( " Utilities " ) :
with gr . Row ( ) :
with gr . Column ( ) :
audio_in = gr . File ( type = " file " , label = " Audio Input " , file_types = [ " audio " ] )
copy_button = gr . Button ( value = " Copy Settings " )
2023-02-10 19:55:33 +00:00
import_voice = gr . Button ( value = " Import Voice " )
2023-02-06 16:00:44 +00:00
with gr . Column ( ) :
metadata_out = gr . JSON ( label = " Audio Metadata " )
2023-02-06 16:32:09 +00:00
latents_out = gr . File ( type = " binary " , label = " Voice Latents " )
2023-02-06 16:00:44 +00:00
audio_in . upload (
2023-02-06 21:43:01 +00:00
fn = read_generate_settings ,
2023-02-06 16:00:44 +00:00
inputs = audio_in ,
2023-02-06 16:32:09 +00:00
outputs = [
metadata_out ,
latents_out
]
2023-02-06 16:00:44 +00:00
)
2023-02-10 19:55:33 +00:00
import_voice . click (
fn = save_latents ,
inputs = audio_in ,
)
2023-02-06 21:43:01 +00:00
with gr . Tab ( " Settings " ) :
with gr . Row ( ) :
2023-02-10 03:02:09 +00:00
exec_inputs = [ ]
with gr . Column ( ) :
exec_inputs = exec_inputs + [
gr . Textbox ( label = " Listen " , value = args . listen , placeholder = " 127.0.0.1:7860/ " ) ,
gr . Checkbox ( label = " Public Share Gradio " , value = args . share ) ,
gr . Checkbox ( label = " Check For Updates " , value = args . check_for_updates ) ,
gr . Checkbox ( label = " Only Load Models Locally " , value = args . models_from_local_only ) ,
gr . Checkbox ( label = " Low VRAM " , value = args . low_vram ) ,
gr . Checkbox ( label = " Embed Output Metadata " , value = args . embed_output_metadata ) ,
gr . Checkbox ( label = " Slimmer Computed Latents " , value = args . latents_lean_and_mean ) ,
2023-02-11 15:02:11 +00:00
gr . Checkbox ( label = " Voice Fixer " , value = args . voice_fixer ) ,
2023-02-10 03:02:09 +00:00
]
gr . Button ( value = " Check for Updates " ) . click ( check_for_updates )
2023-02-10 16:47:57 +00:00
gr . Button ( value = " Reload TTS " ) . click ( reload_tts )
2023-02-10 03:02:09 +00:00
with gr . Column ( ) :
exec_inputs = exec_inputs + [
gr . Number ( label = " Voice Latents Max Chunk Size " , precision = 0 , value = args . cond_latent_max_chunk_size ) ,
gr . Number ( label = " Sample Batch Size " , precision = 0 , value = args . sample_batch_size ) ,
gr . Number ( label = " Concurrency Count " , precision = 0 , value = args . concurrency_count ) ,
gr . Number ( label = " Ouptut Sample Rate " , precision = 0 , value = args . output_sample_rate ) ,
gr . Slider ( label = " Ouptut Volume " , minimum = 0 , maximum = 2 , value = args . output_volume ) ,
]
for i in exec_inputs :
i . change (
fn = export_exec_settings ,
inputs = exec_inputs
)
2023-02-06 21:43:01 +00:00
with gr . Column ( ) :
2023-02-10 16:47:57 +00:00
experimental_checkboxes = gr . CheckboxGroup ( [ " Half Precision " , " Conditioning-Free " ] , value = [ " Conditioning-Free " ] , label = " Experimental Flags " )
2023-02-07 20:55:56 +00:00
cvvp_weight = gr . Slider ( value = 0 , minimum = 0 , maximum = 1 , label = " CVVP Weight " )
2023-02-10 19:55:33 +00:00
top_p = gr . Slider ( value = 0.8 , minimum = 0 , maximum = 1 , label = " Top P " )
diffusion_temperature = gr . Slider ( value = 1.0 , minimum = 0 , maximum = 1 , label = " Diffusion Temperature " )
2023-02-10 16:47:57 +00:00
length_penalty = gr . Slider ( value = 1.0 , minimum = 0 , maximum = 8 , label = " Length Penalty " )
repetition_penalty = gr . Slider ( value = 2.0 , minimum = 0 , maximum = 8 , label = " Repetition Penalty " )
2023-02-10 19:55:33 +00:00
cond_free_k = gr . Slider ( value = 2.0 , minimum = 0 , maximum = 4 , label = " Conditioning-Free K " )
2023-02-06 21:43:01 +00:00
input_settings = [
text ,
delimiter ,
emotion ,
prompt ,
voice ,
mic_audio ,
seed ,
candidates ,
num_autoregressive_samples ,
diffusion_iterations ,
temperature ,
diffusion_sampler ,
breathing_room ,
2023-02-07 20:55:56 +00:00
cvvp_weight ,
2023-02-10 16:47:57 +00:00
top_p ,
diffusion_temperature ,
length_penalty ,
repetition_penalty ,
cond_free_k ,
experimental_checkboxes ,
2023-02-06 21:43:01 +00:00
]
submit_event = submit . click ( generate ,
inputs = input_settings ,
2023-02-11 01:45:25 +00:00
outputs = [ selected_voice , output_audio , generation_results ] ,
2023-02-06 21:43:01 +00:00
)
copy_button . click ( import_generate_settings ,
2023-02-11 01:45:25 +00:00
inputs = audio_in , # JSON elements cannot be used as inputs
outputs = input_settings
)
def history_copy_settings(voice, file):
    """Fetch the settings for a stored history result.

    Builds the path ``./results/<voice>/<file>`` and returns whatever
    ``import_generate_settings`` produces for it.
    """
    return import_generate_settings(f"./results/{voice}/{file}")
history_copy_settings_button . click ( history_copy_settings ,
inputs = [
history_voices ,
history_results_list ,
] ,
2023-02-06 21:43:01 +00:00
outputs = input_settings
)
if os . path . isfile ( ' ./config/generate.json ' ) :
webui . load ( import_generate_settings , inputs = None , outputs = input_settings )
if args . check_for_updates :
webui . load ( check_for_updates )
2023-02-10 22:12:37 +00:00
stop . click ( fn = cancel_generate , inputs = None , outputs = None , cancels = [ submit_event ] )
2023-02-06 21:43:01 +00:00
2023-02-02 21:13:28 +00:00
2023-02-09 20:42:38 +00:00
webui . queue ( concurrency_count = args . concurrency_count )
2023-02-10 15:58:56 +00:00
return webui