forked from camenduru/ai-voice-cloning

commit 54036fd780 (parent 92a05d3c4c): ":)"

models/.template.valle.yaml (new executable file, 9 lines added)
@@ -0,0 +1,9 @@
+data_dirs: [./training/${voice}/valle/]
+spkr_name_getter: "lambda p: p.parts[-3]"
+
+model: ${model_name}
+batch_size: ${batch_size}
+eval_batch_size: ${validation_batch_size}
+eval_every: ${validation_rate}
+
+sampling_temperature: 1.0
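The data_dirs entry and the spkr_name_getter lambda assume the layout that the updated prepare_dataset() below writes into, i.e. ./training/<voice>/valle/<clip>.qnt.pt, so the speaker name is the third-from-last path component. A minimal sketch of that lookup (the clip path and voice name are hypothetical):

from pathlib import Path

# The getter exactly as written in the template above.
spkr_name_getter = lambda p: p.parts[-3]

# Hypothetical quantized clip produced by prepare_dataset() below.
clip = Path("./training/example_voice/valle/0001.qnt.pt")
print(spkr_name_getter(clip))  # -> "example_voice"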
src/utils.py (163 lines changed)

@@ -20,6 +20,8 @@ import subprocess
 import psutil
 import yaml
 import hashlib
+import io
+import gzip

 import tqdm
 import torch
@@ -45,6 +47,7 @@ WHISPER_MODELS = ["tiny", "base", "small", "medium", "large"]
 WHISPER_SPECIALIZED_MODELS = ["tiny.en", "base.en", "small.en", "medium.en"]
 WHISPER_BACKENDS = ["openai/whisper", "lightmare/whispercpp"]
 VOCODERS = ['univnet', 'bigvgan_base_24khz_100band', 'bigvgan_24khz_100band']
+TTSES = ['tortoise'] # + ['vall-e']

 GENERATE_SETTINGS_ARGS = None
@@ -56,6 +59,16 @@ RESAMPLERS = {}
 MIN_TRAINING_DURATION = 0.6
 MAX_TRAINING_DURATION = 11.6097505669

+VALLE_ENABLED = False
+
+try:
+    from vall_e.emb.qnt import encode as quantize
+    from vall_e.emb.g2p import encode as phonemize
+
+    VALLE_ENABLED = True
+except Exception as e:
+    pass
+
 args = None
 tts = None
 tts_loading = False
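Because the import failure is swallowed by the bare pass, a missing or broken vall_e install simply leaves VALLE_ENABLED at False and the VALL-E-specific steps later in this commit are skipped without any message. A quick way to confirm whether the optional backend was picked up (the import path is an assumption about how src/utils.py is loaded):

# Run from the repository root; importing the module as src.utils is an assumption.
from src.utils import VALLE_ENABLED
print("VALL-E support available:", VALLE_ENABLED)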
@@ -1195,7 +1208,7 @@ def slice_dataset( voice, trim_silence=True, start_offset=0, end_offset=0, resul
     messages.append(f"Sliced segments: {files} => {segments}.")
     return "\n".join(messages)

-def prepare_dataset( voice, use_segments, text_length, audio_length, normalize=False ):
+def prepare_dataset( voice, use_segments, text_length, audio_length, normalize=True ):
     indir = f'./training/{voice}/'
     infile = f'{indir}/whisper.json'
     messages = []
@@ -1273,6 +1286,8 @@ def prepare_dataset( voice, use_segments, text_length, audio_length, normalize=F
             continue

         waveform, sample_rate = torchaudio.load(path)
+        num_channels, num_frames = waveform.shape
+        duration = num_frames / sample_rate

         error = validate_waveform( waveform, sample_rate )
         if error:
@@ -1281,21 +1296,43 @@ def prepare_dataset( voice, use_segments, text_length, audio_length, normalize=F
             messages.append(message)
             errored += 1
             continue

         culled = len(text) < text_length
         if not culled and audio_length > 0:
-            num_channels, num_frames = waveform.shape
-            duration = num_frames / sample_rate
             culled = duration < audio_length

         # for when i add in a little treat ;), as it requires normalized text
-        if normalize and length(normalized_text) < 200:
+        if normalize and len(normalized_text) < 200:
             line = f'audio/{file}|{text}|{normalized_text}'
         else:
             line = f'audio/{file}|{text}'

         lines['training' if not culled else 'validation'].append(line)

+        if culled or not VALLE_ENABLED:
+            continue
+
+        # VALL-E dataset
+        os.makedirs(f'{indir}/valle/', exist_ok=True)
+
+        try:
+            from vall_e.emb.qnt import encode as quantize
+            from vall_e.emb.g2p import encode as phonemize
+
+            if waveform.shape[0] == 2:
+                waveform = wav[:1]
+
+            quantized = quantize( waveform, sample_rate ).cpu()
+            torch.save(quantized, f'{indir}/valle/{file.replace(".wav",".qnt.pt")}')
+
+            phonemes = phonemize(normalized_text)
+            open(f'{indir}/valle/{file.replace(".wav",".phn.txt")}', 'w', encoding='utf-8').write(" ".join(phonemes))
+        except Exception as e:
+            print(e)
+            pass
+
     training_joined = "\n".join(lines['training'])
     validation_joined = "\n".join(lines['validation'])
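One thing to note in the added block: waveform = wav[:1] in the stereo branch refers to a name (wav) that does not exist in this scope, so it presumably should read waveform[:1]; as written, a stereo clip would raise a NameError and be skipped by the surrounding except. A standalone sketch of the same per-clip export under that assumption, reusing the vall_e helpers the hunk imports; the helper name is hypothetical:

import os

import torch
import torchaudio

from vall_e.emb.qnt import encode as quantize
from vall_e.emb.g2p import encode as phonemize

# Hypothetical helper mirroring the block added above.
def export_valle_sample(path, normalized_text, outdir):
    os.makedirs(outdir, exist_ok=True)
    name = os.path.splitext(os.path.basename(path))[0]

    # Load the sliced clip and downmix to mono if it is stereo.
    waveform, sample_rate = torchaudio.load(path)
    if waveform.shape[0] == 2:
        waveform = waveform[:1]

    # Quantize the audio with vall_e's encoder and store the token tensor.
    quantized = quantize(waveform, sample_rate).cpu()
    torch.save(quantized, f'{outdir}/{name}.qnt.pt')

    # Phonemize the normalized transcript and store it next to the audio tokens.
    phonemes = phonemize(normalized_text)
    with open(f'{outdir}/{name}.phn.txt', 'w', encoding='utf-8') as f:
        f.write(" ".join(phonemes))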
@@ -1538,21 +1575,27 @@ def save_training_settings( **kwargs ):
         settings['source_model'] = f"pretrain_model_gpt: '{settings['source_model']}'"
         settings['resume_state'] = f"# resume_state: '{settings['resume_state']}'"

-    with open(f'./models/.template.yaml', 'r', encoding="utf-8") as f:
-        yaml = f.read()
-
-    # i could just load and edit the YAML directly, but this is easier, as I don't need to bother with path traversals
-    for k in settings:
-        if settings[k] is None:
-            continue
-        yaml = yaml.replace(f"${{{k}}}", str(settings[k]))
-
-    outyaml = f'./training/{settings["voice"]}/train.yaml'
-    with open(outyaml, 'w', encoding="utf-8") as f:
-        f.write(yaml)
-
-    messages.append(f"Saved training output to: {outyaml}")
+    def use_template(template, out):
+        with open(template, 'r', encoding="utf-8") as f:
+            yaml = f.read()
+
+        # i could just load and edit the YAML directly, but this is easier, as I don't need to bother with path traversals
+        for k in settings:
+            if settings[k] is None:
+                continue
+            yaml = yaml.replace(f"${{{k}}}", str(settings[k]))
+
+        with open(out, 'w', encoding="utf-8") as f:
+            f.write(yaml)
+
+    use_template(f'./models/.template.dlas.yaml', f'./training/{settings["voice"]}/train.yaml')
+
+    settings['model_name'] = "ar"
+    use_template(f'./models/.template.valle.yaml', f'./training/{settings["voice"]}/ar.yaml')
+    settings['model_name'] = "nar"
+    use_template(f'./models/.template.valle.yaml', f'./training/{settings["voice"]}/nar.yaml')
+
+    messages.append(f"Saved training output")
     return settings, messages

 def import_voices(files, saveAs=None, progress=None):
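use_template only performs literal ${key} string replacement rather than YAML-aware editing, so any placeholder whose key is missing from settings, or whose value is None, is left verbatim in the output file. A rough illustration of what the VALL-E template resolves to, with hypothetical hyperparameter values:

# Hypothetical values; in practice they come from save_training_settings(**kwargs).
settings = {
    'voice': 'example_voice',
    'model_name': 'ar',
    'batch_size': 16,
    'validation_batch_size': 8,
    'validation_rate': 25,
}

with open('./models/.template.valle.yaml', 'r', encoding='utf-8') as f:
    yaml_text = f.read()

for k in settings:
    if settings[k] is None:
        continue
    yaml_text = yaml_text.replace(f"${{{k}}}", str(settings[k]))

# yaml_text now reads (abridged):
#   data_dirs: [./training/example_voice/valle/]
#   model: ar
#   batch_size: 16
#   eval_batch_size: 8
#   eval_every: 25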
@@ -1743,17 +1786,22 @@ def setup_args():
         'latents-lean-and-mean': True,
         'voice-fixer': False, # getting tired of long initialization times in a Colab for downloading a large dataset for it
         'voice-fixer-use-cuda': True,

         'force-cpu-for-conditioning-latents': False,
         'defer-tts-load': False,
         'device-override': None,
         'prune-nonfinal-outputs': True,
-        'vocoder-model': VOCODERS[-1],
         'concurrency-count': 2,
-        'autocalculate-voice-chunk-duration-size': 0,
+        'autocalculate-voice-chunk-duration-size': 10,

         'output-sample-rate': 44100,
         'output-volume': 1,

+        'tts-backend': TTSES[0],
+
         'autoregressive-model': None,
+        'vocoder-model': VOCODERS[-1],
+
         'whisper-backend': 'openai/whisper',
         'whisper-model': "base",
@@ -1792,6 +1840,7 @@ def setup_args():
     parser.add_argument("--output-sample-rate", type=int, default=default_arguments['output-sample-rate'], help="Sample rate to resample the output to (from 24KHz)")
     parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output")

+    parser.add_argument("--tts-backend", default=default_arguments['tts-backend'], help="Specifies which TTS backend to use.")
     parser.add_argument("--autoregressive-model", default=default_arguments['autoregressive-model'], help="Specifies which autoregressive model to use for sampling.")
     parser.add_argument("--whisper-backend", default=default_arguments['whisper-backend'], action='store_true', help="Picks which whisper backend to use (openai/whisper, lightmare/whispercpp)")
     parser.add_argument("--whisper-model", default=default_arguments['whisper-model'], help="Specifies which whisper model to use for transcription.")
@@ -1828,10 +1877,48 @@ def setup_args():

     return args

+def get_default_settings( hypenated=True ):
+    settings = {
+        'listen': None if not args.listen else args.listen,
+        'share': args.share,
+        'low-vram':args.low_vram,
+        'check-for-updates':args.check_for_updates,
+        'models-from-local-only':args.models_from_local_only,
+        'force-cpu-for-conditioning-latents': args.force_cpu_for_conditioning_latents,
+        'defer-tts-load': args.defer_tts_load,
+        'prune-nonfinal-outputs': args.prune_nonfinal_outputs,
+        'device-override': args.device_override,
+        'sample-batch-size': args.sample_batch_size,
+        'embed-output-metadata': args.embed_output_metadata,
+        'latents-lean-and-mean': args.latents_lean_and_mean,
+        'voice-fixer': args.voice_fixer,
+        'voice-fixer-use-cuda': args.voice_fixer_use_cuda,
+        'concurrency-count': args.concurrency_count,
+        'output-sample-rate': args.output_sample_rate,
+        'autocalculate-voice-chunk-duration-size': args.autocalculate_voice_chunk_duration_size,
+        'output-volume': args.output_volume,
+
+        'tts-backend': args.tts_backend,
+
+        'autoregressive-model': args.autoregressive_model,
+        'vocoder-model': args.vocoder_model,
+
+        'whisper-backend': args.whisper_backend,
+        'whisper-model': args.whisper_model,
+
+        'training-default-halfp': args.training_default_halfp,
+        'training-default-bnb': args.training_default_bnb,
+    }
+
+    res = {}
+    for k in settings:
+        res[k.replace("-", "_") if not hypenated else k] = settings[k]
+    return res
+
 def update_args( **kwargs ):
     global args

-    settings = {}
+    settings = get_default_settings(hypenated=False)
     settings.update(kwargs)

     args.listen = settings['listen']
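get_default_settings() is what lets the two rewrites below stay small: save_args_settings() wants the hyphenated keys that exec.json uses on disk, while update_args() wants underscored keys matching the attribute names it assigns back onto args; seeding update_args() with the current values means callers should only need to pass the settings they actually change. A small illustration; the example values mirror the defaults shown earlier, assuming nothing was overridden:

# Hyphenated keys, as save_args_settings() writes them to ./config/exec.json:
on_disk = get_default_settings()
# e.g. {'voice-fixer': False, 'concurrency-count': 2, 'whisper-model': 'base', ...}

# Underscored keys, matching what update_args() assigns back onto args:
in_memory = get_default_settings(hypenated=False)
# e.g. {'voice_fixer': False, 'concurrency_count': 2, 'whisper_model': 'base', ...}

# With the defaults pre-seeded, a partial update leaves the other settings at their current values:
update_args(whisper_model="large")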
@@ -1853,8 +1940,10 @@ def update_args( **kwargs ):
     args.autocalculate_voice_chunk_duration_size = settings['autocalculate_voice_chunk_duration_size']
     args.output_volume = settings['output_volume']

+    args.tts_backend = settings['tts_backend']
+
     args.autoregressive_model = settings['autoregressive_model']
     args.vocoder_model = settings['vocoder_model']

     args.whisper_backend = settings['whisper_backend']
     args.whisper_model = settings['whisper_model']
@@ -1865,34 +1954,7 @@ def update_args( **kwargs ):

 def save_args_settings():
     global args
-    settings = {
-        'listen': None if not args.listen else args.listen,
-        'share': args.share,
-        'low-vram':args.low_vram,
-        'check-for-updates':args.check_for_updates,
-        'models-from-local-only':args.models_from_local_only,
-        'force-cpu-for-conditioning-latents': args.force_cpu_for_conditioning_latents,
-        'defer-tts-load': args.defer_tts_load,
-        'prune-nonfinal-outputs': args.prune_nonfinal_outputs,
-        'device-override': args.device_override,
-        'sample-batch-size': args.sample_batch_size,
-        'embed-output-metadata': args.embed_output_metadata,
-        'latents-lean-and-mean': args.latents_lean_and_mean,
-        'voice-fixer': args.voice_fixer,
-        'voice-fixer-use-cuda': args.voice_fixer_use_cuda,
-        'concurrency-count': args.concurrency_count,
-        'output-sample-rate': args.output_sample_rate,
-        'autocalculate-voice-chunk-duration-size': args.autocalculate_voice_chunk_duration_size,
-        'output-volume': args.output_volume,
-
-        'autoregressive-model': args.autoregressive_model,
-        'vocoder-model': args.vocoder_model,
-        'whisper-backend': args.whisper_backend,
-        'whisper-model': args.whisper_model,
-
-        'training-default-halfp': args.training_default_halfp,
-        'training-default-bnb': args.training_default_bnb,
-    }
+    settings = get_default_settings()

     os.makedirs('./config/', exist_ok=True)
     with open(f'./config/exec.json', 'w', encoding="utf-8") as f:
@@ -2009,18 +2071,13 @@ def load_tts( restart=False, autoregressive_model=None ):
     if autoregressive_model == "auto":
         autoregressive_model = deduce_autoregressive_model()

-    print(f"Loading TorToiSe... (AR: {autoregressive_model}, vocoder: {args.vocoder_model})")
-
     if get_device_name() == "cpu":
         print("!!!! WARNING !!!! No GPU available in PyTorch. You may need to reinstall PyTorch.")

     tts_loading = True
-    try:
-        tts = TextToSpeech(minor_optimizations=not args.low_vram, autoregressive_model_path=autoregressive_model, vocoder_model=args.vocoder_model)
-    except Exception as e:
-        tts = TextToSpeech(minor_optimizations=not args.low_vram)
-        load_autoregressive_model(autoregressive_model)
-
+    print(f"Loading TorToiSe... (AR: {autoregressive_model}, vocoder: {args.vocoder_model})")
+    tts = TextToSpeech(minor_optimizations=not args.low_vram, autoregressive_model_path=autoregressive_model, vocoder_model=args.vocoder_model)
     tts_loading = False

     get_model_path('dvae.pth')
(third changed file; its name is not shown in this view)

@@ -548,11 +548,11 @@ def setup_gradio():
         EXEC_SETTINGS['autocalculate_voice_chunk_duration_size'] = gr.Number(label="Auto-Calculate Voice Chunk Duration (in seconds)", precision=0, value=args.autocalculate_voice_chunk_duration_size)
         EXEC_SETTINGS['output_volume'] = gr.Slider(label="Output Volume", minimum=0, maximum=2, value=args.output_volume)

+        # EXEC_SETTINGS['tts_backend'] = gr.Dropdown(TTSES, label="TTS Backend", value=args.tts_backend if args.tts_backend else TTSES[0])
+
         EXEC_SETTINGS['autoregressive_model'] = gr.Dropdown(choices=autoregressive_models, label="Autoregressive Model", value=args.autoregressive_model if args.autoregressive_model else autoregressive_models[0])

         EXEC_SETTINGS['vocoder_model'] = gr.Dropdown(VOCODERS, label="Vocoder", value=args.vocoder_model if args.vocoder_model else VOCODERS[-1])

         EXEC_SETTINGS['training_default_halfp'] = TRAINING_SETTINGS['half_p']
         EXEC_SETTINGS['training_default_bnb'] = TRAINING_SETTINGS['bitsandbytes']