forked from mrq/ai-voice-cloning

Compare commits: 287738a338...2c244c49ec (10 commits)

Commits (SHA1):
2c244c49ec
119ac50c58
da0af4c498
11a1f6a00e
12c51b6057
999878d9c6
14779a5020
0e3bbc55f8
788a957f79
5be14abc21
@@ -1,4 +1,3 @@
git+https://github.com/openai/whisper.git
more-itertools
ffmpeg-python
gradio
@@ -5,10 +5,10 @@ python -m venv venv
call .\venv\Scripts\activate.bat
python -m pip install --upgrade pip
python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117
python -m pip install -r .\dlas\requirements.txt
python -m pip install -r .\requirements.txt
python -m pip install -r .\tortoise-tts\requirements.txt
python -m pip install -e .\tortoise-tts\
python -m pip install -r .\requirements.txt
python -m pip install -r .\dlas\requirements.txt

xcopy .\dlas\bitsandbytes_windows\* .\venv\Lib\site-packages\bitsandbytes\. /Y
xcopy .\dlas\bitsandbytes_windows\cuda_setup\* .\venv\Lib\site-packages\bitsandbytes\cuda_setup\. /Y
@@ -9,9 +9,9 @@ python3 -m pip install --upgrade pip # just to be safe
# CUDA
pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu117
# install requirements
python3 -m pip install -r ./dlas/requirements.txt # install DLAS requirements
python3 -m pip install -r ./requirements.txt # install local requirements
python3 -m pip install -r ./tortoise-tts/requirements.txt # install TorToiSe requirements
python3 -m pip install -e ./tortoise-tts/ # install TorToiSe
python3 -m pip install -r ./requirements.txt # install local requirements
python3 -m pip install -r ./dlas/requirements.txt # install DLAS requirements, last, because whisperx will break a dependency here

deactivate
@@ -5,10 +5,10 @@ python -m venv venv
call .\venv\Scripts\activate.bat
python -m pip install --upgrade pip
python -m pip install torch torchvision torchaudio torch-directml
python -m pip install -r .\dlas\requirements.txt
python -m pip install -r .\requirements.txt
python -m pip install -r .\tortoise-tts\requirements.txt
python -m pip install -e .\tortoise-tts\
python -m pip install -r .\requirements.txt
python -m pip install -r .\dlas\requirements.txt

pause
deactivate
setup-guided.sh (new executable file, 54 lines)
@@ -0,0 +1,54 @@
#!/bin/bash

if ! command -v git &> /dev/null; then
if [[ "$(read -e -p 'Could not find git. Continue? [y/N]> '; echo $REPLY)" != [Yy]* ]]; then exit 1; fi
else
printf "git - ok\n"
fi

# TODO: This could be more precise. e.g. checking for python3, then python, etc.
if ! command -v python &> /dev/null; then
if [[ "$(read -e -p 'Could not find python. Continue? [y/N]> '; echo $REPLY)" != [Yy]* ]]; then exit 1; fi
else
python -c 'import sys; sys.stderr.write("Wrong python version.\n") if sys.version_info.major != 3 else sys.stderr.write("Python 3 - ok\n")'
fi

printf "Which GPU brand do you have?\n"
gpus=(nvidia amd)

gpu=""
while [ "$gpu" = "" ]; do
select gpu in $(printf '%s\n' ${gpus[@]}); do break; done
done

if [ $gpu = "nvidia" ]; then
./setup-cuda.sh
elif [ $gpu = "amd" ]; then
./setup-rocm.sh
fi

source ./venv/bin/activate

printf "Which Whisper backend would you like to use?\n"
whisper_backends=("openai/whisper" "m-bain/whisperx" "lightmare/whispercpp")

whisper_backend=""
while [ "$whisper_backend" = "" ]; do
select whisper_backend in $(printf '%s\n' ${whisper_backends[@]}); do break; done
done

if [ $whisper_backend = "openai/whisper" ]; then
python -m pip install git+https://github.com/openai/whisper.git
elif [ $whisper_backend = "m-bain/whisperx" ]; then
python -m pip install git+https://github.com/m-bain/whisperx.git
elif [ $whisper_backend = "lightmare/whispercpp" ]; then
# This depends on SemVer
# Git > v2.18 for `--sort`
# Git > v2.4 for `versionsort.suffix`
# For older versions:
# git ls-remote --refs --tags https://git.ecker.tech/lightmare/whispercpp.py | cut --delimiter='/' --fields=3 | tr '-' '~' | sort --version-sort | tail --lines=1
WHISPERCPP_LATEST=$(git -c 'versionsort.suffix=-' ls-remote --exit-code --refs --sort='version:refname' --tags https://git.ecker.tech/lightmare/whispercpp.py '*.*.*' | tail -n 1 | cut --delimiter='/' --fields=3)
python -m pip install git+https://git.ecker.tech/lightmare/whispercpp.py@$WHISPERCPP_LATEST
fi

deactivate
@@ -9,10 +9,10 @@ python3 -m pip install --upgrade pip # just to be safe
# ROCM
pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.1.1 # 5.2 does not work for me desu
# install requirements
python3 -m pip install -r ./dlas/requirements.txt # install DLAS requirements
python3 -m pip install -r ./requirements.txt # install local requirements
python3 -m pip install -r ./tortoise-tts/requirements.txt # install TorToiSe requirements
python3 -m pip install -e ./tortoise-tts/ # install TorToiSe
python3 -m pip install -r ./requirements.txt # install local requirements
python3 -m pip install -r ./dlas/requirements.txt # install DLAS requirements
# swap to ROCm version of BitsAndBytes
pip3 uninstall -y bitsandbytes
./setup-rocm-bnb.sh
src/utils.py (138 changed lines)
@@ -37,8 +37,11 @@ from tortoise.utils.text import split_and_recombine_text
from tortoise.utils.device import get_device_name, set_device_name

MODELS['dvae.pth'] = "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/3704aea61678e7e468a06d8eea121dba368a798e/.models/dvae.pth"
WHISPER_MODELS = ["tiny", "base", "small", "medium", "large"]
WHISPER_MODELS = ["tiny", "base", "small", "medium", "large", "large-v2"]
WHISPER_SPECIALIZED_MODELS = ["tiny.en", "base.en", "small.en", "medium.en"]
WHISPER_BACKENDS = ["openai/whisper", "lightmare/whispercpp", "m-bain/whisperx"]

EPOCH_SCHEDULE = [ 9, 18, 25, 33 ]

args = None
@@ -233,7 +236,7 @@ def generate(
if emotion == "Custom":
if prompt and prompt.strip() != "":
cut_text = f"[{prompt},] {cut_text}"
else:
elif emotion != "None":
cut_text = f"[I am really {emotion.lower()},] {cut_text}"

progress.msg_prefix = f'[{str(line+1)}/{str(len(texts))}]'
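For reference, a minimal standalone sketch of the emotion-prefix logic introduced above, assuming the same f-string formatting; the function name and sample strings are illustrative only.

    def apply_emotion_prefix(cut_text, emotion, prompt=""):
        # mirrors the branch above: a non-empty custom prompt wins,
        # "None" leaves the text untouched, anything else injects "I am really <emotion>"
        if emotion == "Custom":
            if prompt and prompt.strip() != "":
                return f"[{prompt},] {cut_text}"
        elif emotion != "None":
            return f"[I am really {emotion.lower()},] {cut_text}"
        return cut_text

    # apply_emotion_prefix("Hello there.", "Happy") -> "[I am really happy,] Hello there."
    # apply_emotion_prefix("Hello there.", "None")  -> "Hello there."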
@@ -464,14 +467,21 @@ def update_baseline_for_latents_chunks( voice ):
return 1

files = os.listdir(path)

total = 0
total_duration = 0

for file in files:
if file[-4:] != ".wav":
continue

metadata = torchaudio.info(f'{path}/{file}')
duration = metadata.num_channels * metadata.num_frames / metadata.sample_rate
total_duration += duration
total = total + 1

if args.autocalculate_voice_chunk_duration_size == 0:
return int(total_duration / total) if total > 0 else 1
return int(total_duration / args.autocalculate_voice_chunk_duration_size) if total_duration > 0 else 1

def compute_latents(voice, voice_latents_chunks, progress=gr.Progress(track_tqdm=True)):
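The chunk heuristic above can be read in isolation as follows: a rough sketch assuming a folder of .wav files and the torchaudio metadata fields used above, with duration_size standing in for args.autocalculate_voice_chunk_duration_size (the hunk also multiplies by num_channels, which is left out here for simplicity).

    import os
    import torchaudio

    def estimate_voice_chunks(path, duration_size=0):
        total, total_duration = 0, 0.0
        for name in os.listdir(path):
            if not name.endswith(".wav"):
                continue
            meta = torchaudio.info(os.path.join(path, name))
            total_duration += meta.num_frames / meta.sample_rate  # seconds per clip
            total += 1
        if duration_size == 0:
            # default: average clip length, i.e. roughly one chunk per clip
            return int(total_duration / total) if total > 0 else 1
        # otherwise: one chunk per duration_size seconds of audio
        return int(total_duration / duration_size) if total_duration > 0 else 1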
@@ -550,6 +560,8 @@ class TrainingState():
self.eta = "?"
self.eta_hhmmss = "?"

self.nan_detected = False

self.last_info_check_at = 0
self.statistics = []
self.losses = []
@@ -701,13 +713,10 @@ class TrainingState():
info_line = line.split("INFO:")[-1]
# to-do, actually validate this works, and probably kill training when it's found, the model's dead by this point
if ': nan' in info_line:
should_return = True

print("! NAN DETECTED !")
self.buffer.append("! NAN DETECTED !")
self.nan_detected = True

# easily rip out our stats...
match = re.findall(r'\b([a-z_0-9]+?)\b: +?([0-9]\.[0-9]+?e[+-]\d+|[\d,]+)\b', info_line)
match = re.findall(r'\b([a-z_0-9]+?)\b: *?([0-9]\.[0-9]+?e[+-]\d+|[\d,]+)\b', info_line)
if match and len(match) > 0:
for k, v in match:
self.info[k] = float(v.replace(",", ""))
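The regex change above relaxes the separator after the colon from one-or-more spaces (`+?`) to zero-or-more (`*?`). A quick check of what it extracts, using a made-up INFO line:

    import re

    info_line = "iter: 1,000, loss_text_ce: 2.4e-01, loss_mel_ce: 5.1e-01"
    matches = re.findall(r'\b([a-z_0-9]+?)\b: *?([0-9]\.[0-9]+?e[+-]\d+|[\d,]+)\b', info_line)
    # [('iter', '1,000'), ('loss_text_ce', '2.4e-01'), ('loss_mel_ce', '5.1e-01')]
    stats = {k: float(v.replace(",", "")) for k, v in matches}
    # {'iter': 1000.0, 'loss_text_ce': 0.24, 'loss_mel_ce': 0.51}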
@@ -862,6 +871,8 @@ class TrainingState():
self.metrics['loss'] = ", ".join(self.metrics['loss'])

message = f"[{self.metrics['step']}] [{self.metrics['rate']}] [ETA: {eta_hhmmss}]\n[{self.metrics['loss']}]"
if self.nan_detected:
message = f"[!NaN DETECTED!] {message}"

if message:
percent = self.it / float(self.its) # self.epoch / float(self.epochs)
@@ -916,12 +927,6 @@ def run_training(config_path, verbose=False, gpus=1, keep_x_past_datasets=0, pro
return_code = training_state.process.wait()
training_state = None

def get_training_losses():
global training_state
if not training_state or not training_state.statistics:
return
return pd.DataFrame(training_state.statistics)

def update_training_dataplot(config_path=None):
global training_state
update = None
@@ -930,12 +935,12 @@ def update_training_dataplot(config_path=None):
if config_path:
training_state = TrainingState(config_path=config_path, start=False)
if training_state.statistics:
update = gr.LinePlot.update(value=pd.DataFrame(training_state.statistics))
update = gr.LinePlot.update(value=pd.DataFrame(training_state.statistics), x_lim=[0,training_state.its], x="step", y="value", title="Training Metrics", color="type", tooltip=['step', 'value', 'type'], width=600, height=350,)
del training_state
training_state = None
elif training_state.statistics:
training_state.load_losses()
update = gr.LinePlot.update(value=pd.DataFrame(training_state.statistics))
update = gr.LinePlot.update(value=pd.DataFrame(training_state.statistics), x_lim=[0,training_state.its], x="step", y="value", title="Training Metrics", color="type", tooltip=['step', 'value', 'type'], width=600, height=350,)

return update
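The x="step", y="value", color="type" arguments above imply that training_state.statistics holds one record per metric per step. A hedged sketch of that shape, with made-up numbers:

    import pandas as pd

    # hypothetical per-step records; the real values come from the parsed training log
    statistics = [
        {"step": 100, "value": 2.31, "type": "loss_text_ce"},
        {"step": 100, "value": 0.98, "type": "loss_mel_ce"},
        {"step": 200, "value": 2.10, "type": "loss_text_ce"},
        {"step": 200, "value": 0.91, "type": "loss_mel_ce"},
    ]
    df = pd.DataFrame(statistics)
    # gr.LinePlot.update(value=df, x="step", y="value", color="type", ...) then draws one line per metric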
@@ -965,7 +970,6 @@ def stop_training():
try:
children = [p.info for p in psutil.process_iter(attrs=['pid', 'name', 'cmdline']) if './src/train.py' in p.info['cmdline']]
except Exception as e:
print(e)
pass

training_state.process.stdout.close()
@@ -1000,30 +1004,46 @@ def whisper_transcribe( file, language=None ):
if not whisper_model:
load_whisper_model(language=language)

if not args.whisper_cpp:
if args.whisper_backend == "openai/whisper":
if not language:
language = None

return whisper_model.transcribe(file, language=language)

res = whisper_model.transcribe(file)
segments = whisper_model.extract_text_and_timestamps( res )
elif args.whisper_backend == "lightmare/whispercpp":
res = whisper_model.transcribe(file)
segments = whisper_model.extract_text_and_timestamps( res )

result = {
'segments': []
}
for segment in segments:
reparsed = {
'start': segment[0] / 100.0,
'end': segment[1] / 100.0,
'text': segment[2],
result = {
'segments': []
}
result['segments'].append(reparsed)
for segment in segments:
reparsed = {
'start': segment[0] / 100.0,
'end': segment[1] / 100.0,
'text': segment[2],
}
result['segments'].append(reparsed)

return result
return result

# credit to https://git.ecker.tech/yqxtqymn for the busywork of getting this added
elif args.whisper_backend == "m-bain/whisperx":
import whisperx
device = "cuda" if get_device_name() == "cuda" else "cpu"
result = whisper_model.transcribe(file)
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result_aligned = whisperx.align(result["segments"], model_a, metadata, file, device)

def prepare_dataset( files, outdir, language=None, progress=None ):
for i in range(len(result_aligned['segments'])):
del result_aligned['segments'][i]['word-segments']
del result_aligned['segments'][i]['char-segments']

result['segments'] = result_aligned['segments']

return result

def prepare_dataset( files, outdir, language=None, skip_existings=False, progress=None ):
unload_tts()

global whisper_model
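The whispercpp branch above converts extract_text_and_timestamps() tuples into the segment dicts the rest of the code expects; the division by 100.0 suggests timestamps arrive in centiseconds. A self-contained sketch of that conversion:

    def reparse_whispercpp_segments(segments):
        # segments: iterable of (start, end, text) tuples, timestamps in centiseconds
        result = {'segments': []}
        for start, end, text in segments:
            result['segments'].append({
                'start': start / 100.0,
                'end': end / 100.0,
                'text': text,
            })
        return result

    # reparse_whispercpp_segments([(0, 250, "hello world")])
    # -> {'segments': [{'start': 0.0, 'end': 2.5, 'text': 'hello world'}]}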
@@ -1034,9 +1054,30 @@ def prepare_dataset( files, outdir, language=None, progress=None ):
results = {}
transcription = []
files = sorted(files)

previous_list = []
parsed_list = []
if skip_existings and os.path.exists(f'{outdir}/train.txt'):
with open(f'{outdir}/train.txt', 'r', encoding="utf-8") as f:
parsed_list = f.readlines()

for line in parsed_list:
match = re.findall(r"^(.+?)_\d+\.wav$", line.split("|")[0])
print(match)
if match is None or len(match) == 0:
continue

if match[0] not in previous_list:
previous_list.append(f'{match[0]}.wav')

for file in enumerate_progress(files, desc="Iterating through voice files", progress=progress):
basename = os.path.basename(file)

if basename in previous_list:
print(f"Skipping already parsed file: {basename}")
continue

result = whisper_transcribe(file, language=language)
results[basename] = result
print(f"Transcribed file: {file}, {len(result['segments'])} found.")
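The skip-existing logic above recovers the source voice files from lines already written to train.txt. Assuming the `<name>_<index>.wav|<text>` line layout the regex implies, a worked example (the voice name is made up):

    import re

    line = "myvoice_3.wav|some previously transcribed text"
    wav = line.split("|")[0]                      # "myvoice_3.wav"
    match = re.findall(r"^(.+?)_\d+\.wav$", wav)  # ["myvoice"]
    if match:
        already_done = f"{match[0]}.wav"          # "myvoice.wav" is skipped on the next run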
@@ -1063,9 +1104,14 @@ def prepare_dataset( files, outdir, language=None, progress=None ):
transcription.append(line)
with open(f'{outdir}/train.txt', 'a', encoding="utf-8") as f:
f.write(f'{line}\n')

do_gc()

with open(f'{outdir}/whisper.json', 'w', encoding="utf-8") as f:
f.write(json.dumps(results, indent='\t'))

if len(parsed_list) > 0:
transcription = parsed_list + transcription

joined = '\n'.join(transcription)
with open(f'{outdir}/train.txt', 'w', encoding="utf-8") as f:
@@ -1419,13 +1465,13 @@ def setup_args():
'prune-nonfinal-outputs': True,
'use-bigvgan-vocoder': True,
'concurrency-count': 2,
'autocalculate-voice-chunk-duration-size': 10,
'autocalculate-voice-chunk-duration-size': 0,
'output-sample-rate': 44100,
'output-volume': 1,

'autoregressive-model': None,
'whisper-backend': 'openai/whisper',
'whisper-model': "base",
'whisper-cpp': False,

'training-default-halfp': False,
'training-default-bnb': True,
@@ -1463,8 +1509,8 @@ def setup_args():
parser.add_argument("--output-volume", type=float, default=default_arguments['output-volume'], help="Adjusts volume of output")

parser.add_argument("--autoregressive-model", default=default_arguments['autoregressive-model'], help="Specifies which autoregressive model to use for sampling.")
parser.add_argument("--whisper-backend", default=default_arguments['whisper-backend'], action='store_true', help="Picks which whisper backend to use (openai/whisper, lightmare/whispercpp, m-bain/whisperx)")
parser.add_argument("--whisper-model", default=default_arguments['whisper-model'], help="Specifies which whisper model to use for transcription.")
parser.add_argument("--whisper-cpp", default=default_arguments['whisper-cpp'], action='store_true', help="Leverages lightmare/whispercpp for transcription")

parser.add_argument("--training-default-halfp", action='store_true', default=default_arguments['training-default-halfp'], help="Training default: halfp")
parser.add_argument("--training-default-bnb", action='store_true', default=default_arguments['training-default-bnb'], help="Training default: bnb")
@@ -1498,7 +1544,7 @@ def setup_args():
return args

def update_args( listen, share, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, voice_fixer, voice_fixer_use_cuda, force_cpu_for_conditioning_latents, defer_tts_load, prune_nonfinal_outputs, use_bigvgan_vocoder, device_override, sample_batch_size, concurrency_count, autocalculate_voice_chunk_duration_size, output_volume, autoregressive_model, whisper_model, whisper_cpp, training_default_halfp, training_default_bnb ):
def update_args( listen, share, check_for_updates, models_from_local_only, low_vram, embed_output_metadata, latents_lean_and_mean, voice_fixer, voice_fixer_use_cuda, force_cpu_for_conditioning_latents, defer_tts_load, prune_nonfinal_outputs, use_bigvgan_vocoder, device_override, sample_batch_size, concurrency_count, autocalculate_voice_chunk_duration_size, output_volume, autoregressive_model, whisper_backend, whisper_model, training_default_halfp, training_default_bnb ):
global args

args.listen = listen
@@ -1522,8 +1568,8 @@ def update_args( listen, share, check_for_updates, models_from_local_only, low_v
args.output_volume = output_volume

args.autoregressive_model = autoregressive_model
args.whisper_backend = whisper_backend
args.whisper_model = whisper_model
args.whisper_cpp = whisper_cpp

args.training_default_halfp = training_default_halfp
args.training_default_bnb = training_default_bnb
@@ -1554,8 +1600,8 @@ def save_args_settings():
'output-volume': args.output_volume,

'autoregressive-model': args.autoregressive_model,
'whisper-backend': args.whisper_backend,
'whisper-model': args.whisper_model,
'whisper-cpp': args.whisper_cpp,

'training-default-halfp': args.training_default_halfp,
'training-default-bnb': args.training_default_bnb,
@@ -1760,6 +1806,12 @@ def unload_voicefixer():

def load_whisper_model(language=None, model_name=None, progress=None):
global whisper_model

if args.whisper_backend not in WHISPER_BACKENDS:
raise Exception(f"unavailable backend: {args.whisper_backend}")

if args.whisper_backend != "m-bain/whisperx" and model_name == "large-v2":
raise Exception("large-v2 is only available for m-bain/whisperx backend")

if not model_name:
model_name = args.whisper_model
@@ -1773,16 +1825,20 @@ def load_whisper_model(language=None, model_name=None, progress=None):

notify_progress(f"Loading Whisper model: {model_name}", progress)

if args.whisper_cpp:
if args.whisper_backend == "openai/whisper":
import whisper
whisper_model = whisper.load_model(model_name)
elif args.whisper_backend == "lightmare/whispercpp":
from whispercpp import Whisper
if not language:
language = 'auto'

b_lang = language.encode('ascii')
whisper_model = Whisper(model_name, models_dir='./models/', language=b_lang)
else:
import whisper
whisper_model = whisper.load_model(model_name)
elif args.whisper_backend == "m-bain/whisperx":
import whisperx
device = "cuda" if get_device_name() == "cuda" else "cpu"
whisper_model = whisperx.load_model(model_name, device)

print("Loaded Whisper model")
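The three loader branches above could equally be written as a dispatch table keyed by backend name; a minimal sketch under the same assumptions (helper names are hypothetical, and get_device_name comes from the tortoise device utilities imported earlier):

    def _load_openai(model_name, language=None):
        import whisper
        return whisper.load_model(model_name)

    def _load_whispercpp(model_name, language=None):
        from whispercpp import Whisper
        lang = (language or 'auto').encode('ascii')
        return Whisper(model_name, models_dir='./models/', language=lang)

    def _load_whisperx(model_name, language=None):
        import whisperx
        device = "cuda" if get_device_name() == "cuda" else "cpu"
        return whisperx.load_model(model_name, device)

    WHISPER_LOADERS = {
        "openai/whisper": _load_openai,
        "lightmare/whispercpp": _load_whispercpp,
        "m-bain/whisperx": _load_whisperx,
    }
    # whisper_model = WHISPER_LOADERS[args.whisper_backend](model_name, language)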
@@ -1794,4 +1850,4 @@ def unload_whisper():
whisper_model = None
print("Unloaded Whisper")

do_gc()
do_gc()
src/webui.py (59 changed lines)
@@ -180,13 +180,13 @@ def read_generate_settings_proxy(file, saveAs='.temp'):

return (
gr.update(value=j, visible=j is not None),
gr.update(visible=j is not None),
gr.update(value=latents, visible=latents is not None),
None if j is None else j['voice']
None if j is None else j['voice'],
gr.update(visible=j is not None),
)

def prepare_dataset_proxy( voice, language, progress=gr.Progress(track_tqdm=True) ):
return prepare_dataset( get_voices(load_latents=False)[voice], outdir=f"./training/{voice}/", language=language, progress=progress )
def prepare_dataset_proxy( voice, language, skip_existings, progress=gr.Progress(track_tqdm=True) ):
return prepare_dataset( get_voices(load_latents=False)[voice], outdir=f"./training/{voice}/", language=language, skip_existings=skip_existings, progress=progress )

def optimize_training_settings_proxy( *args, **kwargs ):
tup = optimize_training_settings(*args, **kwargs)
@@ -378,15 +378,15 @@ def setup_gradio():
with gr.Tab("Generate"):
with gr.Row():
with gr.Column():
text = gr.Textbox(lines=4, label="Prompt")
text = gr.Textbox(lines=4, label="Input Prompt")
with gr.Row():
with gr.Column():
delimiter = gr.Textbox(lines=1, label="Line Delimiter", placeholder="\\n")

emotion = gr.Radio( ["Happy", "Sad", "Angry", "Disgusted", "Arrogant", "Custom"], value="Custom", label="Emotion", type="value", interactive=True )
prompt = gr.Textbox(lines=1, label="Custom Emotion + Prompt (if selected)")
emotion = gr.Radio( ["Happy", "Sad", "Angry", "Disgusted", "Arrogant", "Custom", "None"], value="None", label="Emotion", type="value", interactive=True )
prompt = gr.Textbox(lines=1, label="Custom Emotion")
voice = gr.Dropdown(choices=voice_list_with_defaults, label="Voice", type="value", value=voice_list_with_defaults[0]) # it'd be very cash money if gradio was able to default to the first value in the list without this shit
mic_audio = gr.Audio( label="Microphone Source", source="microphone", type="filepath" )
mic_audio = gr.Audio( label="Microphone Source", source="microphone", type="filepath", visible=False )
voice_latents_chunks = gr.Slider(label="Voice Chunks", minimum=1, maximum=128, value=1, step=1)
with gr.Row():
refresh_voices = gr.Button(value="Refresh Voice List")
@@ -397,6 +397,11 @@ def setup_gradio():
inputs=voice,
outputs=voice_latents_chunks
)
voice.change(
fn=lambda value: gr.update(visible=value == "microphone"),
inputs=voice,
outputs=mic_audio,
)
with gr.Column():
candidates = gr.Slider(value=1, minimum=1, maximum=6, step=1, label="Candidates")
seed = gr.Number(value=0, precision=0, label="Seed")
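The voice.change handler above is the same show/hide idiom used later for the Custom emotion prompt and the metadata column: a component's .change event returns gr.update(visible=...) for another component. A minimal standalone sketch with made-up component names:

    import gradio as gr

    with gr.Blocks() as demo:
        source = gr.Radio(["microphone", "file"], value="file", label="Source")
        mic = gr.Audio(source="microphone", type="filepath", visible=False)
        # reveal the microphone input only when "microphone" is selected
        source.change(fn=lambda value: gr.update(visible=value == "microphone"),
                      inputs=source, outputs=mic)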
@@ -406,16 +411,17 @@ def setup_gradio():
diffusion_iterations = gr.Slider(value=128, minimum=0, maximum=512, step=1, label="Iterations")

temperature = gr.Slider(value=0.2, minimum=0, maximum=1, step=0.1, label="Temperature")
breathing_room = gr.Slider(value=8, minimum=1, maximum=32, step=1, label="Pause Size")
diffusion_sampler = gr.Radio(
["P", "DDIM"], # + ["K_Euler_A", "DPM++2M"],
value="P", label="Diffusion Samplers", type="value" )
show_experimental_settings = gr.Checkbox(label="Show Experimental Settings")
reset_generation_settings_button = gr.Button(value="Reset to Default")
with gr.Column(visible=False) as col:
experimental_column = col

experimental_checkboxes = gr.CheckboxGroup(["Half Precision", "Conditioning-Free"], value=["Conditioning-Free"], label="Experimental Flags")
breathing_room = gr.Slider(value=8, minimum=1, maximum=32, step=1, label="Pause Size")
diffusion_sampler = gr.Radio(
["P", "DDIM"], # + ["K_Euler_A", "DPM++2M"],
value="DDIM", label="Diffusion Samplers", type="value"
)
cvvp_weight = gr.Slider(value=0, minimum=0, maximum=1, label="CVVP Weight")
top_p = gr.Slider(value=0.8, minimum=0, maximum=1, label="Top P")
diffusion_temperature = gr.Slider(value=1.0, minimum=0, maximum=1, label="Diffusion Temperature")
@@ -460,17 +466,20 @@ def setup_gradio():
audio_in = gr.Files(type="file", label="Audio Input", file_types=["audio"])
import_voice_name = gr.Textbox(label="Voice Name")
import_voice_button = gr.Button(value="Import Voice")
with gr.Column():
metadata_out = gr.JSON(label="Audio Metadata", visible=False)
copy_button = gr.Button(value="Copy Settings", visible=False)
latents_out = gr.File(type="binary", label="Voice Latents", visible=False)
with gr.Column(visible=False) as col:
utilities_metadata_column = col

metadata_out = gr.JSON(label="Audio Metadata")
copy_button = gr.Button(value="Copy Settings")
latents_out = gr.File(type="binary", label="Voice Latents")
with gr.Tab("Training"):
with gr.Tab("Prepare Dataset"):
with gr.Row():
with gr.Column():
dataset_settings = [
gr.Dropdown( choices=voice_list, label="Dataset Source", type="value", value=voice_list[0] if len(voice_list) > 0 else "" ),
gr.Textbox(label="Language", value="en")
gr.Textbox(label="Language", value="en"),
gr.Checkbox(label="Skip Already Transcribed", value=False)
]
prepare_dataset_button = gr.Button(value="Prepare")
with gr.Column():
@@ -581,10 +590,10 @@ def setup_gradio():

autoregressive_model_dropdown = gr.Dropdown(choices=autoregressive_models, label="Autoregressive Model", value=args.autoregressive_model if args.autoregressive_model else autoregressive_models[0])

whisper_backend = gr.Dropdown(WHISPER_BACKENDS, label="Whisper Backends", value=args.whisper_backend)
whisper_model_dropdown = gr.Dropdown(WHISPER_MODELS, label="Whisper Model", value=args.whisper_model)
use_whisper_cpp = gr.Checkbox(label="Use Whisper.cpp", value=args.whisper_cpp)

exec_inputs = exec_inputs + [ autoregressive_model_dropdown, whisper_model_dropdown, use_whisper_cpp, training_halfp, training_bnb ]
exec_inputs = exec_inputs + [ autoregressive_model_dropdown, whisper_backend, whisper_model_dropdown, training_halfp, training_bnb ]

with gr.Row():
autoregressive_models_update_button = gr.Button(value="Refresh Model List")
@@ -662,9 +671,9 @@ def setup_gradio():
inputs=audio_in,
outputs=[
metadata_out,
copy_button,
latents_out,
import_voice_name
import_voice_name,
utilities_metadata_column,
]
)
@@ -697,9 +706,10 @@ def setup_gradio():
outputs=voice,
)

prompt.change(fn=lambda value: gr.update(value="Custom"),
inputs=prompt,
outputs=emotion
emotion.change(
fn=lambda value: gr.update(visible=value == "Custom"),
inputs=emotion,
outputs=prompt
)
mic_audio.change(fn=lambda value: gr.update(value="microphone"),
inputs=mic_audio,
@@ -723,6 +733,7 @@ def setup_gradio():
submit_event = submit.click(run_generation,
inputs=input_settings,
outputs=[output_audio, source_sample, candidates_list, generation_results],
api_name="generate",
)
start.sh (2 changed lines)
@@ -1,4 +1,6 @@
#!/bin/bash
if [ ! -d "venv" ]; then ./setup-guided.sh; fi

source ./venv/bin/activate
python3 ./src/main.py "$@"
deactivate
@@ -6,10 +6,10 @@ python -m venv venv
call .\venv\Scripts\activate.bat

python -m pip install --upgrade pip
python -m pip install -U -r .\dlas\requirements.txt
python -m pip install -U -r .\requirements.txt
python -m pip install -U -r .\tortoise-tts\requirements.txt
python -m pip install -U -e .\tortoise-tts
python -m pip install -U -r .\requirements.txt
python -m pip install -U -r .\dlas\requirements.txt

pause
deactivate
@@ -9,9 +9,9 @@ python3 -m venv venv
source ./venv/bin/activate

python3 -m pip install --upgrade pip
python3 -m pip install -r ./dlas/requirements.txt
python3 -m pip install -r ./requirements.txt
python3 -m pip install -r ./tortoise-tts/requirements.txt
python3 -m pip install -e ./tortoise-tts
python3 -m pip install -r ./requirements.txt
python3 -m pip install -r ./dlas/requirements.txt

deactivate
@@ -3,5 +3,6 @@ git pull
git submodule update --remote

if python -m pip show whispercpp &>/dev/null; then python -m pip install -U git+https://git.ecker.tech/lightmare/whispercpp.py; fi
if python -m pip show whisperx &>/dev/null; then python -m pip install -U git+https://github.com/m-bain/whisperx.git; fi

deactivate