small fixes

This commit is contained in:
mrq 2023-02-17 20:10:27 +00:00
parent f708909687
commit ad4adc960f
2 changed files with 15 additions and 8 deletions

View File

@ -12,4 +12,4 @@ This is not endorsed by [neonbjb](https://github.com/neonbjb/). I do not expect
## Documentation ## Documentation
Please consult [the wiki](https://git.ecker.tech/mrq/ai-voice-cloning/wiki) for documentation. Please consult [the wiki](https://git.ecker.tech/mrq/ai-voice-cloning/wiki) for the documentation, including how to install the software, prepare voices for it, and use it.

View File

@ -55,7 +55,7 @@ def setup_args():
'sample-batch-size': None, 'sample-batch-size': None,
'embed-output-metadata': True, 'embed-output-metadata': True,
'latents-lean-and-mean': True, 'latents-lean-and-mean': True,
'voice-fixer': True, 'voice-fixer': False, # getting tired of long initialization times in a Colab for downloading a large dataset for it
'voice-fixer-use-cuda': True, 'voice-fixer-use-cuda': True,
'force-cpu-for-conditioning-latents': False, 'force-cpu-for-conditioning-latents': False,
'device-override': None, 'device-override': None,
@ -167,7 +167,7 @@ def generate(
progress(0, desc="Loading voice...") progress(0, desc="Loading voice...")
voice_samples, conditioning_latents = load_voice(voice) voice_samples, conditioning_latents = load_voice(voice)
if voice_samples is not None: if voice_samples is not None and len(voice_samples) > 0:
sample_voice = torch.cat(voice_samples, dim=-1).squeeze().cpu() sample_voice = torch.cat(voice_samples, dim=-1).squeeze().cpu()
conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, slices=voice_latents_chunks, force_cpu=args.force_cpu_for_conditioning_latents) conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, slices=voice_latents_chunks, force_cpu=args.force_cpu_for_conditioning_latents)
@ -374,7 +374,7 @@ def generate(
with open(f'{outdir}/{voice}_{name}.json', 'w', encoding="utf-8") as f: with open(f'{outdir}/{voice}_{name}.json', 'w', encoding="utf-8") as f:
f.write(json.dumps(info, indent='\t') ) f.write(json.dumps(info, indent='\t') )
if args.voice_fixer and voicefixer: if args.voice_fixer and voicefixer is not None:
fixed_output_voices = [] fixed_output_voices = []
for path in progress.tqdm(output_voices, desc="Running voicefix..."): for path in progress.tqdm(output_voices, desc="Running voicefix..."):
fixed = path.replace(".wav", "_fixed.wav") fixed = path.replace(".wav", "_fixed.wav")
@ -409,6 +409,7 @@ def generate(
if 'latents' in info: if 'latents' in info:
del info['latents'] del info['latents']
os.makedirs('./config/', exist_ok=True)
with open(f'./config/generate.json', 'w', encoding="utf-8") as f: with open(f'./config/generate.json', 'w', encoding="utf-8") as f:
f.write(json.dumps(info, indent='\t') ) f.write(json.dumps(info, indent='\t') )
@ -422,13 +423,18 @@ def generate(
stats, stats,
) )
import subprocess
def run_training(config_path): def run_training(config_path):
print("Unloading TTS to save VRAM.")
global tts global tts
del tts del tts
tts = None tts = None
import subprocess cmd = ["python", "./src/train.py", "-opt", config_path]
subprocess.run(["python", "./src/train.py", "-opt", config_path], env=os.environ.copy(), shell=True, stdout=subprocess.PIPE)
print("Spawning process: ", " ".join(cmd))
subprocess.run(cmd, env=os.environ.copy(), shell=True, stdout=subprocess.STDOUT, stderr=subprocess.STDOUT)
""" """
from train import train from train import train
train(config) train(config)
@ -501,7 +507,7 @@ def prepare_dataset( files, outdir, language=None ):
for file in files: for file in files:
print(f"Transcribing file: {file}") print(f"Transcribing file: {file}")
result = whisper_model.transcribe(file, language=language) result = whisper_model.transcribe(file, language=language if language else "English")
results[os.path.basename(file)] = result results[os.path.basename(file)] = result
print(f"Transcribed file: {file}, {len(result['segments'])} found.") print(f"Transcribed file: {file}, {len(result['segments'])} found.")
@ -557,7 +563,7 @@ def import_voice(file, saveAs = None):
path = f"{outdir}/{os.path.basename(filename)}" path = f"{outdir}/{os.path.basename(filename)}"
waveform, sampling_rate = torchaudio.load(filename) waveform, sampling_rate = torchaudio.load(filename)
if args.voice_fixer: if args.voice_fixer and voicefixer is not None:
# resample to best bandwidth since voicefixer will do it anyways through librosa # resample to best bandwidth since voicefixer will do it anyways through librosa
if sampling_rate != 44100: if sampling_rate != 44100:
print(f"Resampling imported voice sample: {path}") print(f"Resampling imported voice sample: {path}")
@ -714,6 +720,7 @@ def export_exec_settings( listen, share, check_for_updates, models_from_local_on
'output-volume': args.output_volume, 'output-volume': args.output_volume,
} }
os.makedirs('./config/', exist_ok=True)
with open(f'./config/exec.json', 'w', encoding="utf-8") as f: with open(f'./config/exec.json', 'w', encoding="utf-8") as f:
f.write(json.dumps(settings, indent='\t') ) f.write(json.dumps(settings, indent='\t') )