forked from mrq/ai-voice-cloning
small fixes
This commit is contained in:
parent
f708909687
commit
ad4adc960f
|
@ -12,4 +12,4 @@ This is not endorsed by [neonbjb](https://github.com/neonbjb/). I do not expect
|
||||||
|
|
||||||
## Documentation
|
## Documentation
|
||||||
|
|
||||||
Please consult [the wiki](https://git.ecker.tech/mrq/ai-voice-cloning/wiki) for documentation.
|
Please consult [the wiki](https://git.ecker.tech/mrq/ai-voice-cloning/wiki) for the documentation, including how to install, prepare voices for, and use the software.
|
21
src/utils.py
21
src/utils.py
|
@ -55,7 +55,7 @@ def setup_args():
|
||||||
'sample-batch-size': None,
|
'sample-batch-size': None,
|
||||||
'embed-output-metadata': True,
|
'embed-output-metadata': True,
|
||||||
'latents-lean-and-mean': True,
|
'latents-lean-and-mean': True,
|
||||||
'voice-fixer': True,
|
'voice-fixer': False, # getting tired of long initialization times in a Colab for downloading a large dataset for it
|
||||||
'voice-fixer-use-cuda': True,
|
'voice-fixer-use-cuda': True,
|
||||||
'force-cpu-for-conditioning-latents': False,
|
'force-cpu-for-conditioning-latents': False,
|
||||||
'device-override': None,
|
'device-override': None,
|
||||||
|
@ -167,7 +167,7 @@ def generate(
|
||||||
progress(0, desc="Loading voice...")
|
progress(0, desc="Loading voice...")
|
||||||
voice_samples, conditioning_latents = load_voice(voice)
|
voice_samples, conditioning_latents = load_voice(voice)
|
||||||
|
|
||||||
if voice_samples is not None:
|
if voice_samples is not None and len(voice_samples) > 0:
|
||||||
sample_voice = torch.cat(voice_samples, dim=-1).squeeze().cpu()
|
sample_voice = torch.cat(voice_samples, dim=-1).squeeze().cpu()
|
||||||
|
|
||||||
conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, slices=voice_latents_chunks, force_cpu=args.force_cpu_for_conditioning_latents)
|
conditioning_latents = tts.get_conditioning_latents(voice_samples, return_mels=not args.latents_lean_and_mean, progress=progress, slices=voice_latents_chunks, force_cpu=args.force_cpu_for_conditioning_latents)
|
||||||
|
@ -374,7 +374,7 @@ def generate(
|
||||||
with open(f'{outdir}/{voice}_{name}.json', 'w', encoding="utf-8") as f:
|
with open(f'{outdir}/{voice}_{name}.json', 'w', encoding="utf-8") as f:
|
||||||
f.write(json.dumps(info, indent='\t') )
|
f.write(json.dumps(info, indent='\t') )
|
||||||
|
|
||||||
if args.voice_fixer and voicefixer:
|
if args.voice_fixer and voicefixer is not None:
|
||||||
fixed_output_voices = []
|
fixed_output_voices = []
|
||||||
for path in progress.tqdm(output_voices, desc="Running voicefix..."):
|
for path in progress.tqdm(output_voices, desc="Running voicefix..."):
|
||||||
fixed = path.replace(".wav", "_fixed.wav")
|
fixed = path.replace(".wav", "_fixed.wav")
|
||||||
|
@ -409,6 +409,7 @@ def generate(
|
||||||
if 'latents' in info:
|
if 'latents' in info:
|
||||||
del info['latents']
|
del info['latents']
|
||||||
|
|
||||||
|
os.makedirs('./config/', exist_ok=True)
|
||||||
with open(f'./config/generate.json', 'w', encoding="utf-8") as f:
|
with open(f'./config/generate.json', 'w', encoding="utf-8") as f:
|
||||||
f.write(json.dumps(info, indent='\t') )
|
f.write(json.dumps(info, indent='\t') )
|
||||||
|
|
||||||
|
@ -422,13 +423,18 @@ def generate(
|
||||||
stats,
|
stats,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
import subprocess
|
||||||
|
|
||||||
def run_training(config_path):
|
def run_training(config_path):
|
||||||
|
print("Unloading TTS to save VRAM.")
|
||||||
global tts
|
global tts
|
||||||
del tts
|
del tts
|
||||||
tts = None
|
tts = None
|
||||||
|
|
||||||
import subprocess
|
cmd = ["python", "./src/train.py", "-opt", config_path]
|
||||||
subprocess.run(["python", "./src/train.py", "-opt", config_path], env=os.environ.copy(), shell=True, stdout=subprocess.PIPE)
|
|
||||||
|
print("Spawning process: ", " ".join(cmd))
|
||||||
|
subprocess.run(cmd, env=os.environ.copy(), shell=True, stdout=subprocess.STDOUT, stderr=subprocess.STDOUT)
|
||||||
"""
|
"""
|
||||||
from train import train
|
from train import train
|
||||||
train(config)
|
train(config)
|
||||||
|
@ -501,7 +507,7 @@ def prepare_dataset( files, outdir, language=None ):
|
||||||
for file in files:
|
for file in files:
|
||||||
print(f"Transcribing file: {file}")
|
print(f"Transcribing file: {file}")
|
||||||
|
|
||||||
result = whisper_model.transcribe(file, language=language)
|
result = whisper_model.transcribe(file, language=language if language else "English")
|
||||||
results[os.path.basename(file)] = result
|
results[os.path.basename(file)] = result
|
||||||
|
|
||||||
print(f"Transcribed file: {file}, {len(result['segments'])} found.")
|
print(f"Transcribed file: {file}, {len(result['segments'])} found.")
|
||||||
|
@ -557,7 +563,7 @@ def import_voice(file, saveAs = None):
|
||||||
path = f"{outdir}/{os.path.basename(filename)}"
|
path = f"{outdir}/{os.path.basename(filename)}"
|
||||||
waveform, sampling_rate = torchaudio.load(filename)
|
waveform, sampling_rate = torchaudio.load(filename)
|
||||||
|
|
||||||
if args.voice_fixer:
|
if args.voice_fixer and voicefixer is not None:
|
||||||
# resample to best bandwidth since voicefixer will do it anyways through librosa
|
# resample to best bandwidth since voicefixer will do it anyways through librosa
|
||||||
if sampling_rate != 44100:
|
if sampling_rate != 44100:
|
||||||
print(f"Resampling imported voice sample: {path}")
|
print(f"Resampling imported voice sample: {path}")
|
||||||
|
@ -714,6 +720,7 @@ def export_exec_settings( listen, share, check_for_updates, models_from_local_on
|
||||||
'output-volume': args.output_volume,
|
'output-volume': args.output_volume,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
os.makedirs('./config/', exist_ok=True)
|
||||||
with open(f'./config/exec.json', 'w', encoding="utf-8") as f:
|
with open(f'./config/exec.json', 'w', encoding="utf-8") as f:
|
||||||
f.write(json.dumps(settings, indent='\t') )
|
f.write(json.dumps(settings, indent='\t') )
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user