diff --git a/README.md b/README.md index bb8e5f9..55b83ac 100755 --- a/README.md +++ b/README.md @@ -115,6 +115,8 @@ To save you from headaches, I strongly recommend playing around with shorter sen As a quick optimization, I modified the script to where the `conditional_latents` are saved after loading voice samples, and subsequent uses will load that file directly (at the cost of not returning the `Sample voice` to the web UI). If there's voice samples that have a modification time newer than this cached file, it'll skip loading it and load the normal WAVs instead. +**!**NOTE**!**: cached `latents.pth` files generated before 2023.02.05 will be ignored, due to a change in computing the conditional latents. This *should* help bump up voice cloning quality. Apologies for the inconvenience. + ## Example(s) Below are some outputs I deem substantial enough to share. As I continue delving into TorToiSe, I'll supply more examples and the values I use. diff --git a/app.py b/app.py index 75869bf..0f38687 100755 --- a/app.py +++ b/app.py @@ -10,7 +10,9 @@ from tortoise.api import TextToSpeech from tortoise.utils.audio import load_audio, load_voice, load_voices from tortoise.utils.text import split_and_recombine_text -def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, breathing_room, progress=gr.Progress()): +def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, candidates, num_autoregressive_samples, diffusion_iterations, temperature, diffusion_sampler, breathing_room, experimentals, progress=gr.Progress()): + print(experimentals) + if voice != "microphone": voices = [voice] else: @@ -27,7 +29,7 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c if voice_samples is not None: sample_voice = voice_samples[0] - conditioning_latents = tts.get_conditioning_latents(voice_samples) + conditioning_latents = 
tts.get_conditioning_latents(voice_samples, progress=progress) torch.save(conditioning_latents, os.path.join(f'./tortoise/voices/{voice}/', f'latents.pth')) voice_samples = None else: @@ -54,6 +56,8 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, preset, seed, c 'diffusion_sampler': diffusion_sampler, 'breathing_room': breathing_room, 'progress': progress, + 'half_p': "Half Precision" in experimentals, + 'cond_free': "Conditioning-Free" in experimentals, } if delimiter == "\\n": @@ -216,6 +220,8 @@ def main(): type="value", ) + experimentals = gr.CheckboxGroup(["Half Precision", "Conditioning-Free"], value=[False, True], label="Experimental Flags") + preset.change(fn=update_presets, inputs=preset, outputs=[ @@ -246,7 +252,8 @@ def main(): diffusion_iterations, temperature, diffusion_sampler, - breathing_room + breathing_room, + experimentals, ], outputs=[selected_voice, output_audio, usedSeed], ) diff --git a/start.bat b/start.bat index f784640..33bc1b3 100755 --- a/start.bat +++ b/start.bat @@ -1,3 +1,4 @@ call .\tortoise-venv\Scripts\activate.bat -py .\app.py -deactivate \ No newline at end of file +python .\app.py +deactivate +pause \ No newline at end of file diff --git a/tortoise/api.py b/tortoise/api.py index 37a8ec5..1a003fc 100755 --- a/tortoise/api.py +++ b/tortoise/api.py @@ -284,7 +284,7 @@ class TextToSpeech: if self.minor_optimizations: self.cvvp = self.cvvp.to(self.device) - def get_conditioning_latents(self, voice_samples, return_mels=False): + def get_conditioning_latents(self, voice_samples, return_mels=False, verbose=False, progress=None, enforced_length=102400): """ Transforms one or more voice_samples into a tuple (autoregressive_conditioning_latent, diffusion_conditioning_latent). 
These are expressive learned latents that encode aspects of the provided clips like voice, intonation, and acoustic @@ -303,14 +303,18 @@ class TextToSpeech: auto_conds = torch.stack(auto_conds, dim=1) diffusion_conds = [] - for sample in voice_samples: + + for sample in tqdm_override(voice_samples, verbose=verbose, progress=progress, desc="Computing conditioning latents..."): # The diffuser operates at a sample rate of 24000 (except for the latent inputs) sample = torchaudio.functional.resample(sample, 22050, 24000) - sample = pad_or_truncate(sample, 102400) - cond_mel = wav_to_univnet_mel(sample.to(self.device), do_normalization=False, device=self.device) - diffusion_conds.append(cond_mel) - diffusion_conds = torch.stack(diffusion_conds, dim=1) + chunks = torch.chunk(sample, int(sample.shape[-1] / enforced_length) + 1, dim=1) + + for chunk in chunks: + chunk = pad_or_truncate(chunk, enforced_length) + cond_mel = wav_to_univnet_mel(chunk.to(self.device), do_normalization=False, device=self.device) + diffusion_conds.append(cond_mel) + diffusion_conds = torch.stack(diffusion_conds, dim=1) if self.minor_optimizations: auto_latent = self.autoregressive.get_conditioning(auto_conds) @@ -372,6 +376,7 @@ class TextToSpeech: diffusion_iterations=100, cond_free=True, cond_free_k=2, diffusion_temperature=1.0, diffusion_sampler="P", breathing_room=8, + half_p=False, progress=None, **hf_generate_kwargs): """ @@ -446,55 +451,57 @@ class TextToSpeech: if not self.minor_optimizations: self.autoregressive = self.autoregressive.to(self.device) - for b in tqdm_override(range(num_batches), verbose=verbose, progress=progress, desc="Generating autoregressive samples"): - codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens, - do_sample=True, - top_p=top_p, - temperature=temperature, - num_return_sequences=self.autoregressive_batch_size, - length_penalty=length_penalty, - repetition_penalty=repetition_penalty, - max_generate_length=max_mel_tokens, - 
**hf_generate_kwargs) - padding_needed = max_mel_tokens - codes.shape[1] - codes = F.pad(codes, (0, padding_needed), value=stop_mel_token) - samples.append(codes) + with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p): + for b in tqdm_override(range(num_batches), verbose=verbose, progress=progress, desc="Generating autoregressive samples"): + codes = self.autoregressive.inference_speech(auto_conditioning, text_tokens, + do_sample=True, + top_p=top_p, + temperature=temperature, + num_return_sequences=self.autoregressive_batch_size, + length_penalty=length_penalty, + repetition_penalty=repetition_penalty, + max_generate_length=max_mel_tokens, + **hf_generate_kwargs) + padding_needed = max_mel_tokens - codes.shape[1] + codes = F.pad(codes, (0, padding_needed), value=stop_mel_token) + samples.append(codes) clip_results = [] - if not self.minor_optimizations: - self.autoregressive = self.autoregressive.cpu() - self.clvp = self.clvp.to(self.device) - - if cvvp_amount > 0: - if self.cvvp is None: - self.load_cvvp() + with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=half_p): if not self.minor_optimizations: - self.cvvp = self.cvvp.to(self.device) - - desc="Computing best candidates" - if verbose: - if self.cvvp is None: - desc = "Computing best candidates using CLVP" - else: - desc = f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%" + self.autoregressive = self.autoregressive.cpu() + self.clvp = self.clvp.to(self.device) - for batch in tqdm_override(samples, verbose=verbose, progress=progress, desc=desc): - for i in range(batch.shape[0]): - batch[i] = fix_autoregressive_output(batch[i], stop_mel_token) - if cvvp_amount != 1: - clvp = self.clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False) - if auto_conds is not None and cvvp_amount > 0: - cvvp_accumulator = 0 - for cl in range(auto_conds.shape[1]): - cvvp_accumulator = cvvp_accumulator + 
self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False) - cvvp = cvvp_accumulator / auto_conds.shape[1] - if cvvp_amount == 1: - clip_results.append(cvvp) + if cvvp_amount > 0: + if self.cvvp is None: + self.load_cvvp() + if not self.minor_optimizations: + self.cvvp = self.cvvp.to(self.device) + + desc="Computing best candidates" + if verbose: + if self.cvvp is None: + desc = "Computing best candidates using CLVP" else: - clip_results.append(cvvp * cvvp_amount + clvp * (1-cvvp_amount)) - else: - clip_results.append(clvp) + desc = f"Computing best candidates using CLVP {((1-cvvp_amount) * 100):2.0f}% and CVVP {(cvvp_amount * 100):2.0f}%" + + for batch in tqdm_override(samples, verbose=verbose, progress=progress, desc=desc): + for i in range(batch.shape[0]): + batch[i] = fix_autoregressive_output(batch[i], stop_mel_token) + if cvvp_amount != 1: + clvp = self.clvp(text_tokens.repeat(batch.shape[0], 1), batch, return_loss=False) + if auto_conds is not None and cvvp_amount > 0: + cvvp_accumulator = 0 + for cl in range(auto_conds.shape[1]): + cvvp_accumulator = cvvp_accumulator + self.cvvp(auto_conds[:, cl].repeat(batch.shape[0], 1, 1), batch, return_loss=False) + cvvp = cvvp_accumulator / auto_conds.shape[1] + if cvvp_amount == 1: + clip_results.append(cvvp) + else: + clip_results.append(cvvp * cvvp_amount + clvp * (1-cvvp_amount)) + else: + clip_results.append(clvp) clip_results = torch.cat(clip_results, dim=0) samples = torch.cat(samples, dim=0) best_results = samples[torch.topk(clip_results, k=k).indices] diff --git a/tortoise/utils/audio.py b/tortoise/utils/audio.py index 6a2f77d..b2a5cfd 100755 --- a/tortoise/utils/audio.py +++ b/tortoise/utils/audio.py @@ -108,8 +108,11 @@ def load_voice(voice, extra_voice_dirs=[], load_latents=True): voices = [] latent = None for file in paths: - if file[-4:] == ".pth": + if file == "cond_latents.pth": latent = file + elif file[-4:] == ".pth": + {} + # noop else: voices.append(file) mtime = max(mtime, 
os.path.getmtime(file)) diff --git a/update.bat b/update.bat new file mode 100755 index 0000000..699cd5f --- /dev/null +++ b/update.bat @@ -0,0 +1,7 @@ +git pull +python -m venv tortoise-venv +call .\tortoise-venv\Scripts\activate.bat +python -m pip install --upgrade pip +python -m pip install -r ./requirements.txt +deactivate +pause \ No newline at end of file