forked from mrq/tortoise-tts
un-hardcoded input/output sampling rates (changing them "works" but naturally leads to wrong audio)
This commit is contained in:
parent 55058675d2
commit f7274112c3

app.py | 16
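
Before this change, the 22050 Hz conditioning/input rate and the 24000 Hz diffusion/vocoder output rate were hardcoded throughout app.py and the tortoise helpers; this commit threads them through as `input_sample_rate` / `output_sample_rate` parameters on `TextToSpeech` and the functions it calls. A minimal usage sketch under that assumption (the import path mirrors the existing `tortoise.api` module; the second call is only illustrative, since the pretrained models still expect the default rates):

    from tortoise.api import TextToSpeech

    # Defaults match the previously hardcoded rates, so existing behaviour is unchanged.
    tts = TextToSpeech(minor_optimizations=True)  # input_sample_rate=22050, output_sample_rate=24000

    # The rates can now be overridden, but as the commit message notes, changing them
    # only nominally "works": the pretrained models were trained at 22.05 kHz / 24 kHz,
    # so other values produce wrong-sounding audio.
    tts_custom = TextToSpeech(
        minor_optimizations=True,
        input_sample_rate=22050,
        output_sample_rate=24000,
    )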
@@ -27,7 +27,7 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
     if voice == "microphone":
         if mic_audio is None:
             raise gr.Error("Please provide audio from mic when choosing `microphone` as a voice input")
-        mic = load_audio(mic_audio, 22050)
+        mic = load_audio(mic_audio, tts.input_sample_rate)
         voice_samples, conditioning_latents = [mic], None
     else:
         progress(0, desc="Loading voice...")
@@ -105,14 +105,14 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
             }
 
             os.makedirs(f'{outdir}/candidate_{j}', exist_ok=True)
-            torchaudio.save(f'{outdir}/candidate_{j}/result_{line}.wav', audio, 24000)
+            torchaudio.save(f'{outdir}/candidate_{j}/result_{line}.wav', audio, tts.output_sample_rate)
         else:
             audio = gen.squeeze(0).cpu()
             audio_cache[f"result_{line}.wav"] = {
                 'audio': audio,
                 'text': cut_text,
             }
             torchaudio.save(f'{outdir}/result_{line}.wav', audio, 24000)
+            torchaudio.save(f'{outdir}/result_{line}.wav', audio, tts.output_sample_rate)
 
     output_voice = None
     if len(texts) > 1:
@@ -126,7 +126,7 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
             audio_clips.append(audio)
 
         audio = torch.cat(audio_clips, dim=-1)
-        torchaudio.save(f'{outdir}/combined_{candidate}.wav', audio, 24000)
+        torchaudio.save(f'{outdir}/combined_{candidate}.wav', audio, tts.output_sample_rate)
 
         audio = audio.squeeze(0).cpu()
         audio_cache[f'combined_{candidate}.wav'] = {
@@ -143,7 +143,7 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
         output_voice = gen
 
     if output_voice is not None:
-        output_voice = (24000, output_voice.numpy())
+        output_voice = (tts.output_sample_rate, output_voice.numpy())
 
     info = {
         'text': text,
@@ -179,7 +179,7 @@ def generate(text, delimiter, emotion, prompt, voice, mic_audio, seed, candidate
         metadata.save()
 
     if sample_voice is not None:
-        sample_voice = (22050, sample_voice.squeeze().cpu().numpy())
+        sample_voice = (tts.input_sample_rate, sample_voice.squeeze().cpu().numpy())
 
     print(f"Generation took {info['time']} seconds, saved to '{outdir}'\n")
 
@@ -514,6 +514,8 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     print("Initializating TorToiSe...")
-    tts = TextToSpeech(minor_optimizations=not args.low_vram)
+    tts = TextToSpeech(
+        minor_optimizations=not args.low_vram,
+    )
 
     main()
@@ -114,7 +114,7 @@ def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusi
                            conditioning_free=cond_free, conditioning_free_k=cond_free_k)
 
 
-def format_conditioning(clip, cond_length=132300, device='cuda'):
+def format_conditioning(clip, cond_length=132300, device='cuda', sampling_rate=22050):
     """
     Converts the given conditioning signal to a MEL spectrogram and clips it as expected by the models.
     """
@@ -124,7 +124,7 @@ def format_conditioning(clip, cond_length=132300, device='cuda'):
     elif gap > 0:
         rand_start = random.randint(0, gap)
         clip = clip[:, rand_start:rand_start + cond_length]
-    mel_clip = TorchMelSpectrogram()(clip.unsqueeze(0)).squeeze(0)
+    mel_clip = TorchMelSpectrogram(sampling_rate=sample_rate)(clip.unsqueeze(0)).squeeze(0)
     return mel_clip.unsqueeze(0).to(device)
 
 
@@ -158,12 +158,12 @@ def fix_autoregressive_output(codes, stop_token, complain=True):
     return codes
 
 
-def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, progress=None, desc=None, sampler="P"):
+def do_spectrogram_diffusion(diffusion_model, diffuser, latents, conditioning_latents, temperature=1, verbose=True, progress=None, desc=None, sampler="P", input_sample_rate=22050, output_sample_rate=24000):
     """
     Uses the specified diffusion model to convert discrete codes into a spectrogram.
     """
     with torch.no_grad():
-        output_seq_len = latents.shape[1] * 4 * 24000 // 22050  # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
+        output_seq_len = latents.shape[1] * 4 * output_sample_rate // input_sample_rate  # This diffusion model converts from 22kHz spectrogram codes to a 24kHz spectrogram signal.
         output_shape = (latents.shape[0], 100, output_seq_len)
         precomputed_embeddings = diffusion_model.timestep_independent(latents, conditioning_latents, output_seq_len, False)
 
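
The `output_seq_len` line above is the only place the two rates interact: each latent frame is expanded by a factor of 4 and then rescaled by `output_sample_rate / input_sample_rate`. A small worked check, with an assumed latent length of 500 frames (the 500 is hypothetical, chosen only to make the arithmetic concrete):

    input_sample_rate, output_sample_rate = 22050, 24000  # the new defaults
    num_latent_frames = 500                               # stand-in for latents.shape[1]
    output_seq_len = num_latent_frames * 4 * output_sample_rate // input_sample_rate
    print(output_seq_len)                                 # 2176: 2000 frames at the 22.05 kHz code rate, stretched to 24 kHz timing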
@@ -214,7 +214,7 @@ class TextToSpeech:
     Main entry point into Tortoise.
     """
 
-    def __init__(self, autoregressive_batch_size=None, models_dir=MODELS_DIR, enable_redaction=True, device=None, minor_optimizations=True):
+    def __init__(self, autoregressive_batch_size=None, models_dir=MODELS_DIR, enable_redaction=True, device=None, minor_optimizations=True, input_sample_rate=22050, output_sample_rate=24000):
         """
         Constructor
         :param autoregressive_batch_size: Specifies how many samples to generate per batch. Lower this if you are seeing
@@ -234,7 +234,10 @@ class TextToSpeech:
         if device is None:
             device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
+        self.input_sample_rate = input_sample_rate
+        self.output_sample_rate = output_sample_rate
         self.minor_optimizations = minor_optimizations
 
         self.models_dir = models_dir
         self.autoregressive_batch_size = pick_best_batch_size_for_gpu() if autoregressive_batch_size is None or autoregressive_batch_size == 0 else autoregressive_batch_size
         self.enable_redaction = enable_redaction
@@ -306,7 +309,7 @@ class TextToSpeech:
         if not isinstance(voice_samples, list):
             voice_samples = [voice_samples]
         for vs in voice_samples:
-            auto_conds.append(format_conditioning(vs, device=self.device))
+            auto_conds.append(format_conditioning(vs, device=self.device, sampling_rate=self.input_sample_rate))
 
         auto_conds = torch.stack(auto_conds, dim=1)
 
@@ -315,7 +318,8 @@ class TextToSpeech:
         samples = []  # resample in its own pass to make things easier
         for sample in voice_samples:
             # The diffuser operates at a sample rate of 24000 (except for the latent inputs)
-            samples.append(torchaudio.functional.resample(sample, 22050, 24000))
+            #samples.append(torchaudio.functional.resample(sample, 22050, 24000))
+            samples.append(torchaudio.functional.resample(sample, self.input_sample_rate, self.output_sample_rate))
 
         if chunk_size is None:
             for sample in tqdm_override(samples, verbose=verbose and len(samples) > 1, progress=progress if len(samples) > 1 else None, desc="Calculating size of best fit..."):
@@ -582,7 +586,8 @@ class TextToSpeech:
                     break
 
                 mel = do_spectrogram_diffusion(self.diffusion, diffuser, latents, diffusion_conditioning,
-                    temperature=diffusion_temperature, verbose=verbose, progress=progress, desc="Transforming autoregressive outputs into audio..", sampler=diffusion_sampler)
+                    temperature=diffusion_temperature, verbose=verbose, progress=progress, desc="Transforming autoregressive outputs into audio..", sampler=diffusion_sampler,
+                    input_sample_rate=self.input_sample_rate, output_sample_rate=self.output_sample_rate)
                 wav = self.vocoder.inference(mel)
                 wav_candidates.append(wav.cpu())
 
@@ -592,7 +597,7 @@ class TextToSpeech:
 
         def potentially_redact(clip, text):
             if self.enable_redaction:
-                return self.aligner.redact(clip.squeeze(1), text).unsqueeze(1)
+                return self.aligner.redact(clip.squeeze(1), text, self.output_sample_rate).unsqueeze(1)
             return clip
         wav_candidates = [potentially_redact(wav_candidate, text) for wav_candidate in wav_candidates]
 
@@ -97,7 +97,7 @@ def get_voices(extra_voice_dirs=[]):
     return voices
 
 
-def load_voice(voice, extra_voice_dirs=[], load_latents=True):
+def load_voice(voice, extra_voice_dirs=[], load_latents=True, sample_rate=22050):
     if voice == 'random':
         return None, None
 
@@ -125,7 +125,7 @@ def load_voice(voice, extra_voice_dirs=[], load_latents=True):
 
         conds = []
         for cond_path in voices:
-            c = load_audio(cond_path, 22050)
+            c = load_audio(cond_path, sample_rate)
             conds.append(c)
         return conds, None
 
@@ -197,8 +197,8 @@ class TacotronSTFT(torch.nn.Module):
         return mel_output
 
 
-def wav_to_univnet_mel(wav, do_normalization=False, device='cuda'):
-    stft = TacotronSTFT(1024, 256, 1024, 100, 24000, 0, 12000)
+def wav_to_univnet_mel(wav, do_normalization=False, device='cuda', sample_rate=24000):
+    stft = TacotronSTFT(1024, 256, 1024, 100, sample_rate, 0, 12000)
     stft = stft.to(device)
     mel = stft.mel_spectrogram(wav)
     if do_normalization:
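
`wav_to_univnet_mel` previously built its `TacotronSTFT` with a fixed 24000 Hz rate; the new `sample_rate` argument defaults to that value, so existing callers behave the same. The diff does not show any call site being updated, so the following is only a hypothetical sketch of how a caller holding a configured `TextToSpeech` instance might pass the rate through:

    # Hypothetical call: 'wav' is assumed to be a waveform tensor already at tts.output_sample_rate.
    mel = wav_to_univnet_mel(
        wav,
        do_normalization=False,
        device='cuda',
        sample_rate=tts.output_sample_rate,  # defaults to 24000, the previously hardcoded value
    )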