# tortoise-tts/tortoise/utils/audio.py

import os
from glob import glob

import librosa
import torch
import torchaudio
import numpy as np
from scipy.io.wavfile import read

from tortoise.utils.stft import STFT

def get_voice_dir():
    """Return the directory that holds the voice folders, creating it if needed.

    A 'voices' directory one level above this module's directory is used when
    it already exists; otherwise a 'voices' directory relative to the current
    working directory is used (and created when missing).
    """
    module_dir = os.path.dirname(os.path.realpath(__file__))
    bundled = os.path.join(module_dir, '../voices')
    voice_dir = bundled if os.path.exists(bundled) else os.path.dirname('./voices/')

    os.makedirs(voice_dir, exist_ok=True)
    return voice_dir

def load_audio(audiopath, sampling_rate):
    """Load a '.wav' or '.mp3' file as a single-channel waveform clipped to [-1, 1].

    PARAMS
    ------
    audiopath: str or os.PathLike naming the file. The extension check is
        case-insensitive ('.WAV' is accepted as well as '.wav').
    sampling_rate: sampling rate the returned audio should have; audio stored
        at a different rate is resampled.

    RETURNS
    -------
    torch tensor holding the first channel of the file, with one extra
    leading dimension added by unsqueeze(0).

    RAISES
    ------
    AssertionError: the extension is neither '.wav' nor '.mp3', or the
        decoded audio has an unexpected channel layout.
    """
    # Accept pathlib.Path and friends; slicing below needs a plain string.
    path = os.fspath(audiopath)
    extension = path[-4:].lower()

    if extension == '.wav':
        audio, lsr = torchaudio.load(path)
    elif extension == '.mp3':
        audio, lsr = librosa.load(path, sr=sampling_rate)
        audio = torch.FloatTensor(audio)
    else:
        # Raised explicitly (not `assert False`) so the check survives `python -O`;
        # otherwise execution would fall through with `audio` unbound.
        raise AssertionError(f"Unsupported audio format provided: {path[-4:]}")

    # Remove any channel data: keep only the first channel, whichever axis
    # the (small) channel dimension is on.
    if len(audio.shape) > 1:
        if audio.shape[0] < 5:
            audio = audio[0]
        else:
            if not audio.shape[1] < 5:
                raise AssertionError(
                    f"Cannot tell which axis of {path} holds the channels: shape={tuple(audio.shape)}")
            audio = audio[:, 0]

    if lsr != sampling_rate:
        audio = torchaudio.functional.resample(audio, lsr, sampling_rate)

    # Check some assumptions about audio range. The decoders are expected to
    # hand back floats roughly within [-1, 1]; squawk when that clearly is not
    # the case (values far above 1, or no negative samples at all).
    # '2' is arbitrarily chosen since it seems like audio will often "overdrive" the [-1,1] bounds.
    if audio.numel() == 0:
        # max()/min() raise on an empty tensor, so report this case separately.
        print(f"Error with {audiopath}. Audio contains no samples.")
    elif torch.any(audio > 2) or not torch.any(audio < 0):
        print(f"Error with {audiopath}. Max={audio.max()} min={audio.min()}")
    audio.clip_(-1, 1)

    return audio.unsqueeze(0)


# Upper and lower bounds of the mel value range. normalize_tacotron_mel maps
# MIN -> -1 and MAX -> 1; denormalize_tacotron_mel applies the inverse mapping.
TACOTRON_MEL_MAX = 2.3143386840820312
# Approximately log(1e-5), which matches the floor dynamic_range_compression
# produces with its default clip_val=1e-5 and C=1.
TACOTRON_MEL_MIN = -11.512925148010254


def denormalize_tacotron_mel(norm_mel):
    """Map a mel normalized to [-1, 1] back onto [TACOTRON_MEL_MIN, TACOTRON_MEL_MAX].

    Inverse of normalize_tacotron_mel.
    """
    unit_scaled = (norm_mel + 1) / 2  # [-1, 1] -> [0, 1]
    mel_span = TACOTRON_MEL_MAX - TACOTRON_MEL_MIN
    return unit_scaled * mel_span + TACOTRON_MEL_MIN


def normalize_tacotron_mel(mel):
    """Linearly rescale a mel so TACOTRON_MEL_MIN maps to -1 and TACOTRON_MEL_MAX to 1.

    Values outside that range are not clipped; they simply land outside [-1, 1].
    """
    mel_span = TACOTRON_MEL_MAX - TACOTRON_MEL_MIN
    unit_scaled = (mel - TACOTRON_MEL_MIN) / mel_span  # MIN..MAX -> 0..1
    return 2 * unit_scaled - 1


def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """
    Log-compress `x` after flooring it at `clip_val`.

    PARAMS
    ------
    x: tensor to compress
    C: compression factor, multiplied in before the log is taken
    clip_val: lower bound applied to x so the log never sees values below it
    """
    floored = torch.clamp(x, min=clip_val)
    scaled = floored * C
    return torch.log(scaled)


def dynamic_range_decompression(x, C=1):
    """
    Undo dynamic_range_compression: exponentiate, then divide out the factor.

    PARAMS
    ------
    x: log-compressed tensor
    C: compression factor used to compress
    """
    expanded = torch.exp(x)
    return expanded / C


def get_voices(extra_voice_dirs=None):
    """Map every available voice name to the files found in its folder.

    Each immediate sub-directory of the directory returned by get_voice_dir()
    and of every directory in `extra_voice_dirs` is treated as one voice,
    named after the sub-directory.

    PARAMS
    ------
    extra_voice_dirs: optional iterable of additional directories to scan
        (a single str / os.PathLike is also accepted). None means "no extras".

    RETURNS
    -------
    dict of voice name -> list of file paths, '*.wav' matches first, then
    '*.mp3', then '*.pth'. When the same voice name occurs in more than one
    directory, the directory scanned last replaces the earlier entry.
    """
    # Function-scope import: the module only imports `glob.glob` itself.
    from glob import escape as glob_escape

    if extra_voice_dirs is None:
        extras = []
    elif isinstance(extra_voice_dirs, (str, os.PathLike)):
        # A lone path would otherwise be iterated character by character.
        extras = [extra_voice_dirs]
    else:
        extras = list(extra_voice_dirs)

    voices = {}
    for root in [get_voice_dir()] + extras:
        for name in os.listdir(root):
            voice_path = os.path.join(root, name)
            if not os.path.isdir(voice_path):
                continue
            # Escape glob metacharacters in the directory part so a folder
            # such as "narrator [v2]" is matched literally instead of being
            # read as a character class (which would match no files).
            pattern_root = glob_escape(voice_path)
            voices[name] = [
                match
                for extension in ('wav', 'mp3', 'pth')
                for match in glob(f'{pattern_root}/*.{extension}')
            ]
    return voices


def load_voice(voice, extra_voice_dirs=None, load_latents=True, sample_rate=22050, device='cpu'):
    """Load one voice either as cached conditioning latents or as raw audio clips.

    PARAMS
    ------
    voice: name of the voice folder, or 'random'
    extra_voice_dirs: optional additional directories passed on to get_voices
    load_latents: when True, a 'cond_latents.pth' file in the voice folder is
        used instead of the audio, provided it is newer than every audio file
    sample_rate: sampling rate handed to load_audio for each clip
    device: map_location used when loading the latents file

    RETURNS
    -------
    (None, None)      for voice == 'random'
    (None, latents)   when an up-to-date latents file was loaded
    (samples, None)   otherwise, where samples is a list of load_audio results

    RAISES
    ------
    KeyError: `voice` is not among the voices reported by get_voices.
    """
    if voice == 'random':
        return None, None

    # Always hand get_voices a concrete value rather than None.
    available = get_voices([] if extra_voice_dirs is None else extra_voice_dirs)
    paths = available[voice]

    audio_paths = []
    latent = None
    for file in paths:
        if file.endswith("cond_latents.pth"):
            latent = file
        elif not file.endswith(".pth"):
            audio_paths.append(file)
        # Any other .pth file is deliberately ignored.

    # Modification time of the newest audio clip; 0 when there is no audio,
    # so an existing latents file always counts as up to date in that case.
    newest_audio = max((os.path.getmtime(path) for path in audio_paths), default=0)

    if load_latents and latent is not None:
        if os.path.getmtime(latent) > newest_audio:
            print(f"Reading from latent: {latent}")
            # NOTE(security): depending on the torch version's `weights_only`
            # default, torch.load may unpickle arbitrary objects. Only place
            # latents files from trusted sources in a voice folder.
            return None, torch.load(latent, map_location=device)
        print(f"Latent file out of date: {latent}")

    samples = [load_audio(path, sample_rate) for path in audio_paths]
    return samples, None


def load_voices(voices, extra_voice_dirs=None):
    """Load several voices and combine them into one set of clips or latents.

    PARAMS
    ------
    voices: sequence of voice names; may contain 'random'
    extra_voice_dirs: optional additional voice directories passed on to load_voice

    RETURNS
    -------
    (None, None)     when 'random' is among the requested voices
    (clips, None)    when every voice was loaded as raw audio (clips is the
                     concatenation of the per-voice sample lists)
    (None, latents)  when every voice was loaded from latents; latents is a
                     2-tuple holding the element-wise mean over the voices of
                     item [0] and of item [1] of each loaded latent

    RAISES
    ------
    AssertionError: raw-audio voices and latent voices were mixed.
    """
    # Decide about 'random' up front so no other voice is read from disk
    # only to be thrown away.
    if 'random' in voices:
        if len(voices) > 1:
            print("Cannot combine a random voice with a non-random voice. Just using a random voice.")
        return None, None

    # Always hand load_voice a concrete value rather than None.
    search_dirs = [] if extra_voice_dirs is None else extra_voice_dirs
    mix_error = "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."

    latents = []
    clips = []
    for voice in voices:
        clip, latent = load_voice(voice, search_dirs)
        # Explicit raises instead of `assert` so the checks survive `python -O`.
        if latent is None:
            if latents:
                raise AssertionError(mix_error)
            clips.extend(clip)
        elif clip is None:
            if clips:
                raise AssertionError(mix_error)
            latents.append(latent)

    if not latents:
        return clips, None

    # torch.stack does not accept the latent pairs directly, so each half is
    # stacked and averaged on its own.
    latents_0 = torch.stack([pair[0] for pair in latents], dim=0).mean(dim=0)
    latents_1 = torch.stack([pair[1] for pair in latents], dim=0).mean(dim=0)
    return None, (latents_0, latents_1)


class TacotronSTFT(torch.nn.Module):
    """Turns waveforms into log-compressed mel spectrograms.

    Combines an STFT module with a librosa mel filterbank that is stored as
    the (non-trainable) buffer `mel_basis`.
    """

    def __init__(self, filter_length=1024, hop_length=256, win_length=1024,
                 n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,
                 mel_fmax=8000.0):
        super().__init__()
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.stft_fn = STFT(filter_length, hop_length, win_length)

        from librosa.filters import mel as librosa_mel_fn
        filterbank = librosa_mel_fn(
            sr=sampling_rate,
            n_fft=filter_length,
            n_mels=n_mel_channels,
            fmin=mel_fmin,
            fmax=mel_fmax,
        )
        # Registered as a buffer so it follows the module across .to(device).
        self.register_buffer('mel_basis', torch.from_numpy(filterbank).float())

    def spectral_normalize(self, magnitudes):
        """Apply dynamic_range_compression with its default parameters."""
        return dynamic_range_compression(magnitudes)

    def spectral_de_normalize(self, magnitudes):
        """Apply dynamic_range_decompression with its default parameters."""
        return dynamic_range_decompression(magnitudes)

    def mel_spectrogram(self, y):
        """Computes mel-spectrograms from a batch of waves
        PARAMS
        ------
        y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]

        RETURNS
        -------
        mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)
        """
        # Sanity check: anything beyond +/-10 is treated as a caller error
        # rather than audio that merely overshoots [-1, 1].
        assert y.data.min() >= -10
        assert y.data.max() <= 10
        clipped = y.clamp(min=-1, max=1)

        magnitudes, _ = self.stft_fn.transform(clipped)
        mel = torch.matmul(self.mel_basis, magnitudes.data)
        return self.spectral_normalize(mel)


def wav_to_univnet_mel(wav, do_normalization=False, device='cpu', sample_rate=24000):
    """Compute a 100-channel mel spectrogram of `wav` with a TacotronSTFT.

    PARAMS
    ------
    wav: waveform batch passed straight to TacotronSTFT.mel_spectrogram
    do_normalization: when True, the result is passed through normalize_tacotron_mel
    device: device the TacotronSTFT module is moved to before use
    sample_rate: sampling rate the mel filterbank is built for
    """
    extractor = TacotronSTFT(
        filter_length=1024,
        hop_length=256,
        win_length=1024,
        n_mel_channels=100,
        sampling_rate=sample_rate,
        mel_fmin=0,
        mel_fmax=12000,
    ).to(device)
    mel = extractor.mel_spectrogram(wav)
    return normalize_tacotron_mel(mel) if do_normalization else mel
update 2022-04-15 14:26:11 +00:00			`import os`
			`from glob import glob`

Use librosa for loading mp3s 2022-05-04 02:44:31 +00:00			`import librosa`
Initial commit 2022-01-28 06:19:29 +00:00			`import torch`
			`import torchaudio`
Update docs 2022-02-04 05:18:21 +00:00			`import numpy as np`
			`from scipy.io.wavfile import read`
Initial commit 2022-01-28 06:19:29 +00:00
Move everything into the tortoise/ subdirectory For eventual packaging. 2022-05-01 22:24:24 +00:00			`from tortoise.utils.stft import STFT`
Modifications to support "v1.5" 2022-03-22 17:52:46 +00:00
Moved voices out of the tortoise folder because it kept being processed for setup.py 2023-02-10 20:11:56 +00:00			`def get_voice_dir():`
added reset generation settings to default button, revamped utilities tab to double as plain jane voice importer (and runs through voicefixer despite it not really doing anything if your voice samples are already of decent quality anyways), ditched load_wav_to_torch or whatever it was called because it literally exists as torchaudio.load, sample voice is now a combined waveform of all your samples and will always return even if using a latents file 2023-02-14 21:20:04 +00:00			`target = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../voices')`
			`if not os.path.exists(target):`
			`target = os.path.dirname('./voices/')`

			`os.makedirs(target, exist_ok=True)`
Initial commit 2022-01-28 06:19:29 +00:00
added reset generation settings to default button, revamped utilities tab to double as plain jane voice importer (and runs through voicefixer despite it not really doing anything if your voice samples are already of decent quality anyways), ditched load_wav_to_torch or whatever it was called because it literally exists as torchaudio.load, sample voice is now a combined waveform of all your samples and will always return even if using a latents file 2023-02-14 21:20:04 +00:00			`return target`
Initial commit 2022-01-28 06:19:29 +00:00
			`def load_audio(audiopath, sampling_rate):`
			`if audiopath[-4:] == '.wav':`
added reset generation settings to default button, revamped utilities tab to double as plain jane voice importer (and runs through voicefixer despite it not really doing anything if your voice samples are already of decent quality anyways), ditched load_wav_to_torch or whatever it was called because it literally exists as torchaudio.load, sample voice is now a combined waveform of all your samples and will always return even if using a latents file 2023-02-14 21:20:04 +00:00			`audio, lsr = torchaudio.load(audiopath)`
Initial commit 2022-01-28 06:19:29 +00:00			`elif audiopath[-4:] == '.mp3':`
Use librosa for loading mp3s 2022-05-04 02:44:31 +00:00			`audio, lsr = librosa.load(audiopath, sr=sampling_rate)`
Initial commit 2022-01-28 06:19:29 +00:00			`audio = torch.FloatTensor(audio)`
Add error message 2022-05-13 02:15:40 +00:00			`else:`
			`assert False, f"Unsupported audio format provided: {audiopath[-4:]}"`
Initial commit 2022-01-28 06:19:29 +00:00
			`# Remove any channel data.`
			`if len(audio.shape) > 1:`
			`if audio.shape[0] < 5:`
			`audio = audio[0]`
			`else:`
			`assert audio.shape[1] < 5`
			`audio = audio[:, 0]`

			`if lsr != sampling_rate:`
			`audio = torchaudio.functional.resample(audio, lsr, sampling_rate)`

			`# Check some assumptions about audio range. This should be automatically fixed in load_wav_to_torch, but might not be in some edge cases, where we should squawk.`
			`# '2' is arbitrarily chosen since it seems like audio will often "overdrive" the [-1,1] bounds.`
			`if torch.any(audio > 2) or not torch.any(audio < 0):`
			`print(f"Error with {audiopath}. Max={audio.max()} min={audio.min()}")`
			`audio.clip_(-1, 1)`

Modifications to support "v1.5" 2022-03-22 17:52:46 +00:00			`return audio.unsqueeze(0)`


			`TACOTRON_MEL_MAX = 2.3143386840820312`
			`TACOTRON_MEL_MIN = -11.512925148010254`


			`def denormalize_tacotron_mel(norm_mel):`
			`return ((norm_mel+1)/2)*(TACOTRON_MEL_MAX-TACOTRON_MEL_MIN)+TACOTRON_MEL_MIN`


			`def normalize_tacotron_mel(mel):`
			`return 2 * ((mel - TACOTRON_MEL_MIN) / (TACOTRON_MEL_MAX - TACOTRON_MEL_MIN)) - 1`


			`def dynamic_range_compression(x, C=1, clip_val=1e-5):`
			`"""`
			`PARAMS`
			`------`
			`C: compression factor`
			`"""`
			`return torch.log(torch.clamp(x, min=clip_val) * C)`


			`def dynamic_range_decompression(x, C=1):`
			`"""`
			`PARAMS`
			`------`
			`C: compression factor used to compress`
			`"""`
			`return torch.exp(x) / C`


Allow passing additional voice directories when loading voices 2022-05-19 11:35:57 +00:00			`def get_voices(extra_voice_dirs=[]):`
added reset generation settings to default button, revamped utilities tab to double as plain jane voice importer (and runs through voicefixer despite it not really doing anything if your voice samples are already of decent quality anyways), ditched load_wav_to_torch or whatever it was called because it literally exists as torchaudio.load, sample voice is now a combined waveform of all your samples and will always return even if using a latents file 2023-02-14 21:20:04 +00:00			`dirs = [get_voice_dir()] + extra_voice_dirs`
update 2022-04-15 14:26:11 +00:00			`voices = {}`
Allow passing additional voice directories when loading voices 2022-05-19 11:35:57 +00:00			`for d in dirs:`
			`subs = os.listdir(d)`
			`for sub in subs:`
			`subj = os.path.join(d, sub)`
			`if os.path.isdir(subj):`
			`voices[sub] = list(glob(f'{subj}/*.wav')) + list(glob(f'{subj}/*.mp3')) + list(glob(f'{subj}/*.pth'))`
update 2022-04-15 14:26:11 +00:00			`return voices`


owari da... 2023-02-09 01:53:25 +00:00			`def load_voice(voice, extra_voice_dirs=[], load_latents=True, sample_rate=22050, device='cpu'):`
Support totally random voices (and make fixes to previous changes) 2022-05-02 21:40:03 +00:00			`if voice == 'random':`
			`return None, None`

Allow passing additional voice directories when loading voices 2022-05-19 11:35:57 +00:00			`voices = get_voices(extra_voice_dirs)`
Add support for extracting and feeding conditioning latents directly into the model - Adds a new script and API endpoints for doing this - Reworks autoregressive and diffusion models so that the conditioning is computed separately (which will actually provide a mild performance boost) - Updates README This is untested. Need to do the following manual tests (and someday write unit tests for this behemoth before it becomes a problem..) 1) Does get_conditioning_latents.py work? 2) Can I feed those latents back into the model by creating a new voice? 3) Can I still mix and match voices (both with conditioning latents and normal voices) with read.py? 2022-05-01 23:25:18 +00:00			`paths = voices[voice]`
Added small optimization with caching latents, dropped Anaconda for just a py3.9 + pip + venv setup, added helper install scripts for such, cleaned up app.py, added flag '--low-vram' to disable minor optimizations 2023-02-04 01:50:57 +00:00
			`mtime = 0`
			`voices = []`
			`latent = None`
			`for file in paths:`
Forgot to rename the cached latents to the new filename 2023-02-05 23:51:52 +00:00			`if file[-16:] == "cond_latents.pth":`
Added small optimization with caching latents, dropped Anaconda for just a py3.9 + pip + venv setup, added helper install scripts for such, cleaned up app.py, added flag '--low-vram' to disable minor optimizations 2023-02-04 01:50:57 +00:00			`latent = file`
modified how conditional latents are computed (before, it just happened to only bother reading the first 102400/24000=4.26 seconds per audio input, now it will chunk it all to compute latents) 2023-02-05 23:25:41 +00:00			`elif file[-4:] == ".pth":`
			`{}`
			`# noop`
Added small optimization with caching latents, dropped Anaconda for just a py3.9 + pip + venv setup, added helper install scripts for such, cleaned up app.py, added flag '--low-vram' to disable minor optimizations 2023-02-04 01:50:57 +00:00			`else:`
			`voices.append(file)`
			`mtime = max(mtime, os.path.getmtime(file))`

			`if load_latents and latent is not None:`
			`if os.path.getmtime(latent) > mtime:`
			`print(f"Reading from latent: {latent}")`
owari da... 2023-02-09 01:53:25 +00:00			`return None, torch.load(latent, map_location=device)`
Added small optimization with caching latents, dropped Anaconda for just a py3.9 + pip + venv setup, added helper install scripts for such, cleaned up app.py, added flag '--low-vram' to disable minor optimizations 2023-02-04 01:50:57 +00:00			`print(f"Latent file out of date: {latent}")`

added reset generation settings to default button, revamped utilities tab to double as plain jane voice importer (and runs through voicefixer despite it not really doing anything if your voice samples are already of decent quality anyways), ditched load_wav_to_torch or whatever it was called because it literally exists as torchaudio.load, sample voice is now a combined waveform of all your samples and will always return even if using a latents file 2023-02-14 21:20:04 +00:00			`samples = []`
			`for path in voices:`
			`c = load_audio(path, sample_rate)`
			`samples.append(c)`
			`return samples, None`
Add support for extracting and feeding conditioning latents directly into the model - Adds a new script and API endpoints for doing this - Reworks autoregressive and diffusion models so that the conditioning is computed separately (which will actually provide a mild performance boost) - Updates README This is untested. Need to do the following manual tests (and someday write unit tests for this behemoth before it becomes a problem..) 1) Does get_conditioning_latents.py work? 2) Can I feed those latents back into the model by creating a new voice? 3) Can I still mix and match voices (both with conditioning latents and normal voices) with read.py? 2022-05-01 23:25:18 +00:00

Allow passing additional voice directories when loading voices 2022-05-19 11:35:57 +00:00			`def load_voices(voices, extra_voice_dirs=[]):`
Add support for extracting and feeding conditioning latents directly into the model - Adds a new script and API endpoints for doing this - Reworks autoregressive and diffusion models so that the conditioning is computed separately (which will actually provide a mild performance boost) - Updates README This is untested. Need to do the following manual tests (and someday write unit tests for this behemoth before it becomes a problem..) 1) Does get_conditioning_latents.py work? 2) Can I feed those latents back into the model by creating a new voice? 3) Can I still mix and match voices (both with conditioning latents and normal voices) with read.py? 2022-05-01 23:25:18 +00:00			`latents = []`
			`clips = []`
			`for voice in voices:`
more fixes 2022-05-02 22:44:47 +00:00			`if voice == 'random':`
Add tortoise_cli.py 2022-05-28 05:25:23 +00:00			`if len(voices) > 1:`
			`print("Cannot combine a random voice with a non-random voice. Just using a random voice.")`
more fixes 2022-05-02 22:44:47 +00:00			`return None, None`
Allow passing additional voice directories when loading voices 2022-05-19 11:35:57 +00:00			`clip, latent = load_voice(voice, extra_voice_dirs)`
Add support for extracting and feeding conditioning latents directly into the model - Adds a new script and API endpoints for doing this - Reworks autoregressive and diffusion models so that the conditioning is computed separately (which will actually provide a mild performance boost) - Updates README This is untested. Need to do the following manual tests (and someday write unit tests for this behemoth before it becomes a problem..) 1) Does get_conditioning_latents.py work? 2) Can I feed those latents back into the model by creating a new voice? 3) Can I still mix and match voices (both with conditioning latents and normal voices) with read.py? 2022-05-01 23:25:18 +00:00			`if latent is None:`
			`assert len(latents) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."`
			`clips.extend(clip)`
Fix bug in load_voices in audio.py The read.py script did not work with pth latents, so I fix bug in audio.py. It seems that in the elif statement, instead of voice, voices should be clip, clips. And torch stack doesn't work with tuples, so I had to split this operation. 2022-05-17 15:34:54 +00:00			`elif clip is None:`
			`assert len(clips) == 0, "Can only combine raw audio voices or latent voices, not both. Do it yourself if you want this."`
Add support for extracting and feeding conditioning latents directly into the model - Adds a new script and API endpoints for doing this - Reworks autoregressive and diffusion models so that the conditioning is computed separately (which will actually provide a mild performance boost) - Updates README This is untested. Need to do the following manual tests (and someday write unit tests for this behemoth before it becomes a problem..) 1) Does get_conditioning_latents.py work? 2) Can I feed those latents back into the model by creating a new voice? 3) Can I still mix and match voices (both with conditioning latents and normal voices) with read.py? 2022-05-01 23:25:18 +00:00			`latents.append(latent)`
			`if len(latents) == 0:`
more fixes 2022-05-02 22:44:47 +00:00			`return clips, None`
Add support for extracting and feeding conditioning latents directly into the model - Adds a new script and API endpoints for doing this - Reworks autoregressive and diffusion models so that the conditioning is computed separately (which will actually provide a mild performance boost) - Updates README This is untested. Need to do the following manual tests (and someday write unit tests for this behemoth before it becomes a problem..) 1) Does get_conditioning_latents.py work? 2) Can I feed those latents back into the model by creating a new voice? 3) Can I still mix and match voices (both with conditioning latents and normal voices) with read.py? 2022-05-01 23:25:18 +00:00			`else:`
Fix bug in load_voices in audio.py The read.py script did not work with pth latents, so I fix bug in audio.py. It seems that in the elif statement, instead of voice, voices should be clip, clips. And torch stack doesn't work with tuples, so I had to split this operation. 2022-05-17 15:34:54 +00:00			`latents_0 = torch.stack([l[0] for l in latents], dim=0).mean(dim=0)`
			`latents_1 = torch.stack([l[1] for l in latents], dim=0).mean(dim=0)`
			`latents = (latents_0,latents_1)`
			`return None, latents`
Add support for extracting and feeding conditioning latents directly into the model - Adds a new script and API endpoints for doing this - Reworks autoregressive and diffusion models so that the conditioning is computed separately (which will actually provide a mild performance boost) - Updates README This is untested. Need to do the following manual tests (and someday write unit tests for this behemoth before it becomes a problem..) 1) Does get_conditioning_latents.py work? 2) Can I feed those latents back into the model by creating a new voice? 3) Can I still mix and match voices (both with conditioning latents and normal voices) with read.py? 2022-05-01 23:25:18 +00:00

Modifications to support "v1.5" 2022-03-22 17:52:46 +00:00			`class TacotronSTFT(torch.nn.Module):`
			`def __init__(self, filter_length=1024, hop_length=256, win_length=1024,`
			`n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0,`
			`mel_fmax=8000.0):`
			`super(TacotronSTFT, self).__init__()`
			`self.n_mel_channels = n_mel_channels`
			`self.sampling_rate = sampling_rate`
			`self.stft_fn = STFT(filter_length, hop_length, win_length)`
			`from librosa.filters import mel as librosa_mel_fn`
			`mel_basis = librosa_mel_fn(`
v2.2 2022-05-06 06:11:10 +00:00			`sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax)`
Modifications to support "v1.5" 2022-03-22 17:52:46 +00:00			`mel_basis = torch.from_numpy(mel_basis).float()`
			`self.register_buffer('mel_basis', mel_basis)`

			`def spectral_normalize(self, magnitudes):`
			`output = dynamic_range_compression(magnitudes)`
			`return output`

			`def spectral_de_normalize(self, magnitudes):`
			`output = dynamic_range_decompression(magnitudes)`
			`return output`

			`def mel_spectrogram(self, y):`
			`"""Computes mel-spectrograms from a batch of waves`
			`PARAMS`
			`------`
			`y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1]`

			`RETURNS`
			`-------`
			`mel_output: torch.FloatTensor of shape (B, n_mel_channels, T)`
			`"""`
			`assert(torch.min(y.data) >= -10)`
			`assert(torch.max(y.data) <= 10)`
			`y = torch.clip(y, min=-1, max=1)`

			`magnitudes, phases = self.stft_fn.transform(y)`
			`magnitudes = magnitudes.data`
			`mel_output = torch.matmul(self.mel_basis, magnitudes)`
			`mel_output = self.spectral_normalize(mel_output)`
			`return mel_output`


owari da... 2023-02-09 01:53:25 +00:00			`def wav_to_univnet_mel(wav, do_normalization=False, device='cpu', sample_rate=24000):`
un-hardcoded input output sampling rates (changing them "works" but leads to wrong audio, naturally) 2023-02-07 18:34:29 +00:00			`stft = TacotronSTFT(1024, 256, 1024, 100, sample_rate, 0, 12000)`
Allow running on CPU 2022-06-11 11:03:14 +00:00			`stft = stft.to(device)`
Modifications to support "v1.5" 2022-03-22 17:52:46 +00:00			`mel = stft.mel_spectrogram(wav)`
			`if do_normalization:`
			`mel = normalize_tacotron_mel(mel)`
Fix bug in load_voices in audio.py The read.py script did not work with pth latents, so I fix bug in audio.py. It seems that in the elif statement, instead of voice, voices should be clip, clips. And torch stack doesn't work with tuples, so I had to split this operation. 2022-05-17 15:34:54 +00:00			`return mel`