Uses gitmylo/bark-voice-cloning-HuBERT-quantizer for creating custom voices (it works slightly better than the base method, but is still not very good desu)
This commit is contained in:
parent
547e1d1277
commit
6c3f48efba
68
src/utils.py
68
src/utils.py
|
@ -42,9 +42,6 @@ from tortoise.utils.audio import load_audio, load_voice, load_voices, get_voice_
|
||||||
from tortoise.utils.text import split_and_recombine_text
|
from tortoise.utils.text import split_and_recombine_text
|
||||||
from tortoise.utils.device import get_device_name, set_device_name, get_device_count, get_device_vram, get_device_batch_size, do_gc
|
from tortoise.utils.device import get_device_name, set_device_name, get_device_count, get_device_vram, get_device_batch_size, do_gc
|
||||||
|
|
||||||
from whisper.normalizers.english import EnglishTextNormalizer
|
|
||||||
from whisper.normalizers.basic import BasicTextNormalizer
|
|
||||||
from whisper.tokenizer import LANGUAGES
|
|
||||||
|
|
||||||
MODELS['dvae.pth'] = "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/3704aea61678e7e468a06d8eea121dba368a798e/.models/dvae.pth"
|
MODELS['dvae.pth'] = "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/3704aea61678e7e468a06d8eea121dba368a798e/.models/dvae.pth"
|
||||||
|
|
||||||
|
@ -68,6 +65,19 @@ MAX_TRAINING_DURATION = 11.6097505669
|
||||||
VALLE_ENABLED = False
|
VALLE_ENABLED = False
|
||||||
BARK_ENABLED = False
|
BARK_ENABLED = False
|
||||||
|
|
||||||
|
VERBOSE_DEBUG = True
|
||||||
|
|
||||||
|
try:
|
||||||
|
from whisper.normalizers.english import EnglishTextNormalizer
|
||||||
|
from whisper.normalizers.basic import BasicTextNormalizer
|
||||||
|
from whisper.tokenizer import LANGUAGES
|
||||||
|
|
||||||
|
print("Whisper detected")
|
||||||
|
except Exception as e:
|
||||||
|
if VERBOSE_DEBUG:
|
||||||
|
print("Error:", e)
|
||||||
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from vall_e.emb.qnt import encode as valle_quantize
|
from vall_e.emb.qnt import encode as valle_quantize
|
||||||
from vall_e.emb.g2p import encode as valle_phonemize
|
from vall_e.emb.g2p import encode as valle_phonemize
|
||||||
|
@ -76,10 +86,11 @@ try:
|
||||||
|
|
||||||
import soundfile
|
import soundfile
|
||||||
|
|
||||||
|
print("VALL-E detected")
|
||||||
VALLE_ENABLED = True
|
VALLE_ENABLED = True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if False: # args.tts_backend == "vall-e":
|
if VERBOSE_DEBUG:
|
||||||
raise e
|
print("Error:", e)
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if VALLE_ENABLED:
|
if VALLE_ENABLED:
|
||||||
|
@ -93,27 +104,39 @@ try:
|
||||||
|
|
||||||
from scipy.io.wavfile import write as write_wav
|
from scipy.io.wavfile import write as write_wav
|
||||||
|
|
||||||
|
print("Bark detected")
|
||||||
BARK_ENABLED = True
|
BARK_ENABLED = True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if False: # args.tts_backend == "bark":
|
if VERBOSE_DEBUG:
|
||||||
raise e
|
print("Error:", e)
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if BARK_ENABLED:
|
if BARK_ENABLED:
|
||||||
try:
|
try:
|
||||||
from vocos import Vocos
|
from vocos import Vocos
|
||||||
VOCOS_ENABLED = True
|
VOCOS_ENABLED = True
|
||||||
|
print("Vocos detected")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
if VERBOSE_DEBUG:
|
||||||
|
print("Error:", e)
|
||||||
VOCOS_ENABLED = False
|
VOCOS_ENABLED = False
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from hubert.hubert_manager import HuBERTManager
|
from hubert.hubert_manager import HuBERTManager
|
||||||
|
from hubert.pre_kmeans_hubert import CustomHubert
|
||||||
|
from hubert.customtokenizer import CustomTokenizer
|
||||||
|
|
||||||
|
hubert_manager = HuBERTManager()
|
||||||
|
hubert_manager.make_sure_hubert_installed()
|
||||||
|
hubert_manager.make_sure_tokenizer_installed()
|
||||||
|
|
||||||
HUBERT_ENABLED = True
|
HUBERT_ENABLED = True
|
||||||
|
print("HuBERT detected")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
if VERBOSE_DEBUG:
|
||||||
|
print("Error:", e)
|
||||||
HUBERT_ENABLED = False
|
HUBERT_ENABLED = False
|
||||||
|
|
||||||
if BARK_ENABLED:
|
|
||||||
TTSES.append('bark')
|
TTSES.append('bark')
|
||||||
|
|
||||||
def semantic_to_audio_tokens(
|
def semantic_to_audio_tokens(
|
||||||
|
@ -192,7 +215,7 @@ if BARK_ENABLED:
|
||||||
# generate semantic tokens
|
# generate semantic tokens
|
||||||
|
|
||||||
if HUBERT_ENABLED:
|
if HUBERT_ENABLED:
|
||||||
wav = wav.to(device)
|
wav = wav.to(self.device)
|
||||||
|
|
||||||
# Extract discrete codes from EnCodec
|
# Extract discrete codes from EnCodec
|
||||||
with torch.no_grad():
|
with torch.no_grad():
|
||||||
|
@ -202,23 +225,20 @@ if BARK_ENABLED:
|
||||||
# get seconds of audio
|
# get seconds of audio
|
||||||
seconds = wav.shape[-1] / model.sample_rate
|
seconds = wav.shape[-1] / model.sample_rate
|
||||||
|
|
||||||
hubert_manager = HuBERTManager()
|
|
||||||
hubert_manager.make_sure_hubert_installed()
|
|
||||||
hubert_manager.make_sure_tokenizer_installed()
|
|
||||||
|
|
||||||
from hubert.pre_kmeans_hubert import CustomHubert
|
|
||||||
from hubert.customtokenizer import CustomTokenizer
|
|
||||||
|
|
||||||
# Load the HuBERT model
|
# Load the HuBERT model
|
||||||
hubert_model = CustomHubert(checkpoint_path='./models/hubert/hubert.pt').to(device)
|
hubert_model = CustomHubert(checkpoint_path='./data/models/hubert/hubert.pt').to(self.device)
|
||||||
|
|
||||||
# Load the CustomTokenizer model
|
# Load the CustomTokenizer model
|
||||||
tokenizer = CustomTokenizer.load_from_checkpoint('./models/hubert/tokenizer.pth').to(device)
|
tokenizer = CustomTokenizer.load_from_checkpoint('./data/models/hubert/tokenizer.pth').to(self.device)
|
||||||
|
|
||||||
|
semantic_vectors = hubert_model.forward(wav, input_sample_hz=model.sample_rate)
|
||||||
|
semantic_tokens = tokenizer.get_token(semantic_vectors)
|
||||||
|
|
||||||
|
# move codes to cpu
|
||||||
|
codes = codes.cpu().numpy()
|
||||||
|
# move semantic tokens to cpu
|
||||||
|
semantic_tokens = semantic_tokens.cpu().numpy()
|
||||||
else:
|
else:
|
||||||
# Load and pre-process the audio waveform
|
|
||||||
model = load_codec_model(use_gpu=True)
|
|
||||||
wav, sr = torchaudio.load(audio_filepath)
|
|
||||||
wav = convert_audio(wav, sr, model.sample_rate, model.channels)
|
|
||||||
wav = wav.unsqueeze(0).to(self.device)
|
wav = wav.unsqueeze(0).to(self.device)
|
||||||
|
|
||||||
# Extract discrete codes from EnCodec
|
# Extract discrete codes from EnCodec
|
||||||
|
@ -1358,6 +1378,10 @@ def compute_latents(voice=None, voice_samples=None, voice_latents_chunks=0, orig
|
||||||
if hasattr(tts, "loading") and tts.loading:
|
if hasattr(tts, "loading") and tts.loading:
|
||||||
raise Exception("TTS is still initializing...")
|
raise Exception("TTS is still initializing...")
|
||||||
|
|
||||||
|
if args.tts_backend == "bark":
|
||||||
|
tts.create_voice( voice )
|
||||||
|
return
|
||||||
|
|
||||||
if args.autoregressive_model == "auto":
|
if args.autoregressive_model == "auto":
|
||||||
tts.load_autoregressive_model(deduce_autoregressive_model(voice))
|
tts.load_autoregressive_model(deduce_autoregressive_model(voice))
|
||||||
|
|
||||||
|
|
|
@ -169,10 +169,6 @@ def reset_generate_settings_proxy():
|
||||||
return tuple(res)
|
return tuple(res)
|
||||||
|
|
||||||
def compute_latents_proxy(voice, voice_latents_chunks, original_ar, original_diffusion, progress=gr.Progress(track_tqdm=True)):
|
def compute_latents_proxy(voice, voice_latents_chunks, original_ar, original_diffusion, progress=gr.Progress(track_tqdm=True)):
|
||||||
if args.tts_backend == "bark":
|
|
||||||
global tts
|
|
||||||
tts.create_voice( voice )
|
|
||||||
return voice
|
|
||||||
compute_latents( voice=voice, voice_latents_chunks=voice_latents_chunks, original_ar=original_ar, original_diffusion=original_diffusion )
|
compute_latents( voice=voice, voice_latents_chunks=voice_latents_chunks, original_ar=original_ar, original_diffusion=original_diffusion )
|
||||||
return voice
|
return voice
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user