uses gitmylo/bark-voice-cloning-HuBERT-quantizer for creating custom voices (it works slightly better than the base method, but is still not very good desu)

2023-07-03 02:46:10 +00:00 · 2023-07-03 02:46:10 +00:00 · 6c3f48efba
commit 6c3f48efba
parent 547e1d1277
2 changed files with 47 additions and 27 deletions
--- a/src/utils.py
+++ b/src/utils.py
@@ -42,9 +42,6 @@ from tortoise.utils.audio import load_audio, load_voice, load_voices, get_voice_
 from tortoise.utils.text import split_and_recombine_text
 from tortoise.utils.device import get_device_name, set_device_name, get_device_count, get_device_vram, get_device_batch_size, do_gc

-from whisper.normalizers.english import EnglishTextNormalizer
-from whisper.normalizers.basic import BasicTextNormalizer
-from whisper.tokenizer import LANGUAGES 

 MODELS['dvae.pth'] = "https://huggingface.co/jbetker/tortoise-tts-v2/resolve/3704aea61678e7e468a06d8eea121dba368a798e/.models/dvae.pth"

@@ -68,6 +65,19 @@ MAX_TRAINING_DURATION = 11.6097505669
 VALLE_ENABLED = False
 BARK_ENABLED = False

+VERBOSE_DEBUG = True
+
+try:
+	from whisper.normalizers.english import EnglishTextNormalizer
+	from whisper.normalizers.basic import BasicTextNormalizer
+	from whisper.tokenizer import LANGUAGES 
+
+	print("Whisper detected")
+except Exception as e:
+	if VERBOSE_DEBUG:
+		print("Error:", e)
+	pass
+
 try:
 	from vall_e.emb.qnt import encode as valle_quantize
 	from vall_e.emb.g2p import encode as valle_phonemize
@@ -76,10 +86,11 @@ try:

 	import soundfile

+	print("VALL-E detected")
 	VALLE_ENABLED = True
 except Exception as e:
-	if False: # args.tts_backend == "vall-e":
-		raise e
+	if VERBOSE_DEBUG:
+		print("Error:", e)
 	pass

 if VALLE_ENABLED:
@@ -93,27 +104,39 @@ try:

 	from scipy.io.wavfile import write as write_wav

+	print("Bark detected")
 	BARK_ENABLED = True
 except Exception as e:
-	if False: # args.tts_backend == "bark":
-		raise e
+	if VERBOSE_DEBUG:
+		print("Error:", e)
 	pass

 if BARK_ENABLED:
 	try:
 		from vocos import Vocos
 		VOCOS_ENABLED = True
+		print("Vocos detected")
 	except Exception as e:
+		if VERBOSE_DEBUG:
+			print("Error:", e)
 		VOCOS_ENABLED = False

 	try:
 		from hubert.hubert_manager import HuBERTManager
+		from hubert.pre_kmeans_hubert import CustomHubert
+		from hubert.customtokenizer import CustomTokenizer
+
+		hubert_manager = HuBERTManager()
+		hubert_manager.make_sure_hubert_installed()
+		hubert_manager.make_sure_tokenizer_installed()

 		HUBERT_ENABLED = True
+		print("HuBERT detected")
 	except Exception as e:
+		if VERBOSE_DEBUG:
+			print("Error:", e)
 		HUBERT_ENABLED = False

-if BARK_ENABLED:
 	TTSES.append('bark')

 	def semantic_to_audio_tokens(
@@ -192,7 +215,7 @@ if BARK_ENABLED:
 			# generate semantic tokens

 			if HUBERT_ENABLED:
-				wav = wav.to(device)
+				wav = wav.to(self.device)

 				# Extract discrete codes from EnCodec
 				with torch.no_grad():
@@ -202,23 +225,20 @@ if BARK_ENABLED:
 				# get seconds of audio
 				seconds = wav.shape[-1] / model.sample_rate

-				hubert_manager = HuBERTManager()
-				hubert_manager.make_sure_hubert_installed()
-				hubert_manager.make_sure_tokenizer_installed()
-
-				from hubert.pre_kmeans_hubert import CustomHubert
-				from hubert.customtokenizer import CustomTokenizer
-
 				# Load the HuBERT model
-				hubert_model = CustomHubert(checkpoint_path='./models/hubert/hubert.pt').to(device)
+				hubert_model = CustomHubert(checkpoint_path='./data/models/hubert/hubert.pt').to(self.device)

 				# Load the CustomTokenizer model
-				tokenizer = CustomTokenizer.load_from_checkpoint('./models/hubert/tokenizer.pth').to(device)
+				tokenizer = CustomTokenizer.load_from_checkpoint('./data/models/hubert/tokenizer.pth').to(self.device)
+
+				semantic_vectors = hubert_model.forward(wav, input_sample_hz=model.sample_rate)
+				semantic_tokens = tokenizer.get_token(semantic_vectors)
+
+				# move codes to cpu
+				codes = codes.cpu().numpy()
+				# move semantic tokens to cpu
+				semantic_tokens = semantic_tokens.cpu().numpy()
 			else:
-				# Load and pre-process the audio waveform
-				model = load_codec_model(use_gpu=True)
-				wav, sr = torchaudio.load(audio_filepath)
-				wav = convert_audio(wav, sr, model.sample_rate, model.channels)
 				wav = wav.unsqueeze(0).to(self.device)

 				# Extract discrete codes from EnCodec
@@ -1358,6 +1378,10 @@ def compute_latents(voice=None, voice_samples=None, voice_latents_chunks=0, orig
 	if hasattr(tts, "loading") and tts.loading:
 		raise Exception("TTS is still initializing...")

+	if args.tts_backend == "bark":
+		tts.create_voice( voice )
+		return
+
 	if args.autoregressive_model == "auto":
 		tts.load_autoregressive_model(deduce_autoregressive_model(voice))

--- a/src/webui.py
+++ b/src/webui.py
@@ -169,10 +169,6 @@ def reset_generate_settings_proxy():
 	return tuple(res)

 def compute_latents_proxy(voice, voice_latents_chunks, original_ar, original_diffusion, progress=gr.Progress(track_tqdm=True)):
-	if args.tts_backend == "bark":
-		global tts
-		tts.create_voice( voice )
-		return voice
 	compute_latents( voice=voice, voice_latents_chunks=voice_latents_chunks, original_ar=original_ar, original_diffusion=original_diffusion )
 	return voice