Deduce the language per line as a cheap way to support cross-lingual switching (kind of)

2024-12-07 22:57:29 -06:00 · 2024-12-07 22:57:29 -06:00 · 0c5a458b00
commit 0c5a458b00
parent a032ff588f
1 changed file with 10 additions and 6 deletions
--- a/vall_e/inference.py
+++ b/vall_e/inference.py
@@ -351,12 +351,6 @@ class TTS():
 		use_lora = sampling_kwargs.pop("use_lora", None)
 		dtype = sampling_kwargs.pop("dtype", self.dtype)
 		amp = sampling_kwargs.pop("amp", self.amp)
-
-		if language == "auto":
-			language = g2p.detect_language( text )
-
-		if not text_language:
-			text_language = language
 		
 		lines = sentence_split(text, split_by=sampling_kwargs.get("split_text_by", "sentences"))

@@ -413,6 +407,8 @@ class TTS():
 		prefix_contexts = []
 		context_history = sampling_kwargs.get("context_history", 0)

+		auto_lang = not language or language == "auto"
+		auto_text_lang = not text_language or text_language == "auto"
 		for line in lines:
 			if out_path is None:
 				output_dir = Path("./data/results/")
@@ -420,6 +416,14 @@ class TTS():
 					output_dir.mkdir(parents=True, exist_ok=True)
 				out_path = output_dir / f"{time.time()}.wav"

+			deduced_language = g2p.detect_language( line ) if auto_lang or auto_text_lang else language
+
+			if auto_lang:
+				language = deduced_language
+
+			if auto_text_lang:
+				text_language = deduced_language
+
 			prom = self.encode_audio( references, trim_length=input_prompt_length ) if references else None
 			phns = self.encode_text( line, language=text_language )
 			lang = self.encode_lang( language )