diff --git a/tortoise_utils.py b/tortoise_utils.py new file mode 100644 index 0000000..109087e --- /dev/null +++ b/tortoise_utils.py @@ -0,0 +1,70 @@ +import re +from tortoise.api import TextToSpeech +from tortoise.utils.audio import load_voice + +def clean_text(text: str, target_len: int = 200, max_len: int = 300) -> list[str]: + # remove double new line, redundant whitespace, convert non-ascii quotes to ascii quotes + text = re.sub(r"\n\n+", r"\n", text) + text = re.sub(r"\s+", r" ", text) + text = re.sub(r"[“”]", '"', text) + + # split text into sentences, keep quotes together + sentences = re.split(r'(?<=[.!?])\s+(?=(?:[^"]*"[^"]*")*[^"]*$)', text) + + # recombine sentences into chunks of desired length + chunks = [] + chunk = "" + for sentence in sentences: + if len(chunk) + len(sentence) > target_len: + chunks.append(chunk) + chunk = "" + chunk += sentence + " " + if len(chunk) > max_len: + chunks.append(chunk) + chunk = "" + if chunk: + chunks.append(chunk) + + # clean up chunks, remove leading/trailing whitespace, remove empty/unless chunks + chunks = [s.strip() for s in chunks] + chunks = [s for s in chunks if s and not re.match(r"^[\s\.,;:!?]*$", s)] + + return chunks + + +def process_textfile(file_path: str) -> list[str]: + with open(file_path, "r", encoding="utf-8") as f: + text = " ".join([l for l in f.readlines()]) + text = clean_text(text) + return text + +def tts(paper_name: str): + # load tts model + # ADD PATH + tts = TextToSpeech( + autoregressive_model_path="./ai-voice-cloning/training/" + ) + voice = "Lex" + voice_samples, conditioning_latents = load_voice( + voice, extra_voice_dirs="./ai-voice-cloning/voices" + ) + + # process text file + texts = process_textfile(f"./llm/scripts/{paper_name}.txt") + + # generate audio for each chunk of text + all_audio_chunks = [] + for i, text in enumerate(texts): + gen = tts.tts( + text=text, + voice=voice, + voice_samples=voice_samples, + conditioning_latents=conditioning_latents, + ) + torchaudio.save(f"./audio/raw/{i}.wav", gen.squeeze(0).cpu(), 24000) + + all_audio_chunks.append(gen) + + # concatenate all audio chunks + full_audio = torch.cat(all_audio_chunks, dim=-1) + torchaudio.save(f"./audio/raw/{paper_name}.wav", full_audio, 24000)