Compare commits
4 Commits
Author | SHA1 | Date | |
---|---|---|---|
a1d0ea3232 | |||
02e3a46700 | |||
79154ac651 | |||
6bae8c6a8c |
0
__init__.py
Normal file
0
__init__.py
Normal file
71
inference_utils.py
Normal file
71
inference_utils.py
Normal file
|
@ -0,0 +1,71 @@
|
|||
import re
|
||||
from tortoise.api import TextToSpeech
|
||||
from tortoise.utils.audio import load_voice
|
||||
|
||||
def clean_text(text: str, target_len: int = 200, max_len: int = 300) -> list[str]:
    """Normalize *text* and split it into sentence-aligned chunks.

    Every run of whitespace (newlines included) is collapsed to a single
    space and curly double quotes are converted to ASCII double quotes. The
    text is then split into sentences, which are greedily regrouped into
    chunks.

    Args:
        text: Raw input text.
        target_len: Soft limit. The current chunk is closed before adding a
            sentence that would push its length past this value.
        max_len: The current chunk is closed right after a sentence pushes
            its length past this value. A single sentence is never split,
            so a chunk can still be longer than ``max_len``.

    Returns:
        The chunks with surrounding whitespace stripped. Chunks that are
        empty or consist only of whitespace/punctuation are dropped.
    """
    # "\s+" also matches newlines, so no separate blank-line pass is needed.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[“”]", '"', text)

    # Split after sentence-ending punctuation, but only where the rest of the
    # text holds an even number of double quotes, i.e. outside a quotation.
    sentences = re.split(r'(?<=[.!?])\s+(?=(?:[^"]*"[^"]*")*[^"]*$)', text)

    chunks: list[str] = []
    pending: list[str] = []
    # Length of the pending chunk, counting one separator per sentence.
    pending_len = 0
    for sentence in sentences:
        # Only close a chunk that actually holds something; an over-long
        # first sentence must not produce an empty chunk.
        if pending and pending_len + len(sentence) > target_len:
            chunks.append(" ".join(pending))
            pending, pending_len = [], 0
        pending.append(sentence)
        pending_len += len(sentence) + 1
        if pending_len > max_len:
            chunks.append(" ".join(pending))
            pending, pending_len = [], 0
    if pending:
        chunks.append(" ".join(pending))

    # Strip each chunk and drop empty or punctuation-only (useless) chunks.
    stripped = (chunk.strip() for chunk in chunks)
    return [c for c in stripped if c and not re.match(r"^[\s\.,;:!?]*$", c)]
|
||||
|
||||
|
||||
def process_textfile(file_path: str) -> list[str]:
    """Read a UTF-8 text file and return its content as cleaned text chunks.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The chunks produced by ``clean_text`` (with its default limits) for
        the file's content.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        # Iterating the file yields its lines; joining them with a space
        # keeps the exact string the cleaner received before.
        raw_text = " ".join(f)
    return clean_text(raw_text)
|
||||
|
||||
def tts(paper_name: str) -> None:
    """Synthesize speech for a paper script and write WAV files.

    Reads ``./llm/scripts/{paper_name}.txt``, splits it into chunks with
    ``process_textfile``, generates audio for every chunk with the fine-tuned
    "GlaDOS" voice, and writes each chunk to ``./audio/raw/{i}.wav`` as well
    as the concatenation of all chunks to ``./audio/raw/{paper_name}.wav``.

    Args:
        paper_name: Base name (without extension) of the script file; also
            used as the base name of the combined output file.

    Raises:
        ValueError: If the script yields no text chunks.
    """
    # Imported locally: this block uses these modules but the file header
    # does not import them.
    import os

    import torch
    import torchaudio

    # process text file first so an empty script fails before the (slow)
    # model load instead of inside torch.cat afterwards
    texts = process_textfile(f"./llm/scripts/{paper_name}.txt")
    if not texts:
        raise ValueError(f"no text chunks to synthesize for {paper_name!r}")

    # load tts model
    model = TextToSpeech(
        autoregressive_model_path="./ai-voice-cloning/training/GlaDOS/finetune/models/5304_gpt.pth"
    )
    voice = "GlaDOS"
    # NOTE(review): extra_voice_dirs is passed as a plain string; upstream
    # tortoise declares this parameter with a list default - confirm the
    # fork in use accepts a string.
    voice_samples, conditioning_latents = load_voice(
        voice, extra_voice_dirs="./ai-voice-cloning/voices"
    )

    # torchaudio.save does not create missing directories
    os.makedirs("./audio/raw", exist_ok=True)

    # generate audio for each chunk of text
    all_audio_chunks = []
    for i, text in enumerate(texts):
        gen = model.tts(
            text=text,
            voice=voice,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
        )
        torchaudio.save(f"./audio/raw/{i}.wav", gen.squeeze(0).cpu(), 24000)
        all_audio_chunks.append(gen)

    # concatenate all audio chunks along the last axis; the result gets the
    # same squeeze/cpu treatment as the per-chunk saves above
    full_audio = torch.cat(all_audio_chunks, dim=-1)
    torchaudio.save(f"./audio/raw/{paper_name}.wav", full_audio.squeeze(0).cpu(), 24000)
|
|
@ -2,10 +2,6 @@
|
|||
# get local dependencies
|
||||
git submodule init
|
||||
git submodule update --remote
|
||||
# setup venv
|
||||
python3 -m venv venv
|
||||
source ./venv/bin/activate
|
||||
python3 -m pip install --upgrade pip # just to be safe
|
||||
# CUDA
|
||||
pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
|
||||
# install requirements
|
||||
|
@ -16,5 +12,3 @@ python3 -m pip install -e ./modules/dlas/ # install DLAS
|
|||
python3 -m pip install -r ./requirements.txt # install local requirements
|
||||
|
||||
rm *.bat
|
||||
|
||||
deactivate
|
73
tortoise_utils.py
Normal file
73
tortoise_utils.py
Normal file
|
@ -0,0 +1,73 @@
|
|||
import re
|
||||
from tortoise.api import TextToSpeech
|
||||
from tortoise.utils.audio import load_voice
|
||||
|
||||
def clean_text(text: str, target_len: int = 200, max_len: int = 300) -> list[str]:
    """Normalize *text* and split it into sentence-aligned chunks.

    Every run of whitespace (newlines included) is collapsed to a single
    space and curly double quotes are converted to ASCII double quotes. The
    text is then split into sentences, which are greedily regrouped into
    chunks.

    Args:
        text: Raw input text.
        target_len: Soft limit. The current chunk is closed before adding a
            sentence that would push its length past this value.
        max_len: The current chunk is closed right after a sentence pushes
            its length past this value. A single sentence is never split,
            so a chunk can still be longer than ``max_len``.

    Returns:
        The chunks with surrounding whitespace stripped. Chunks that are
        empty or consist only of whitespace/punctuation are dropped.
    """
    # "\s+" also matches newlines, so no separate blank-line pass is needed.
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[“”]", '"', text)

    # Split after sentence-ending punctuation, but only where the rest of the
    # text holds an even number of double quotes, i.e. outside a quotation.
    sentences = re.split(r'(?<=[.!?])\s+(?=(?:[^"]*"[^"]*")*[^"]*$)', text)

    chunks: list[str] = []
    pending: list[str] = []
    # Length of the pending chunk, counting one separator per sentence.
    pending_len = 0
    for sentence in sentences:
        # Only close a chunk that actually holds something; an over-long
        # first sentence must not produce an empty chunk.
        if pending and pending_len + len(sentence) > target_len:
            chunks.append(" ".join(pending))
            pending, pending_len = [], 0
        pending.append(sentence)
        pending_len += len(sentence) + 1
        if pending_len > max_len:
            chunks.append(" ".join(pending))
            pending, pending_len = [], 0
    if pending:
        chunks.append(" ".join(pending))

    # Strip each chunk and drop empty or punctuation-only (useless) chunks.
    stripped = (chunk.strip() for chunk in chunks)
    return [c for c in stripped if c and not re.match(r"^[\s\.,;:!?]*$", c)]
|
||||
|
||||
|
||||
def process_textfile(file_path: str) -> list[str]:
    """Read a UTF-8 text file and return its content as cleaned text chunks.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The chunks produced by ``clean_text`` (with its default limits) for
        the file's content.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        # Iterating the file yields its lines; joining them with a space
        # keeps the exact string the cleaner received before.
        raw_text = " ".join(f)
    return clean_text(raw_text)
|
||||
|
||||
def tts(file_path: str) -> None:
    """Synthesize speech for a text file and write WAV files.

    Splits the file at *file_path* into chunks with ``process_textfile``,
    generates audio for every chunk with the "Lex" voice, and writes each
    chunk to ``./audio/raw/{i}.wav`` as well as the concatenation of all
    chunks to ``./audio/raw/{name}.wav``, where ``name`` is the file's base
    name without its extension.

    Args:
        file_path: Path of the text file to read aloud.

    Raises:
        ValueError: If the file yields no text chunks.
    """
    # Imported locally: this block uses these modules but the file header
    # does not import them.
    import os

    import torch
    import torchaudio

    # process text file first so an empty file fails before the (slow)
    # model load instead of inside torch.cat afterwards
    texts = process_textfile(file_path)
    if not texts:
        raise ValueError(f"no text chunks to synthesize in {file_path!r}")

    # load tts model
    # TODO: ADD PATH - the value below is still the placeholder training
    # directory, not a concrete model checkpoint.
    model = TextToSpeech(
        autoregressive_model_path="./ai-voice-cloning/training/"
    )
    voice = "Lex"
    # NOTE(review): extra_voice_dirs is passed as a plain string; upstream
    # tortoise declares this parameter with a list default - confirm the
    # fork in use accepts a string.
    voice_samples, conditioning_latents = load_voice(
        voice, extra_voice_dirs="./ai-voice-cloning/voices"
    )

    # torchaudio.save does not create missing directories
    os.makedirs("./audio/raw", exist_ok=True)

    # generate audio for each chunk of text
    all_audio_chunks = []
    for i, text in enumerate(texts):
        gen = model.tts(
            text=text,
            voice=voice,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
        )
        torchaudio.save(f"./audio/raw/{i}.wav", gen.squeeze(0).cpu(), 24000)
        all_audio_chunks.append(gen)

    # name the combined file after the input file, minus its extension
    book_name_ext = os.path.basename(file_path)
    paper_name = os.path.splitext(book_name_ext)[0]

    # concatenate all audio chunks along the last axis; the result gets the
    # same squeeze/cpu treatment as the per-chunk saves above
    full_audio = torch.cat(all_audio_chunks, dim=-1)
    torchaudio.save(f"./audio/raw/{paper_name}.wav", full_audio.squeeze(0).cpu(), 24000)
|
Loading…
Reference in New Issue
Block a user