Compare commits

...

4 Commits

Author SHA1 Message Date
a1d0ea3232 updating utils 2023-10-04 16:04:08 +08:00
02e3a46700 adding tortoise utils 2023-09-10 23:45:45 +08:00
79154ac651 adding inference_utils 2023-09-09 17:46:40 +08:00
6bae8c6a8c removing venv setup 2023-09-07 19:19:27 +08:00
4 changed files with 145 additions and 7 deletions

0
__init__.py Normal file
View File

71
inference_utils.py Normal file
View File

@ -0,0 +1,71 @@
import re
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice
def clean_text(text: str, target_len: int = 200, max_len: int = 300) -> list[str]:
    """Normalize *text* and split it into speakable chunks.

    Collapses blank lines and whitespace runs, converts curly quotes to
    ASCII, splits on sentence boundaries (keeping quoted speech together),
    then greedily packs sentences into chunks of roughly ``target_len``
    characters, hard-breaking once a chunk exceeds ``max_len``.

    Returns a list of stripped, non-empty text chunks.
    """
    # remove double newlines and redundant whitespace; convert curly quotes to ASCII
    text = re.sub(r"\n\n+", r"\n", text)
    text = re.sub(r"\s+", r" ", text)
    text = re.sub(r"[“”]", '"', text)
    # split after ./!/? only when outside balanced double quotes,
    # so quoted speech stays in one sentence
    sentences = re.split(r'(?<=[.!?])\s+(?=(?:[^"]*"[^"]*")*[^"]*$)', text)
    # greedily recombine sentences into chunks of the desired length
    chunks: list[str] = []
    chunk = ""
    for sentence in sentences:
        if len(chunk) + len(sentence) > target_len and chunk:
            # fix: only flush non-empty chunks (the original appended ""
            # here and relied on the final filter to drop it)
            chunks.append(chunk)
            chunk = ""
        chunk += sentence + " "
        if len(chunk) > max_len:
            # a single oversized sentence: hard-break immediately
            chunks.append(chunk)
            chunk = ""
    if chunk:
        chunks.append(chunk)
    # clean up: strip whitespace, drop empty or punctuation-only chunks
    chunks = [s.strip() for s in chunks]
    return [s for s in chunks if s and not re.match(r"^[\s\.,;:!?]*$", s)]
def process_textfile(file_path: str) -> list[str]:
    """Read a UTF-8 text file and return its cleaned, chunked contents."""
    with open(file_path, "r", encoding="utf-8") as f:
        # f.read() replaces the original " ".join(f.readlines()):
        # clean_text collapses all whitespace runs, so the result is
        # identical without the needless line-list copy
        text = f.read()
    return clean_text(text)
def tts(paper_name: str):
    """Synthesize ./llm/scripts/{paper_name}.txt with the GLaDOS voice.

    Writes one WAV per text chunk to ./audio/raw/{i}.wav and the
    concatenated result to ./audio/raw/{paper_name}.wav.
    """
    # fix: torch/torchaudio were used below but never imported in this file;
    # import locally since only this entry point needs them
    import torch
    import torchaudio

    # load tts model (renamed from `tts`, which shadowed this function)
    tts_model = TextToSpeech(
        autoregressive_model_path="./ai-voice-cloning/training/GlaDOS/finetune/models/5304_gpt.pth"
    )
    voice = "GlaDOS"
    # NOTE(review): tortoise's load_voice typically takes extra_voice_dirs as a
    # list of dirs — confirm a bare string works with the pinned version
    voice_samples, conditioning_latents = load_voice(
        voice, extra_voice_dirs="./ai-voice-cloning/voices"
    )
    # process text file into chunks sized for the model
    texts = process_textfile(f"./llm/scripts/{paper_name}.txt")
    # generate audio for each chunk of text
    all_audio_chunks = []
    for i, text in enumerate(texts):
        gen = tts_model.tts(
            text=text,
            voice=voice,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
        )
        # drop the leading batch dim so torchaudio.save gets (channels, samples)
        chunk_audio = gen.squeeze(0).cpu()
        torchaudio.save(f"./audio/raw/{i}.wav", chunk_audio, 24000)
        # fix: collect the squeezed 2-D tensor (the original appended the raw
        # output, so the final cat/save below got an unsaveable shape)
        all_audio_chunks.append(chunk_audio)
    # concatenate all audio chunks along the sample axis
    full_audio = torch.cat(all_audio_chunks, dim=-1)
    torchaudio.save(f"./audio/raw/{paper_name}.wav", full_audio, 24000)

View File

@ -2,10 +2,6 @@
# get local dependencies
git submodule init
git submodule update --remote
# setup venv
python3 -m venv venv
source ./venv/bin/activate
python3 -m pip install --upgrade pip # just to be safe
# CUDA
pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
# install requirements
@ -15,6 +11,4 @@ python3 -m pip install -r ./modules/dlas/requirements.txt # instal DLAS requirem
python3 -m pip install -e ./modules/dlas/ # install DLAS
python3 -m pip install -r ./requirements.txt # install local requirements
rm *.bat
deactivate

73
tortoise_utils.py Normal file
View File

@ -0,0 +1,73 @@
import re
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice
def clean_text(text: str, target_len: int = 200, max_len: int = 300) -> list[str]:
    """Collapse whitespace, normalize quotes, and chunk *text* by sentence.

    Sentences are packed greedily up to roughly ``target_len`` characters,
    with a hard break once a chunk exceeds ``max_len``. Returns stripped,
    non-empty chunks.
    """
    # normalize: blank lines -> one newline, whitespace runs -> one space,
    # curly quotes -> ASCII double quotes
    for pattern, repl in ((r"\n\n+", r"\n"), (r"\s+", r" "), (r"[“”]", '"')):
        text = re.sub(pattern, repl, text)
    # split after ./!/? that fall outside balanced double quotes
    pieces = re.split(r'(?<=[.!?])\s+(?=(?:[^"]*"[^"]*")*[^"]*$)', text)
    # pack sentences greedily into chunks near the target length
    grouped: list[str] = []
    current = ""
    for piece in pieces:
        if len(current) + len(piece) > target_len:
            grouped.append(current)
            current = ""
        current = current + piece + " "
        if len(current) > max_len:
            grouped.append(current)
            current = ""
    if current:
        grouped.append(current)
    # trim and discard chunks that are empty or punctuation-only
    trimmed = (g.strip() for g in grouped)
    return [g for g in trimmed if g and not re.match(r"^[\s\.,;:!?]*$", g)]
def process_textfile(file_path: str) -> list[str]:
    """Load a UTF-8 text file and return its cleaned, chunked contents."""
    with open(file_path, "r", encoding="utf-8") as handle:
        # join lines with spaces; clean_text collapses the whitespace later
        raw = " ".join(handle.readlines())
    return clean_text(raw)
def tts(file_path: str):
    """Synthesize the text file at *file_path* with the Lex voice.

    Writes one WAV per text chunk to ./audio/raw/{i}.wav and the
    concatenated audio to ./audio/raw/{stem}.wav (stem of *file_path*).
    """
    # fix: os/torch/torchaudio were used below but never imported in this
    # file; import locally since only this entry point needs them
    import os
    import torch
    import torchaudio

    # TODO: autoregressive_model_path still points at a directory (the
    # original's "# ADD PATH" placeholder) — set the fine-tuned .pth
    # checkpoint before use
    tts_model = TextToSpeech(
        autoregressive_model_path="./ai-voice-cloning/training/"
    )
    voice = "Lex"
    # NOTE(review): tortoise's load_voice typically takes extra_voice_dirs as a
    # list of dirs — confirm a bare string works with the pinned version
    voice_samples, conditioning_latents = load_voice(
        voice, extra_voice_dirs="./ai-voice-cloning/voices"
    )
    # process text file into chunks sized for the model
    texts = process_textfile(file_path)
    # generate audio for each chunk of text
    all_audio_chunks = []
    for i, text in enumerate(texts):
        gen = tts_model.tts(
            text=text,
            voice=voice,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
        )
        # drop the leading batch dim so torchaudio.save gets (channels, samples)
        chunk_audio = gen.squeeze(0).cpu()
        torchaudio.save(f"./audio/raw/{i}.wav", chunk_audio, 24000)
        # fix: collect the squeezed 2-D tensor (the original appended the raw
        # output, so the final cat/save below got an unsaveable shape)
        all_audio_chunks.append(chunk_audio)
    # derive output name from the input file's stem
    book_name_ext = os.path.basename(file_path)
    paper_name = os.path.splitext(book_name_ext)[0]
    # concatenate all audio chunks along the sample axis
    full_audio = torch.cat(all_audio_chunks, dim=-1)
    torchaudio.save(f"./audio/raw/{paper_name}.wav", full_audio, 24000)