Compare commits
4 Commits
Author | SHA1 | Date | |
---|---|---|---|
a1d0ea3232 | |||
02e3a46700 | |||
79154ac651 | |||
6bae8c6a8c |
2
Dockerfile
Executable file → Normal file
2
Dockerfile
Executable file → Normal file
|
@ -20,7 +20,7 @@ ENV PATH="$HOME/miniconda/bin:$PATH"
|
||||||
RUN conda init
|
RUN conda init
|
||||||
RUN conda install python=$PYTHON_VERSION
|
RUN conda install python=$PYTHON_VERSION
|
||||||
RUN python3 -m pip install --upgrade pip
|
RUN python3 -m pip install --upgrade pip
|
||||||
RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
RUN pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
|
||||||
|
|
||||||
RUN mkdir $HOME/ai-voice-cloning
|
RUN mkdir $HOME/ai-voice-cloning
|
||||||
WORKDIR $HOME/ai-voice-cloning
|
WORKDIR $HOME/ai-voice-cloning
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
# AI Voice Cloning
|
# AI Voice Cloning
|
||||||
|
|
||||||
> **Note** This project is effectively abandonware due to requiring a rewrite. Please use [JarodMica/ai-voice-cloning](https://github.com/JarodMica/ai-voice-cloning).
|
> **Note** This project has been in dire need of being rewritten from the ground up for some time. Apologies for any crust from my rather spaghetti code.
|
||||||
|
|
||||||
This [repo](https://git.ecker.tech/mrq/ai-voice-cloning)/[rentry](https://rentry.org/AI-Voice-Cloning/) aims to serve as both a foolproof guide for setting up AI voice cloning tools for legitimate, local use on Windows/Linux, as well as a stepping stone for anons that genuinely want to play around with [TorToiSe](https://github.com/neonbjb/tortoise-tts).
|
This [repo](https://git.ecker.tech/mrq/ai-voice-cloning)/[rentry](https://rentry.org/AI-Voice-Cloning/) aims to serve as both a foolproof guide for setting up AI voice cloning tools for legitimate, local use on Windows/Linux, as well as a stepping stone for anons that genuinely want to play around with [TorToiSe](https://github.com/neonbjb/tortoise-tts).
|
||||||
|
|
||||||
|
@ -16,4 +16,4 @@ Please consult [the wiki](https://git.ecker.tech/mrq/ai-voice-cloning/wiki) for
|
||||||
|
|
||||||
## Bug Reporting
|
## Bug Reporting
|
||||||
|
|
||||||
If you run into any problems, please refer to the [issues you may encounter](https://git.ecker.tech/mrq/ai-voice-cloning/wiki/Issues) wiki page first.
|
If you run into any problems, please refer to the [issues you may encounter](https://git.ecker.tech/mrq/ai-voice-cloning/wiki/Issues) wiki page first.
|
0
__init__.py
Normal file
0
__init__.py
Normal file
71
inference_utils.py
Normal file
71
inference_utils.py
Normal file
|
@ -0,0 +1,71 @@
|
||||||
|
import re
|
||||||
|
from tortoise.api import TextToSpeech
|
||||||
|
from tortoise.utils.audio import load_voice
|
||||||
|
|
||||||
|
def clean_text(text: str, target_len: int = 200, max_len: int = 300) -> list[str]:
    """Normalise *text* and regroup its sentences into chunks.

    The text is whitespace-normalised, curly double quotes are replaced by
    ASCII ones, and the result is split into sentences. Sentences are then
    accumulated into chunks: a chunk is closed as soon as adding the next
    sentence would push it past ``target_len`` characters, or once the chunk
    itself has grown past ``max_len`` characters.

    Note that ``max_len`` is only checked after a sentence has been added, so
    a single sentence longer than ``max_len`` is emitted whole.

    Args:
        text: Raw input text.
        target_len: Length (in characters) above which a new chunk is started
            before adding the next sentence.
        max_len: Length (in characters) above which the current chunk is
            closed right after a sentence has been added.

    Returns:
        The stripped chunks, without empty or punctuation-only entries.
    """
    # Normalisation passes, applied in this order: collapse blank lines,
    # collapse every whitespace run to one space, straighten curly quotes.
    for pattern, replacement in (
        (r"\n\n+", "\n"),
        (r"\s+", " "),
        (r"[“”]", '"'),
    ):
        text = re.sub(pattern, replacement, text)

    # A sentence boundary is whitespace that follows ., ! or ? and is followed
    # by an even number of double quotes up to the end of the text, i.e. the
    # boundary is not inside a quoted span.
    sentence_boundary = r'(?<=[.!?])\s+(?=(?:[^"]*"[^"]*")*[^"]*$)'

    # Regroup the sentences into chunks of roughly the desired length.
    grouped = []
    pending = ""
    for piece in re.split(sentence_boundary, text):
        if len(pending) + len(piece) > target_len:
            grouped.append(pending)
            pending = ""
        pending = f"{pending}{piece} "
        if len(pending) > max_len:
            grouped.append(pending)
            pending = ""
    if pending:
        grouped.append(pending)

    # Trim each chunk and drop the empty/useless (punctuation-only) ones.
    punctuation_only = re.compile(r"^[\s\.,;:!?]*$")
    trimmed = (candidate.strip() for candidate in grouped)
    return [
        candidate
        for candidate in trimmed
        if candidate and punctuation_only.match(candidate) is None
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def process_textfile(file_path: str) -> list[str]:
    """Read a UTF-8 text file and return its content as cleaned chunks.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The chunks produced by ``clean_text`` for the file's content.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        # Lines keep their trailing newline and are additionally separated by
        # a space; clean_text collapses the resulting whitespace runs.
        raw = " ".join(handle)
    return clean_text(raw)
|
||||||
|
|
||||||
|
def tts(paper_name: str):
    """Synthesise ``./llm/scripts/<paper_name>.txt`` with the "GlaDOS" voice.

    The script is split into chunks with ``process_textfile``; every chunk is
    generated separately and written to ``./audio/raw/<index>.wav``, then all
    chunks are concatenated along the last dimension and written to
    ``./audio/raw/<paper_name>.wav``. All files are saved at 24000 Hz.

    Args:
        paper_name: Base name (without extension) of the script file; it is
            also used as the base name of the combined output file.

    Returns:
        None. The results are the audio files written to disk.
    """
    # These modules are used below but are not imported at the top of this
    # file; importing them here keeps the function from raising NameError.
    import os

    import torch
    import torchaudio

    sample_rate = 24000
    output_dir = "./audio/raw"
    voice = "GlaDOS"

    # load tts model (named ``model`` so it does not shadow this function)
    model = TextToSpeech(
        autoregressive_model_path="./ai-voice-cloning/training/GlaDOS/finetune/models/5304_gpt.pth"
    )
    # NOTE(review): extra_voice_dirs is passed as a plain string; upstream
    # TorToiSe appears to expect a list of directories -- confirm against the
    # load_voice implementation in the tortoise-tts submodule.
    voice_samples, conditioning_latents = load_voice(
        voice, extra_voice_dirs="./ai-voice-cloning/voices"
    )

    # process text file
    texts = process_textfile(f"./llm/scripts/{paper_name}.txt")

    # The save calls below do not create missing directories themselves.
    os.makedirs(output_dir, exist_ok=True)

    # generate audio for each chunk of text
    all_audio_chunks = []
    for i, text in enumerate(texts):
        gen = model.tts(
            text=text,
            voice=voice,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
        )
        torchaudio.save(f"{output_dir}/{i}.wav", gen.squeeze(0).cpu(), sample_rate)
        all_audio_chunks.append(gen)

    # concatenate all audio chunks
    full_audio = torch.cat(all_audio_chunks, dim=-1)
    # Apply the same squeeze(0)/cpu() conversion as the per-chunk saves so the
    # combined tensor reaches torchaudio.save in the same layout and on the
    # same device as the individual chunks do.
    torchaudio.save(
        f"{output_dir}/{paper_name}.wav", full_audio.squeeze(0).cpu(), sample_rate
    )
|
|
@ -1 +1 @@
|
||||||
Subproject commit bf3b6c87aa825295f64a31d010fd5e896fbcda43
|
Subproject commit b10c58436d6871c26485d30b203e6cfdd4167602
|
|
@ -38,24 +38,10 @@
|
||||||
|
|
||||||
],
|
],
|
||||||
"source":[
|
"source":[
|
||||||
"!apt install python3.10-venv\n",
|
"!apt install python3.8-venv\n",
|
||||||
"!git clone https://git.ecker.tech/mrq/ai-voice-cloning/\n",
|
"!git clone https://git.ecker.tech/mrq/ai-voice-cloning/\n",
|
||||||
"%cd /content/ai-voice-cloning\n",
|
"%cd /content/ai-voice-cloning\n",
|
||||||
"# get local dependencies\n",
|
"!./setup-cuda.sh"
|
||||||
"!git submodule init\n",
|
|
||||||
"!git submodule update --remote\n",
|
|
||||||
"# setup venv\n",
|
|
||||||
"!python3 -m venv venv\n",
|
|
||||||
"!source ./venv/bin/activate\n",
|
|
||||||
"!python3 -m pip install --upgrade pip # just to be safe\n",
|
|
||||||
"# CUDA\n",
|
|
||||||
"!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
|
|
||||||
"# install requirements\n",
|
|
||||||
"!python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements\n",
|
|
||||||
"!python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe\n",
|
|
||||||
"!python3 -m pip install -r ./modules/dlas/requirements.txt # instal DLAS requirements, last, because whisperx will break a dependency here\n",
|
|
||||||
"!python3 -m pip install -e ./modules/dlas/ # install DLAS\n",
|
|
||||||
"!python3 -m pip install -r ./requirements.txt # install local requirements"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -129,8 +115,7 @@
|
||||||
"cell_type":"code",
|
"cell_type":"code",
|
||||||
"source":[
|
"source":[
|
||||||
"%cd /content/ai-voice-cloning/\n",
|
"%cd /content/ai-voice-cloning/\n",
|
||||||
"!source ./venv/bin/activate\n",
|
"!./start.sh --share"
|
||||||
"!python3 ./src/main.py --share"
|
|
||||||
],
|
],
|
||||||
"metadata":{
|
"metadata":{
|
||||||
"id":"QRA8jF3cF-YJ"
|
"id":"QRA8jF3cF-YJ"
|
||||||
|
|
|
@ -1,9 +1,5 @@
|
||||||
--extra-index-url https://download.pytorch.org/whl/cu118
|
git+https://github.com/openai/whisper.git
|
||||||
torch>=2.1.0
|
|
||||||
torchvision
|
|
||||||
torchaudio
|
|
||||||
|
|
||||||
openai-whisper
|
|
||||||
more-itertools
|
more-itertools
|
||||||
ffmpeg-python
|
ffmpeg-python
|
||||||
gradio<=3.23.0
|
gradio<=3.23.0
|
||||||
|
@ -12,6 +8,4 @@ voicefixer
|
||||||
psutil
|
psutil
|
||||||
phonemizer
|
phonemizer
|
||||||
pydantic==1.10.11
|
pydantic==1.10.11
|
||||||
websockets
|
websockets
|
||||||
beartype==0.15.0
|
|
||||||
pykakasi
|
|
|
@ -4,7 +4,7 @@ git submodule update --remote
|
||||||
python -m venv venv
|
python -m venv venv
|
||||||
call .\venv\Scripts\activate.bat
|
call .\venv\Scripts\activate.bat
|
||||||
python -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
|
||||||
python -m pip install -r .\modules\tortoise-tts\requirements.txt
|
python -m pip install -r .\modules\tortoise-tts\requirements.txt
|
||||||
python -m pip install -e .\modules\tortoise-tts\
|
python -m pip install -e .\modules\tortoise-tts\
|
||||||
python -m pip install -r .\modules\dlas\requirements.txt
|
python -m pip install -r .\modules\dlas\requirements.txt
|
||||||
|
|
|
@ -2,12 +2,8 @@
|
||||||
# get local dependencies
|
# get local dependencies
|
||||||
git submodule init
|
git submodule init
|
||||||
git submodule update --remote
|
git submodule update --remote
|
||||||
# setup venv
|
|
||||||
python3 -m venv venv
|
|
||||||
source ./venv/bin/activate
|
|
||||||
python3 -m pip install --upgrade pip # just to be safe
|
|
||||||
# CUDA
|
# CUDA
|
||||||
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
|
||||||
# install requirements
|
# install requirements
|
||||||
python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements
|
python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements
|
||||||
python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe
|
python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe
|
||||||
|
@ -15,6 +11,4 @@ python3 -m pip install -r ./modules/dlas/requirements.txt # instal DLAS requirem
|
||||||
python3 -m pip install -e ./modules/dlas/ # install DLAS
|
python3 -m pip install -e ./modules/dlas/ # install DLAS
|
||||||
python3 -m pip install -r ./requirements.txt # install local requirements
|
python3 -m pip install -r ./requirements.txt # install local requirements
|
||||||
|
|
||||||
rm *.bat
|
rm *.bat
|
||||||
|
|
||||||
deactivate
|
|
|
@ -7,7 +7,7 @@ python3 -m venv venv
|
||||||
source ./venv/bin/activate
|
source ./venv/bin/activate
|
||||||
python3 -m pip install --upgrade pip # just to be safe
|
python3 -m pip install --upgrade pip # just to be safe
|
||||||
# ROCM
|
# ROCM
|
||||||
pip3 install torch==1.13.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.2 # 5.4.2 doesn't work for me desu
|
pip3 install torch==1.13.1 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.2 # 5.4.2 doesn't work for me desu
|
||||||
# install requirements
|
# install requirements
|
||||||
python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements
|
python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements
|
||||||
python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe
|
python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe
|
||||||
|
|
24
src/utils.py
24
src/utils.py
|
@ -68,20 +68,8 @@ BARK_ENABLED = False
|
||||||
|
|
||||||
VERBOSE_DEBUG = True
|
VERBOSE_DEBUG = True
|
||||||
|
|
||||||
KKS = None
|
|
||||||
PYKAKASI_ENABLED = False
|
|
||||||
|
|
||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
try:
|
|
||||||
import pykakasi
|
|
||||||
KKS = pykakasi.kakasi()
|
|
||||||
PYKAKASI_ENABLED = True
|
|
||||||
except Exception as e:
|
|
||||||
#if VERBOSE_DEBUG:
|
|
||||||
# print(traceback.format_exc())
|
|
||||||
pass
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from whisper.normalizers.english import EnglishTextNormalizer
|
from whisper.normalizers.english import EnglishTextNormalizer
|
||||||
from whisper.normalizers.basic import BasicTextNormalizer
|
from whisper.normalizers.basic import BasicTextNormalizer
|
||||||
|
@ -2677,8 +2665,8 @@ def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, p
|
||||||
|
|
||||||
|
|
||||||
culled = len(text) < text_length
|
culled = len(text) < text_length
|
||||||
if not culled and audio_length > 0:
|
#if not culled and audio_length > 0:
|
||||||
culled = duration < audio_length
|
# culled = duration < audio_length
|
||||||
|
|
||||||
line = f'audio/{file}|{phonemes if phonemize and phonemes else text}'
|
line = f'audio/{file}|{phonemes if phonemize and phonemes else text}'
|
||||||
|
|
||||||
|
@ -2746,14 +2734,6 @@ def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, p
|
||||||
phn_file = jobs['phonemize'][0][i]
|
phn_file = jobs['phonemize'][0][i]
|
||||||
normalized = jobs['phonemize'][1][i]
|
normalized = jobs['phonemize'][1][i]
|
||||||
|
|
||||||
if language == "japanese":
|
|
||||||
language = "ja"
|
|
||||||
|
|
||||||
if language == "ja" and PYKAKASI_ENABLED and KKS is not None:
|
|
||||||
normalized = KKS.convert(normalized)
|
|
||||||
normalized = [ n["hira"] for n in normalized ]
|
|
||||||
normalized = "".join(normalized)
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
phonemized = valle_phonemize( normalized )
|
phonemized = valle_phonemize( normalized )
|
||||||
open(phn_file, 'w', encoding='utf-8').write(" ".join(phonemized))
|
open(phn_file, 'w', encoding='utf-8').write(" ".join(phonemized))
|
||||||
|
|
73
tortoise_utils.py
Normal file
73
tortoise_utils.py
Normal file
|
@ -0,0 +1,73 @@
|
||||||
|
import re
|
||||||
|
from tortoise.api import TextToSpeech
|
||||||
|
from tortoise.utils.audio import load_voice
|
||||||
|
|
||||||
|
def clean_text(text: str, target_len: int = 200, max_len: int = 300) -> list[str]:
    """Normalise *text* and regroup its sentences into chunks.

    The text is whitespace-normalised, curly double quotes are replaced by
    ASCII ones, and the result is split into sentences. Sentences are then
    accumulated into chunks: a chunk is closed as soon as adding the next
    sentence would push it past ``target_len`` characters, or once the chunk
    itself has grown past ``max_len`` characters.

    Note that ``max_len`` is only checked after a sentence has been added, so
    a single sentence longer than ``max_len`` is emitted whole.

    Args:
        text: Raw input text.
        target_len: Length (in characters) above which a new chunk is started
            before adding the next sentence.
        max_len: Length (in characters) above which the current chunk is
            closed right after a sentence has been added.

    Returns:
        The stripped chunks, without empty or punctuation-only entries.
    """
    # Normalisation passes, applied in this order: collapse blank lines,
    # collapse every whitespace run to one space, straighten curly quotes.
    for pattern, replacement in (
        (r"\n\n+", "\n"),
        (r"\s+", " "),
        (r"[“”]", '"'),
    ):
        text = re.sub(pattern, replacement, text)

    # A sentence boundary is whitespace that follows ., ! or ? and is followed
    # by an even number of double quotes up to the end of the text, i.e. the
    # boundary is not inside a quoted span.
    sentence_boundary = r'(?<=[.!?])\s+(?=(?:[^"]*"[^"]*")*[^"]*$)'

    # Regroup the sentences into chunks of roughly the desired length.
    grouped = []
    pending = ""
    for piece in re.split(sentence_boundary, text):
        if len(pending) + len(piece) > target_len:
            grouped.append(pending)
            pending = ""
        pending = f"{pending}{piece} "
        if len(pending) > max_len:
            grouped.append(pending)
            pending = ""
    if pending:
        grouped.append(pending)

    # Trim each chunk and drop the empty/useless (punctuation-only) ones.
    punctuation_only = re.compile(r"^[\s\.,;:!?]*$")
    trimmed = (candidate.strip() for candidate in grouped)
    return [
        candidate
        for candidate in trimmed
        if candidate and punctuation_only.match(candidate) is None
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def process_textfile(file_path: str) -> list[str]:
    """Read a UTF-8 text file and return its content as cleaned chunks.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The chunks produced by ``clean_text`` for the file's content.
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        # Lines keep their trailing newline and are additionally separated by
        # a space; clean_text collapses the resulting whitespace runs.
        raw = " ".join(handle)
    return clean_text(raw)
|
||||||
|
|
||||||
|
def tts(file_path: str):
    """Synthesise the text file at *file_path* with the "Lex" voice.

    The file is split into chunks with ``process_textfile``; every chunk is
    generated separately and written to ``./audio/raw/<index>.wav``, then all
    chunks are concatenated along the last dimension and written to
    ``./audio/raw/<name>.wav``, where ``<name>`` is the base name of
    *file_path* without its extension. All files are saved at 24000 Hz.

    Args:
        file_path: Path of the text file to read aloud.

    Returns:
        None. The results are the audio files written to disk.
    """
    # These modules are used below but are not imported at the top of this
    # file; importing them here keeps the function from raising NameError.
    import os

    import torch
    import torchaudio

    sample_rate = 24000
    output_dir = "./audio/raw"
    voice = "Lex"

    # load tts model (named ``model`` so it does not shadow this function)
    # ADD PATH
    model = TextToSpeech(
        autoregressive_model_path="./ai-voice-cloning/training/"
    )
    # NOTE(review): extra_voice_dirs is passed as a plain string; upstream
    # TorToiSe appears to expect a list of directories -- confirm against the
    # load_voice implementation in the tortoise-tts submodule.
    voice_samples, conditioning_latents = load_voice(
        voice, extra_voice_dirs="./ai-voice-cloning/voices"
    )

    # process text file
    texts = process_textfile(file_path)

    # The save calls below do not create missing directories themselves.
    os.makedirs(output_dir, exist_ok=True)

    # generate audio for each chunk of text
    all_audio_chunks = []
    for i, text in enumerate(texts):
        gen = model.tts(
            text=text,
            voice=voice,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
        )
        torchaudio.save(f"{output_dir}/{i}.wav", gen.squeeze(0).cpu(), sample_rate)
        all_audio_chunks.append(gen)

    # The combined file is named after the input file, minus its extension.
    book_name_ext = os.path.basename(file_path)
    paper_name = os.path.splitext(book_name_ext)[0]

    # concatenate all audio chunks
    full_audio = torch.cat(all_audio_chunks, dim=-1)
    # Apply the same squeeze(0)/cpu() conversion as the per-chunk saves so the
    # combined tensor reaches torchaudio.save in the same layout and on the
    # same device as the individual chunks do.
    torchaudio.save(
        f"{output_dir}/{paper_name}.wav", full_audio.squeeze(0).cpu(), sample_rate
    )
|
Loading…
Reference in New Issue
Block a user