updating utils

adding tortoise utils
adding inference_utils
2023-10-04 16:04:08 +08:00 · 2023-09-10 23:45:45 +08:00 · 2023-09-09 17:46:40 +08:00 · 2023-09-07 19:19:27 +08:00
12 changed files with 159 additions and 62 deletions
--- a/2
+++ b/2
@ -20,7 +20,7 @@ ENV PATH="$HOME/miniconda/bin:$PATH"
 RUN conda init
 RUN conda install python=$PYTHON_VERSION
 RUN python3 -m pip install --upgrade pip
-RUN pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+RUN pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118

 RUN mkdir $HOME/ai-voice-cloning
 WORKDIR $HOME/ai-voice-cloning
--- a/README.md
+++ b/README.md
@ -1,6 +1,6 @@
 # AI Voice Cloning

-> **Note** This project is effectively abandonware due to requiring a rewrite. Please use [JarodMica/ai-voice-cloning](https://github.com/JarodMica/ai-voice-cloning).
+> **Note** This project has been in dire need of being rewritten from the ground up for some time. Apologies for any cruft from my rather spaghetti code.

 This [repo](https://git.ecker.tech/mrq/ai-voice-cloning)/[rentry](https://rentry.org/AI-Voice-Cloning/) aims to serve as both a foolproof guide for setting up AI voice cloning tools for legitimate, local use on Windows/Linux, as well as a stepping stone for anons that genuinely want to play around with [TorToiSe](https://github.com/neonbjb/tortoise-tts).

@ -16,4 +16,4 @@ Please consult [the wiki](https://git.ecker.tech/mrq/ai-voice-cloning/wiki) for

 ## Bug Reporting

-If you run into any problems, please refer to the [issues you may encounter](https://git.ecker.tech/mrq/ai-voice-cloning/wiki/Issues) wiki page first.
+If you run into any problems, please refer to the [issues you may encounter](https://git.ecker.tech/mrq/ai-voice-cloning/wiki/Issues) wiki page first.
--- a/init.py
+++ b/init.py
--- a/inference_utils.py
+++ b/inference_utils.py
@ -0,0 +1,71 @@
+import re
+from tortoise.api import TextToSpeech
+from tortoise.utils.audio import load_voice
+
+def clean_text(text: str, target_len: int = 200, max_len: int = 300) -> list[str]:
+    """Normalise *text* and group its sentences into chunks.
+
+    Whitespace runs are collapsed to single spaces and curly double quotes
+    are replaced by ASCII ones. The text is then split at sentence-ending
+    punctuation (never inside a double-quoted passage) and the sentences are
+    re-joined into chunks: a chunk is closed before a sentence that would push
+    it past ``target_len`` characters, and closed immediately once it has grown
+    past ``max_len``. A single sentence is never split, so one long sentence
+    can still produce a chunk longer than ``max_len``.
+
+    Returns the stripped chunks, without empty or punctuation-only entries.
+    """
+    # Normalisation passes, applied in order: blank-line runs, whitespace
+    # runs, then curly double quotes.
+    for pattern, replacement in ((r"\n\n+", r"\n"), (r"\s+", r" "), (r"[“”]", '"')):
+        text = re.sub(pattern, replacement, text)
+
+    # Split after . ! ? only where an even number of quotes remains ahead,
+    # which keeps quoted passages in one piece.
+    pieces = re.split(r'(?<=[.!?])\s+(?=(?:[^"]*"[^"]*")*[^"]*$)', text)
+
+    grouped = []
+    current = ""
+    for piece in pieces:
+        # Close the running chunk first if this sentence would overshoot it.
+        if len(current) + len(piece) > target_len:
+            grouped.append(current)
+            current = ""
+        current = f"{current}{piece} "
+        # A chunk that is already past the upper bound is closed right away.
+        if len(current) > max_len:
+            grouped.append(current)
+            current = ""
+    if current:
+        grouped.append(current)
+
+    # Trim each chunk and discard empty/useless (punctuation-only) ones.
+    result = []
+    for candidate in grouped:
+        trimmed = candidate.strip()
+        if trimmed and re.match(r"^[\s\.,;:!?]*$", trimmed) is None:
+            result.append(trimmed)
+    return result
+
+
+def process_textfile(file_path: str) -> list[str]:
+    """Read the UTF-8 text file at *file_path* and return its cleaned chunks.
+
+    The file's lines are joined with single spaces and handed to
+    ``clean_text`` with its default length settings.
+    """
+    with open(file_path, "r", encoding="utf-8") as handle:
+        raw_lines = handle.readlines()
+    return clean_text(" ".join(raw_lines))
+
+def tts(paper_name: str):
+    """Synthesise ``./llm/scripts/<paper_name>.txt`` with the "GlaDOS" voice.
+
+    The script is split into chunks with ``process_textfile``; every chunk is
+    generated separately and written to ``./audio/raw/<index>.wav``, and the
+    concatenation of all chunks is written to ``./audio/raw/<paper_name>.wav``.
+    All files are saved with a sample rate of 24000.
+
+    Raises:
+        ValueError: if the script file yields no text chunks.
+    """
+    # Imported locally: this module's header does not import either package,
+    # so the bare names used below would otherwise raise NameError.
+    import torch
+    import torchaudio
+
+    # load tts model (named `model` so it does not shadow this function)
+    model = TextToSpeech(
+        autoregressive_model_path="./ai-voice-cloning/training/GlaDOS/finetune/models/5304_gpt.pth"
+    )
+    voice = "GlaDOS"
+    # NOTE(review): upstream TorToiSe documents extra_voice_dirs as a list of
+    # directories - confirm that a bare string is accepted by this fork.
+    voice_samples, conditioning_latents = load_voice(
+        voice, extra_voice_dirs="./ai-voice-cloning/voices"
+    )
+
+    # process text file
+    texts = process_textfile(f"./llm/scripts/{paper_name}.txt")
+    if not texts:
+        # torch.cat() cannot concatenate an empty list; fail with a clear message.
+        raise ValueError(f"no text chunks found for {paper_name!r}")
+
+    # generate audio for each chunk of text
+    all_audio_chunks = []
+    for i, text in enumerate(texts):
+        gen = model.tts(
+            text=text,
+            voice=voice,
+            voice_samples=voice_samples,
+            conditioning_latents=conditioning_latents,
+        )
+        # Drop the leading dimension and move to the CPU once, so the chunk
+        # file and the combined file are saved from the same tensor layout.
+        audio = gen.squeeze(0).cpu()
+        torchaudio.save(f"./audio/raw/{i}.wav", audio, 24000)
+
+        all_audio_chunks.append(audio)
+
+    # concatenate all audio chunks along the last (sample) dimension
+    full_audio = torch.cat(all_audio_chunks, dim=-1)
+    torchaudio.save(f"./audio/raw/{paper_name}.wav", full_audio, 24000)
+
+# NOTE(review): module-level statement - it runs whenever this file is
+# executed or imported. Looks like leftover debug output; confirm before removing.
+print("here")
--- a/modules/tortoise-tts
+++ b/modules/tortoise-tts
@ -1 +1 @@
-Subproject commit bf3b6c87aa825295f64a31d010fd5e896fbcda43
+Subproject commit b10c58436d6871c26485d30b203e6cfdd4167602
--- a/notebook_colab.ipynb
+++ b/notebook_colab.ipynb
@ -38,24 +38,10 @@
            
         ],
         "source":[
-            "!apt install python3.10-venv\n",
+            "!apt install python3.8-venv\n",
            "!git clone https://git.ecker.tech/mrq/ai-voice-cloning/\n",
            "%cd /content/ai-voice-cloning\n",
-            "# get local dependencies\n",
-            "!git submodule init\n",
-            "!git submodule update --remote\n",
-            "# setup venv\n",
-            "!python3 -m venv venv\n",
-            "!source ./venv/bin/activate\n",
-            "!python3 -m pip install --upgrade pip # just to be safe\n",
-            "# CUDA\n",
-            "!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118\n",
-            "# install requirements\n",
-            "!python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements\n",
-            "!python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe\n",
-            "!python3 -m pip install -r ./modules/dlas/requirements.txt # instal DLAS requirements, last, because whisperx will break a dependency here\n",
-            "!python3 -m pip install -e ./modules/dlas/ # install DLAS\n",
-            "!python3 -m pip install -r ./requirements.txt # install local requirements"
+            "!./setup-cuda.sh"
         ]
      },
      {
@ -129,8 +115,7 @@
         "cell_type":"code",
         "source":[
            "%cd /content/ai-voice-cloning/\n",
-            "!source ./venv/bin/activate\n",
-            "!python3 ./src/main.py --share"
+            "!./start.sh --share"
         ],
         "metadata":{
            "id":"QRA8jF3cF-YJ"
--- a/requirements.txt
+++ b/requirements.txt
@ -1,9 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/cu118
-torch>=2.1.0
-torchvision
-torchaudio
+git+https://github.com/openai/whisper.git

-openai-whisper
 more-itertools
 ffmpeg-python
 gradio<=3.23.0
@ -12,6 +8,4 @@ voicefixer
 psutil
 phonemizer
 pydantic==1.10.11
-websockets
-beartype==0.15.0
-pykakasi
+websockets
--- a/setup-cuda.bat
+++ b/setup-cuda.bat
@ -4,7 +4,7 @@ git submodule update --remote
 python -m venv venv
 call .\venv\Scripts\activate.bat
 python -m pip install --upgrade pip
-python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+python -m pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
 python -m pip install -r .\modules\tortoise-tts\requirements.txt
 python -m pip install -e .\modules\tortoise-tts\
 python -m pip install -r .\modules\dlas\requirements.txt
--- a/setup-cuda.sh
+++ b/setup-cuda.sh
@ -2,12 +2,8 @@
 # get local dependencies
 git submodule init
 git submodule update --remote
-# setup venv
-python3 -m venv venv
-source ./venv/bin/activate
-python3 -m pip install --upgrade pip # just to be safe
 # CUDA
-pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+pip3 install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
 # install requirements
 python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements
 python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe
@ -15,6 +11,4 @@ python3 -m pip install -r ./modules/dlas/requirements.txt # instal DLAS requirem
 python3 -m pip install -e ./modules/dlas/ # install DLAS
 python3 -m pip install -r ./requirements.txt # install local requirements

-rm *.bat
-
-deactivate
+rm *.bat
--- a/setup-rocm.sh
+++ b/setup-rocm.sh
@ -7,7 +7,7 @@ python3 -m venv venv
 source ./venv/bin/activate
 python3 -m pip install --upgrade pip # just to be safe
 # ROCM
-pip3 install torch==1.13.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.2 # 5.4.2 doesn't work for me desu
+pip3 install torch==1.13.1 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/rocm5.2 # 5.4.2 doesn't work for me desu
 # install requirements
 python3 -m pip install -r ./modules/tortoise-tts/requirements.txt # install TorToiSe requirements
 python3 -m pip install -e ./modules/tortoise-tts/ # install TorToiSe
--- a/src/utils.py
+++ b/src/utils.py
@ -68,20 +68,8 @@ BARK_ENABLED = False

 VERBOSE_DEBUG = True

-KKS = None
-PYKAKASI_ENABLED = False
-
 import traceback

-try:
-	import pykakasi
-	KKS = pykakasi.kakasi()
-	PYKAKASI_ENABLED = True
-except Exception as e:
-	#if VERBOSE_DEBUG:
-	#	print(traceback.format_exc())
-	pass
-
 try:
 	from whisper.normalizers.english import EnglishTextNormalizer
 	from whisper.normalizers.basic import BasicTextNormalizer
@ -2677,8 +2665,8 @@ def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, p


 		culled = len(text) < text_length
-		if not culled and audio_length > 0:
-			culled = duration < audio_length
+		#if not culled and audio_length > 0:
+		#	culled = duration < audio_length

 		line = f'audio/{file}|{phonemes if phonemize and phonemes else text}'

@ -2746,14 +2734,6 @@ def prepare_dataset( voice, use_segments=False, text_length=0, audio_length=0, p
 		phn_file = jobs['phonemize'][0][i]
 		normalized = jobs['phonemize'][1][i]

-		if language == "japanese":
-			language = "ja"
-
-		if language == "ja" and PYKAKASI_ENABLED and KKS is not None:
-			normalized = KKS.convert(normalized)
-			normalized = [ n["hira"] for n in normalized ]
-			normalized = "".join(normalized)
-
 		try:
 			phonemized = valle_phonemize( normalized )
 			open(phn_file, 'w', encoding='utf-8').write(" ".join(phonemized))
--- a/tortoise_utils.py
+++ b/tortoise_utils.py
@ -0,0 +1,73 @@
+import re
+from tortoise.api import TextToSpeech
+from tortoise.utils.audio import load_voice
+
+def clean_text(text: str, target_len: int = 200, max_len: int = 300) -> list[str]:
+    """Normalise *text* and group its sentences into chunks.
+
+    Whitespace runs are collapsed to single spaces and curly double quotes
+    are replaced by ASCII ones. The text is then split at sentence-ending
+    punctuation (never inside a double-quoted passage) and the sentences are
+    re-joined into chunks: a chunk is closed before a sentence that would push
+    it past ``target_len`` characters, and closed immediately once it has grown
+    past ``max_len``. A single sentence is never split, so one long sentence
+    can still produce a chunk longer than ``max_len``.
+
+    Returns the stripped chunks, without empty or punctuation-only entries.
+    """
+    # Normalisation passes, applied in order: blank-line runs, whitespace
+    # runs, then curly double quotes.
+    for pattern, replacement in ((r"\n\n+", r"\n"), (r"\s+", r" "), (r"[“”]", '"')):
+        text = re.sub(pattern, replacement, text)
+
+    # Split after . ! ? only where an even number of quotes remains ahead,
+    # which keeps quoted passages in one piece.
+    pieces = re.split(r'(?<=[.!?])\s+(?=(?:[^"]*"[^"]*")*[^"]*$)', text)
+
+    grouped = []
+    current = ""
+    for piece in pieces:
+        # Close the running chunk first if this sentence would overshoot it.
+        if len(current) + len(piece) > target_len:
+            grouped.append(current)
+            current = ""
+        current = f"{current}{piece} "
+        # A chunk that is already past the upper bound is closed right away.
+        if len(current) > max_len:
+            grouped.append(current)
+            current = ""
+    if current:
+        grouped.append(current)
+
+    # Trim each chunk and discard empty/useless (punctuation-only) ones.
+    result = []
+    for candidate in grouped:
+        trimmed = candidate.strip()
+        if trimmed and re.match(r"^[\s\.,;:!?]*$", trimmed) is None:
+            result.append(trimmed)
+    return result
+
+
+def process_textfile(file_path: str) -> list[str]:
+    """Read the UTF-8 text file at *file_path* and return its cleaned chunks.
+
+    The file's lines are joined with single spaces and handed to
+    ``clean_text`` with its default length settings.
+    """
+    with open(file_path, "r", encoding="utf-8") as handle:
+        raw_lines = handle.readlines()
+    return clean_text(" ".join(raw_lines))
+
+def tts(file_path: str):
+    """Synthesise the text file at *file_path* with the "Lex" voice.
+
+    The file is split into chunks with ``process_textfile``; every chunk is
+    generated separately and written to ``./audio/raw/<index>.wav``. The
+    concatenation of all chunks is written to ``./audio/raw/<name>.wav``,
+    where ``<name>`` is the base name of *file_path* without its extension.
+    All files are saved with a sample rate of 24000.
+
+    Raises:
+        ValueError: if the file yields no text chunks.
+    """
+    # Imported locally: this module's header imports none of these, so the
+    # bare names used below would otherwise raise NameError.
+    import os
+
+    import torch
+    import torchaudio
+
+    # load tts model (named `model` so it does not shadow this function)
+    # TODO: ADD PATH - the value below is only a directory; presumably it
+    # must point at a model checkpoint file before this can run.
+    model = TextToSpeech(
+        autoregressive_model_path="./ai-voice-cloning/training/"
+    )
+    voice = "Lex"
+    # NOTE(review): upstream TorToiSe documents extra_voice_dirs as a list of
+    # directories - confirm that a bare string is accepted by this fork.
+    voice_samples, conditioning_latents = load_voice(
+        voice, extra_voice_dirs="./ai-voice-cloning/voices"
+    )
+
+    # process text file
+    texts = process_textfile(file_path)
+    if not texts:
+        # torch.cat() cannot concatenate an empty list; fail with a clear message.
+        raise ValueError(f"no text chunks found in {file_path!r}")
+
+    # generate audio for each chunk of text
+    all_audio_chunks = []
+    for i, text in enumerate(texts):
+        gen = model.tts(
+            text=text,
+            voice=voice,
+            voice_samples=voice_samples,
+            conditioning_latents=conditioning_latents,
+        )
+        # Drop the leading dimension and move to the CPU once, so the chunk
+        # file and the combined file are saved from the same tensor layout.
+        audio = gen.squeeze(0).cpu()
+        torchaudio.save(f"./audio/raw/{i}.wav", audio, 24000)
+
+        all_audio_chunks.append(audio)
+
+    # name the combined file after the input file, minus directory and extension
+    book_name_ext = os.path.basename(file_path)
+    paper_name = os.path.splitext(book_name_ext)[0]
+
+    # concatenate all audio chunks along the last (sample) dimension
+    full_audio = torch.cat(all_audio_chunks, dim=-1)
+    torchaudio.save(f"./audio/raw/{paper_name}.wav", full_audio, 24000)
Author	SHA1	Message	Date
YongeBai	a1d0ea3232	updating utils	2023-10-04 16:04:08 +08:00
YongeBai	02e3a46700	adding tortoise utils	2023-09-10 23:45:45 +08:00
YongeBai	79154ac651	adding inference_utils	2023-09-09 17:46:40 +08:00
YongeBai	6bae8c6a8c	removing venv setup	2023-09-07 19:19:27 +08:00