2023-08-02 21:53:35 +00:00
|
|
|
|
import argparse
|
|
|
|
|
import random
|
|
|
|
|
import string
|
|
|
|
|
import torch
|
|
|
|
|
|
|
|
|
|
from functools import cache
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
from phonemizer import phonemize
|
|
|
|
|
from phonemizer.backend import BACKENDS
|
|
|
|
|
|
|
|
|
|
from tqdm import tqdm
|
|
|
|
|
|
2024-07-22 04:21:37 +00:00
|
|
|
|
# pykakasi is an optional dependency (used by coerce_to_hiragana).
# The name stays bound to None when the import fails, so callers can test for it.
pykakasi = None
try:
    import pykakasi
except Exception as e:
    print(f'Error while importing pykakasi: {str(e)}')
|
|
|
|
|
|
2024-12-08 04:34:25 +00:00
|
|
|
|
# langdetect is an optional dependency (used by detect_language).
# The name stays bound to None when the import fails, so callers can test for it.
langdetect = None
try:
    import langdetect
except Exception as e:
    print(f'Error while importing langdetect: {str(e)}')
|
|
|
|
|
|
|
|
|
|
def detect_language(text):
    """Guess the language of ``text`` with the optional ``langdetect`` package.

    Returns:
        Whatever ``langdetect.detect`` returns for ``text``.

    Raises:
        Exception: if ``langdetect`` failed to import when the module loaded.
    """
    if langdetect is not None:
        return langdetect.detect(text)

    raise Exception('langdetect is not installed.')
|
|
|
|
|
|
2023-08-02 21:53:35 +00:00
|
|
|
|
def _get_graphs(path):
|
|
|
|
|
with open(path, "r") as f:
|
|
|
|
|
graphs = f.read()
|
|
|
|
|
return graphs
|
|
|
|
|
|
2024-12-09 20:26:19 +00:00
|
|
|
|
@cache
def _get_kakasi():
    """Build the pykakasi converter once and hand back the same instance afterwards."""
    return pykakasi.kakasi()


def coerce_to_hiragana(runes, sep=""):
    """Convert ``runes`` to hiragana using pykakasi.

    Args:
        runes: text to convert.
        sep: string placed between the converted chunks pykakasi returns.

    Returns:
        The ``'hira'`` field of every converted chunk, joined with ``sep``.

    Raises:
        Exception: if ``pykakasi`` failed to import when the module loaded.
    """
    if pykakasi is None:
        raise Exception('pykakasi is not installed.')

    # Reuse one converter instead of constructing a new one on every call.
    converted = _get_kakasi().convert(runes)
    return sep.join(chunk['hira'] for chunk in converted)
|
|
|
|
|
|
2024-09-13 17:53:36 +00:00
|
|
|
|
def coerce_language(lang):
    """Map a bare language code to the specific voice code used for phonemizing.

    * ``"en"`` becomes ``"en-us"``
    * ``"fr"`` becomes ``"fr-fr"``
    * anything starting with ``"zh"`` becomes ``"cmn-latn-pinyin"``
    * every other value is returned unchanged
    """
    # Things to consider in the future:
    #   en-uk or en-gb
    #   es-la vs es-es
    #   pt-br vs pt-pt

    # Plain English defaults to the American voice.
    if lang == "en":
        return "en-us"

    # Plain French is pinned to fr-fr.
    if lang == "fr":
        return "fr-fr"

    # phonemizer/espeak used to have "zh" refer to Mandarin, but it was renamed to "cmn".
    # "cmn" gives poor output while "cmn-latn-pinyin" does not, so every "zh*"
    # dialect code is folded into the latter.
    if lang[:2] == "zh":
        return "cmn-latn-pinyin"

    return lang
|
|
|
|
|
|
2023-08-02 21:53:35 +00:00
|
|
|
|
# Phonemizer backend instances that have already been constructed.
# Keys are (language, backend name, constructor options) tuples.
cached_backends = {}


def _get_backend(language="en-us", backend="espeak", punctuation=True, stress=True, strip=True):
    """Return a phonemizer backend instance, constructing it on first use.

    Args:
        language: language code handed to the backend constructor.
        backend: key into phonemizer's ``BACKENDS`` table.
        punctuation: passed as ``preserve_punctuation`` (ignored for ``espeak-mbrola``).
        stress: passed as ``with_stress`` (only used for ``espeak``).
        strip: accepted for call compatibility; it is not used when
            constructing the backend.

    Returns:
        The cached or newly constructed backend instance.

    Raises:
        KeyError: if ``backend`` is not a key of ``BACKENDS``.
    """
    # Only pass the options each backend constructor is given here.
    if backend == 'espeak':
        options = {"preserve_punctuation": punctuation, "with_stress": stress}
    elif backend == 'espeak-mbrola':
        options = {}
    else:
        options = {"preserve_punctuation": punctuation}

    # The key has to cover every option that shapes the instance; keying on
    # language and backend alone would hand back an instance built with
    # different punctuation/stress settings than the caller asked for.
    key = (language, backend, tuple(sorted(options.items())))

    if key not in cached_backends:
        cached_backends[key] = BACKENDS[backend](language, **options)

    return cached_backends[key]
|
|
|
|
|
|
|
|
|
|
|
2024-12-08 04:34:25 +00:00
|
|
|
|
def encode(text: str, language="auto", backend="auto", punctuation=True, stress=True, strip=True) -> str:
    """Phonemize ``text`` and return the resulting phoneme string.

    Args:
        text: the text to phonemize.
        language: language code, or ``"auto"`` to detect it from ``text``.
            The code is normalised through ``coerce_language``.
        backend: phonemizer backend name; ``"auto"`` or a falsy value selects
            ``"espeak"``. With ``"auto"``, Japanese text is converted to
            hiragana first.
        punctuation: keep punctuation in the output.
        stress: keep stress marks in the output.
        strip: forwarded to the backend's ``phonemize`` call.

    Returns:
        The phoneme string. For ``"cmn-latn-pinyin"`` the tone digits 1-5 are
        replaced with marker symbols.

    Raises:
        Exception: if the phonemizer returns nothing or an empty string.
    """
    if language == "auto":
        language = detect_language(text)

    # coerce_language folds every "zh*" code into "cmn-latn-pinyin", so no
    # separate Mandarin remapping is needed afterwards.
    language = coerce_language(language)

    # Convert to hiragana, as espeak does not like kanji.
    if backend == "auto" and language[:2] == "ja":
        text = coerce_to_hiragana(text)

    if not backend or backend == "auto":
        backend = "espeak"  # if language[:2] != "en" else "festival"

    instance = _get_backend(language=language, backend=backend, stress=stress, strip=strip, punctuation=punctuation)
    if instance is not None:
        results = instance.phonemize([text], strip=strip)
    else:
        # Fallback kept in case no backend instance is available.
        results = phonemize([text], language=language, strip=strip, preserve_punctuation=punctuation, with_stress=stress)

    # One input string yields a one-element list, so the element itself has to
    # be checked; testing only the list length would let an empty result through.
    if not results or not results[0]:
        raise Exception(f"Failed to phonemize, received empty string: {text}")

    phonemes = results[0]

    # Remap tones.
    # Technically they can be kept in place and just update the tokenizer, but
    # this would be a bit confusing.
    if language == "cmn-latn-pinyin":
        # NOTE(review): presumably the tokenizer depends on these exact
        # symbols, so the mapping is kept as-is - confirm before changing.
        tones = str.maketrans({
            "1": "ˇ",
            "2": "ˉ",
            "3": "ˊ",
            "4": "ˋ",
            "5": "_",
        })
        phonemes = phonemes.translate(tones)

    return phonemes
|
2024-08-07 01:23:33 +00:00
|
|
|
|
|
|
|
|
|
# Command-line helper for debugging the phonemizer by hand.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # The text to phonemize.
    parser.add_argument("string", type=str)

    # Options that take a value, with their defaults.
    for option, fallback in (("--language", "en-us"), ("--backend", "auto")):
        parser.add_argument(option, type=str, default=fallback)

    # Switches that turn off a feature which is on by default.
    for switch in ("--no-punctuation", "--no-stress", "--no-strip"):
        parser.add_argument(switch, action="store_true")

    args = parser.parse_args()

    phonemes = encode(
        args.string,
        language=args.language,
        backend=args.backend,
        punctuation=not args.no_punctuation,
        stress=not args.no_stress,
        strip=not args.no_strip,
    )
    print(phonemes)
|