chinese (mandarin?) support added (I guess I don't need pinyin, but tone markers are handled), korean validated, vocab adjusted
parent 3ef8894290
commit a6c745bafb
@@ -129,7 +129,18 @@
     ",": 9,
     "-": 10,
     ".": 11,
-    "1": 12,
+    "1": 211,
+    "—": 10,
+    "“": 6,
+    "”": 81,
+    "ˇ": 6,
+    "ˉ": 12,
+    "ˊ": 79,
+    "ˋ": 80,
+    "_": 81,
     ":": 13,
     ";": 14,
     "?": 15,
@@ -212,9 +223,7 @@
     "̩": 76,
     "θ": 77,
     "ᵻ": 78,
-    "—": 79,
-    "“": 80,
-    "”": 81,
     "…": 82,
     "ˈɛ": 83,
     "iː": 84,
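A small sanity check on the vocab changes above (illustrative only; the dict literal just restates the diff, assuming the `+`/`-` assignment reconstructed here is right): several of the new tone and punctuation symbols are mapped onto existing ids rather than given fresh ones.

```python
# entries copied from the diff above; note the deliberately shared ids
vocab = {
    ",": 9, "-": 10, ".": 11, "1": 211,
    "—": 10, "“": 6, "”": 81,
    "ˇ": 6, "ˉ": 12, "ˊ": 79, "ˋ": 80, "_": 81,
}
assert vocab["—"] == vocab["-"] == 10   # em-dash shares an id with hyphen
assert vocab["_"] == vocab["”"] == 81   # neutral-tone marker shares an id too
```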
@@ -19,6 +19,20 @@ Unlike the paper, this VALL-E aims to:
 
 However, at this point in time, the implementation is rather divorced from VALL-E and its derivative papers, but the core principle is still followed.
 
+## Model Specifications
+
+The reference model (`ar+nar-llama-8`/`ar+nar-len-llama-8`):
+* boasts 220M parameters
+* supports English, German, French, and Japanese
+* support for Korean and Chinese (Mandarin?) soon™
+* has several modalities of inferencing (sketched below):
+  * the primary audio level (RVQ level 0) can be inferenced either autoregressively (`AR`) or non-autoregressively (`NAR-len`)
+  * pure-NAR inferencing can yield faster-than-realtime output
+* supports predicting the duration of an input
+* supports Speech-to-Text (although it's a second-class feature)
+* additional tasks such as noise reduction, speech removal, editing, and voice conversion eventually™ (they just need to be trained on)
+* trained on `?` samples / `?` hours of EnCodec-quantized audio at 24kHz
+
 ## To-Do
 
 * [x] train and release a serviceable model for finetuning against.
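The two inferencing modalities called out above can be pictured with a minimal sketch. This is hypothetical, not this repo's API: `model` is assumed to return logits of shape `[batch, seq_len, vocab]`, and the `stop_token`/`mask_token` ids and the unmasking schedule are stand-ins.

```python
import torch

def decode_ar(model, prompt, max_steps=1500, stop_token=1024):
    """AR: predict RVQ level 0 one token at a time."""
    tokens = list(prompt)
    for _ in range(max_steps):
        logits = model(torch.tensor([tokens]))
        token = int(logits[0, -1].argmax())
        if token == stop_token:  # assumed stop id
            break
        tokens.append(token)
    return tokens

def decode_nar_len(model, duration, steps=8, mask_token=1025):
    """NAR-len: start fully masked, commit the most confident tokens each step."""
    seq = torch.full((1, duration), mask_token)
    for step in range(1, steps + 1):
        logits = model(seq)                      # predict every position at once
        conf, cand = logits.softmax(-1).max(-1)  # per-position confidence
        k = duration * step // steps             # unmask progressively more tokens
        keep = conf[0].argsort(descending=True)[:k]
        seq[0, keep] = cand[0, keep]
    return seq[0].tolist()
```

The `NAR-len` path is what enables faster-than-realtime output: every one of the `duration` positions is predicted in parallel at each of a fixed, small number of steps, rather than one token per step.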
docs/emb.md
@@ -9,15 +9,18 @@ The `emb` name is a relic of the original implementation used.
 
 This script handles taking text of a given language and phonemizing it into IPA.
 * This is mainly an abstraction over `phonemizer`.
 
-For Japanese, text is coerced through `pykakasi` into kana, then phonemized, as `phonemizer` does not like kanji.
+Some additional languages receive preprocessing/postprocessing:
+* For Japanese, text is coerced through `pykakasi` into hiragana, then phonemized, as `phonemizer` does not like kanji.
+* For Chinese (Mandarin), numbered pinyin tones are replaced with equivalent tone-marker symbols, to differentiate them from actual numbers in the text.
 
-By default, `espeak` is used as the backend, but other *backends* can be passed through `encode`.
-By default, punctuation, stress markers, and stripping are enabled by default, but *can* be disabled.
+By default:
+* `espeak` is used as the backend, but other *backends* can be passed through `encode`.
+* punctuation, stress markers, and stripping are enabled, but *can* be disabled.
+* the language for a given text is automatically deduced with `langdetect`, if a language is not provided (or `auto` is passed).
 
 To avoid memory leaking through `phonemizer`, backends and instances are cached for further reuse.
 
-The language for a given text can be automatically deduced with `langdetect` by passing `auto` as a language.
 Unfortunately, there are some quirks with this method, as context determines whether words like "read" and "lead" rhyme with "said".
 
 ### Text Tokens
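The backend caching mentioned in the docs above can be illustrated with a minimal sketch. The repo uses its own `cached_backends` dict (visible later in this diff) rather than `lru_cache`; this is just the idea, under that assumption:

```python
from functools import lru_cache
from phonemizer.backend import EspeakBackend

@lru_cache(maxsize=None)
def get_backend(language="en-us", stress=True, punctuation=True):
    # one EspeakBackend instance per unique settings tuple, reused across calls,
    # so repeated encode() calls don't leak fresh phonemizer instances
    return EspeakBackend(language, preserve_punctuation=punctuation, with_stress=stress)
```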
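Likewise, the `auto` language deduction can be sketched as follows; `deduce_language` is a hypothetical helper, not a function from this repository:

```python
from langdetect import detect  # pip install langdetect

def deduce_language(text: str, language: str = "auto") -> str:
    # fall back to detection only when no explicit language is given
    if language and language != "auto":
        return language
    return detect(text)

print(deduce_language("こんにちは、世界"))        # likely "ja"
print(deduce_language("hello world", "en"))  # "en", detection skipped
```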
@@ -521,13 +521,19 @@ def get_lang_symmap():
         "ja": 1,
         "de": 2,
         "fr": 3,
+        "zh": 4, # mandarin I presume
+        "ko": 5,
     }
 
 def get_tone_symmap():
     return {
         "neutral": 0,
+        # could use 4 instead of 8 basic emotions
+        # "joy": 1,
+        # "fear": 2,
+        # "surprise": 3,
+        # "anger": 4,
     }
-    return symmap
 
 def get_task_symmap():
     return {
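A hypothetical sketch of how such a symmap might be consumed when embedding the language token; `get_lang_id` and the `"en": 0` entry are assumptions inferred from the visible entries, not code from this repository:

```python
def get_lang_id(lang: str) -> int:
    # "en": 0 is assumed from the visible entries (ja=1, de=2, fr=3, zh=4, ko=5)
    symmap = {"en": 0, "ja": 1, "de": 2, "fr": 3, "zh": 4, "ko": 5}
    # normalize regional variants ("en-us" -> "en") before lookup
    return symmap.get(lang.split("-")[0], 0)

assert get_lang_id("zh") == 4
assert get_lang_id("fr-fr") == 3
```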
@@ -35,7 +35,7 @@ def _get_graphs(path):
     return graphs
 
 @cache
-def romanize( runes, sep="" ):
+def coerce_to_hiragana( runes, sep="" ):
     if pykakasi is None:
         raise Exception('pykakasi is not installed.')
 
@@ -43,13 +43,23 @@ def romanize( runes, sep="" ):
     result = kks.convert( runes )
     return sep.join([ res['hira'] for res in result ])
 
-# to-do: fill out this table
-# although desu the only thing that might be needed are en-uk/en-gb, es-la, pt-br, and pt-pt
 def coerce_language( lang ):
+    # bottle of water vs bo'oh'o'wa'er
     if lang == "en":
         lang = "en-us"
+    # quebec probably
     if lang == "fr":
         return "fr-fr"
+    # phonemizer/espeak used to have zh refer to mandarin, but it was renamed to cmn
+    # cmn outputs cringe, but cmn-latn-pinyin does not
+    if lang == "zh":
+        return "cmn-latn-pinyin"
+    """
+    things to consider in the future
+    en-uk or en-gb
+    es-la vs es-es
+    pt-br vs pt-pt
+    """
     return lang
 
 cached_backends = {}
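As a quick illustration of the mappings above (these asserts just restate the diff's behavior):

```python
assert coerce_language("en") == "en-us"            # assigned, then falls through to `return lang`
assert coerce_language("fr") == "fr-fr"
assert coerce_language("zh") == "cmn-latn-pinyin"
assert coerce_language("de") == "de"               # unmapped languages pass through untouched
```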
@@ -75,23 +85,45 @@ def encode(text: str, language="auto", backend="auto", punctuation=True, stress=
 
     language = coerce_language( language )
 
-    # Convert to kana because espeak does not like kanji...
-    if language[:2] == "ja" and backend == "auto":
-        text = romanize( text )
-    #
+    if backend == "auto":
+        # Convert to hiragana, as espeak does not like kanji
+        if language[:2] == "ja":
+            text = coerce_to_hiragana( text )
+
+        # "zh" => "cmn-latn-pinyin"
+        elif language == "zh":
+            language = "cmn-latn-pinyin"
 
     if not backend or backend == "auto":
         backend = "espeak" # if language[:2] != "en" else "festival"
 
     backend = _get_backend(language=language, backend=backend, stress=stress, strip=strip, punctuation=punctuation)
     if backend is not None:
-        tokens = backend.phonemize( [ text ], strip=strip )
+        phonemes = backend.phonemize( [ text ], strip=strip )
     else:
-        tokens = phonemize( [ text ], language=language, strip=strip, preserve_punctuation=punctuation, with_stress=stress )
+        phonemes = phonemize( [ text ], language=language, strip=strip, preserve_punctuation=punctuation, with_stress=stress )
 
-    if not len(tokens):
+    if not len(phonemes):
         raise Exception(f"Failed to phonemize, received empty string: {text}")
 
-    return tokens[0]
+    phonemes = phonemes[0]
+
+    # remap tones
+    # technically they could be kept as numbers and the tokenizer updated instead, but that would be a bit confusing
+    if language == "cmn-latn-pinyin":
+        tones = {
+            "1": "ˇ",
+            "2": "ˉ",
+            "3": "ˊ",
+            "4": "ˋ",
+            "5": "_",
+        }
+        for k, v in tones.items():
+            phonemes = phonemes.replace(k, v)
+
+    return phonemes
 
 # Helper function to debug phonemizer
 if __name__ == "__main__":
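To tie the docs and code together, a hedged walkthrough of the new Mandarin path. The intermediate string is a placeholder, not actual espeak `cmn-latn-pinyin` output, which will differ:

```python
# Hypothetical walkthrough, illustration only:
# 1. encode("你好", language="zh") coerces "zh" -> "cmn-latn-pinyin"
# 2. espeak emits pinyin-style phonemes with numeric tones, e.g. "ni3 hau3"
# 3. the tone remap swaps numbers for marker glyphs:
tones = {"1": "ˇ", "2": "ˉ", "3": "ˊ", "4": "ˋ", "5": "_"}
phonemes = "ni3 hau3"  # placeholder string, for illustration only
for num, mark in tones.items():
    phonemes = phonemes.replace(num, mark)
print(phonemes)  # "niˊ hauˊ"
```

The marker glyphs this produces (`ˇ ˉ ˊ ˋ _`) are exactly the tokens added to the vocab in the first hunk of this commit.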