chinese (mandarin?) support added (I guess I don't need pinyin, but tone markers are handled), korean validated, vocab adjusted
This commit is contained in:
parent
3ef8894290
commit
a6c745bafb
|
@ -129,7 +129,18 @@
|
||||||
",": 9,
|
",": 9,
|
||||||
"-": 10,
|
"-": 10,
|
||||||
".": 11,
|
".": 11,
|
||||||
"1": 12,
|
|
||||||
|
"1": 211,
|
||||||
|
"—": 10,
|
||||||
|
"“": 6,
|
||||||
|
"”": 81,
|
||||||
|
|
||||||
|
"ˇ": 6,
|
||||||
|
"ˉ": 12,
|
||||||
|
"ˊ": 79,
|
||||||
|
"ˋ": 80,
|
||||||
|
"_": 81,
|
||||||
|
|
||||||
":": 13,
|
":": 13,
|
||||||
";": 14,
|
";": 14,
|
||||||
"?": 15,
|
"?": 15,
|
||||||
|
@ -212,9 +223,7 @@
|
||||||
"̩": 76,
|
"̩": 76,
|
||||||
"θ": 77,
|
"θ": 77,
|
||||||
"ᵻ": 78,
|
"ᵻ": 78,
|
||||||
"—": 79,
|
|
||||||
"“": 80,
|
|
||||||
"”": 81,
|
|
||||||
"…": 82,
|
"…": 82,
|
||||||
"ˈɛ": 83,
|
"ˈɛ": 83,
|
||||||
"iː": 84,
|
"iː": 84,
|
||||||
|
|
|
@ -19,6 +19,20 @@ Unlike the paper, this VALL-E aims to:
|
||||||
|
|
||||||
However, at this point in time, the implementation is rather divorced from VALL-E and its derivative papers, but the core principle is still followed.
|
However, at this point in time, the implementation is rather divorced from VALL-E and its derivative papers, but the core principle is still followed.
|
||||||
|
|
||||||
|
## Model Specifications
|
||||||
|
|
||||||
|
The reference model (`ar+nar-llama-8`/`ar+nar-len-llama-8`):
|
||||||
|
* boasts 220M parameters
|
||||||
|
* supports English, German, French, and Japanese
|
||||||
|
* support for Korean and Chinese (Mandarin?) soon™
|
||||||
|
* has several modalities of inferencing:
|
||||||
|
* the primary audio level (RVQ level 0) can be inferenced both autoregressively (`AR`) or non-autoregressively (`NAR-len`)
|
||||||
|
* pure-NAR can yield faster-than-realtime output
|
||||||
|
* supports predicting the duration of an input
|
||||||
|
* supports Speech-to-Text (although it's a second-class feature)
|
||||||
|
* additional tasks such as noise reduction, speech removal, editing, and voice conversion eventually™ (just need to train on it)
|
||||||
|
* trained on `?` samples / `?` hours of EnCodec-quantized audio at 24KHz
|
||||||
|
|
||||||
## To-Do
|
## To-Do
|
||||||
|
|
||||||
* [x] train and release a serviceable model for finetuning against.
|
* [x] train and release a serviceable model for finetuning against.
|
||||||
|
|
13
docs/emb.md
13
docs/emb.md
|
@ -9,15 +9,18 @@ The `emb` name is a relic of the original implementation used.
|
||||||
This script handles taking text of a given language, and phonemizing into IPAs.
|
This script handles taking text of a given language, and phonemizing into IPAs.
|
||||||
* This is mainly an abstraction to `phonemizer`.
|
* This is mainly an abstraction to `phonemizer`.
|
||||||
|
|
||||||
For Japanese, text is coerced through `pykakasi` into kana, then phonemized, as `phonemizer` does not like kanji.
|
Some additional languages receive preprocessing/postprocessing:
|
||||||
|
* For Japanese, text is coerced through `pykakasi` into hiragana, then phonemized, as `phonemizer` does not like kanji.
|
||||||
|
* For Chinese (Mandarin), the numbered tone markers (1–5) are replaced with equivalent tone-mark symbols, to differentiate them from ordinary digits in the text.
|
||||||
|
|
||||||
By default, `espeak` is used as the backend, but other *backends* can be passed through `encode`.
|
By default:
|
||||||
|
* `espeak` is used as the backend, but other *backends* can be passed through `encode`.
|
||||||
By default, punctuation, stress markers, and stripping are enabled, but *can* be disabled.
|
* punctuation, stress markers, and stripping are enabled, but *can* be disabled.
|
||||||
|
* language for a given text is automatically deduced with `langdetect`, if language is not provided (or `auto` is passed).
|
||||||
|
|
||||||
To avoid memory leaking through `phonemizer`, backends and instances are cached for further reuse.
|
To avoid memory leaking through `phonemizer`, backends and instances are cached for further reuse.
|
||||||
|
|
||||||
The language for a given text can be automatically deduced with `langdetect` by passing `auto` as a language.
|
Unfortunately, this method has some quirks, as heteronyms are not disambiguated from context — for example, "read" and "lead" may not be phonemized with the pronunciation that rhymes with "said" when they should be.
|
||||||
|
|
||||||
### Text Tokens
|
### Text Tokens
|
||||||
|
|
||||||
|
|
|
@ -521,13 +521,19 @@ def get_lang_symmap():
|
||||||
"ja": 1,
|
"ja": 1,
|
||||||
"de": 2,
|
"de": 2,
|
||||||
"fr": 3,
|
"fr": 3,
|
||||||
|
"zh": 4, # mandarin I presume
|
||||||
|
"ko": 5,
|
||||||
}
|
}
|
||||||
|
|
||||||
def get_tone_symmap():
	"""Return the tone/emotion token mapping.

	Only "neutral" is active for now; the commented entries are candidate
	emotion tones that may be enabled later.

	Returns:
		dict[str, int]: tone name -> token id.
	"""
	# NOTE(review): removed the unreachable trailing `return symmap` that
	# followed this return — `symmap` was never defined in this scope and
	# the statement could never execute.
	return {
		"neutral": 0,
		# could use 4 instead of 8 basic emotions
		# "joy": 1,
		# "fear": 2,
		# "surprise": 3,
		# "anger": 4,
	}
||||||
|
|
||||||
def get_task_symmap():
|
def get_task_symmap():
|
||||||
return {
|
return {
|
||||||
|
|
|
@ -35,7 +35,7 @@ def _get_graphs(path):
|
||||||
return graphs
|
return graphs
|
||||||
|
|
||||||
@cache
|
@cache
|
||||||
def romanize( runes, sep="" ):
|
def coerce_to_hiragana( runes, sep="" ):
|
||||||
if pykakasi is None:
|
if pykakasi is None:
|
||||||
raise Exception('pykakasi is not installed.')
|
raise Exception('pykakasi is not installed.')
|
||||||
|
|
||||||
|
@ -43,13 +43,23 @@ def romanize( runes, sep="" ):
|
||||||
result = kks.convert( runes )
|
result = kks.convert( runes )
|
||||||
return sep.join([ res['hira'] for res in result ])
|
return sep.join([ res['hira'] for res in result ])
|
||||||
|
|
||||||
# to-do: fill out this table
# although desu the only thing that might be needed are en-uk/en-gb, es-la, pt-br, and pt-pt
def coerce_language( lang ):
	"""Map a bare two-letter language code onto the espeak dialect to use.

	Codes without an entry in the table are passed through unchanged.

	things to consider in the future:
	  en-uk or en-gb
	  es-la vs es-es
	  pt-br vs pt-pt
	"""
	dialect_table = {
		# bottle of water vs bo'oh'o'wa'er
		"en": "en-us",
		# quebec probably
		"fr": "fr-fr",
		# phonemizer/espeak used to have zh refer to mandarin, but was renamed to cmn
		# cmn outputs cringe, but not cmn-latn-pinyin
		"zh": "cmn-latn-pinyin",
	}
	return dialect_table.get( lang, lang )
|
||||||
|
|
||||||
cached_backends = {}
|
cached_backends = {}
|
||||||
|
@ -75,23 +85,45 @@ def encode(text: str, language="auto", backend="auto", punctuation=True, stress=
|
||||||
|
|
||||||
language = coerce_language( language )
|
language = coerce_language( language )
|
||||||
|
|
||||||
# Convert to kana because espeak does not like kanji...
|
#
|
||||||
if language[:2] == "ja" and backend == "auto":
|
if backend == "auto":
|
||||||
text = romanize( text )
|
# Convert to hiragana, as espeak does not like kanji
|
||||||
|
if language[:2] == "ja":
|
||||||
|
text = coerce_to_hiragana( text )
|
||||||
|
|
||||||
|
# "zh" => "cmn-latn-pinyin"
|
||||||
|
elif language == "zh":
|
||||||
|
language = "cmn-latn-pinyin"
|
||||||
|
|
||||||
|
|
||||||
if not backend or backend == "auto":
|
if not backend or backend == "auto":
|
||||||
backend = "espeak" # if language[:2] != "en" else "festival"
|
backend = "espeak" # if language[:2] != "en" else "festival"
|
||||||
|
|
||||||
backend = _get_backend(language=language, backend=backend, stress=stress, strip=strip, punctuation=punctuation)
|
backend = _get_backend(language=language, backend=backend, stress=stress, strip=strip, punctuation=punctuation)
|
||||||
if backend is not None:
|
if backend is not None:
|
||||||
tokens = backend.phonemize( [ text ], strip=strip )
|
phonemes = backend.phonemize( [ text ], strip=strip )
|
||||||
else:
|
else:
|
||||||
tokens = phonemize( [ text ], language=language, strip=strip, preserve_punctuation=punctuation, with_stress=stress )
|
phonemes = phonemize( [ text ], language=language, strip=strip, preserve_punctuation=punctuation, with_stress=stress )
|
||||||
|
|
||||||
if not len(tokens):
|
if not len(phonemes):
|
||||||
raise Exception(f"Failed to phonemize, received empty string: {text}")
|
raise Exception(f"Failed to phonemize, received empty string: {text}")
|
||||||
|
|
||||||
return tokens[0]
|
phonemes = phonemes[0]
|
||||||
|
|
||||||
|
# remap tones
|
||||||
|
# technically they can be kept in place and just update the tokenizer, but this would be a bit confusing
|
||||||
|
if language == "cmn-latn-pinyin":
|
||||||
|
tones = {
|
||||||
|
"1": "ˇ",
|
||||||
|
"2": "ˉ",
|
||||||
|
"3": "ˊ",
|
||||||
|
"4": "ˋ",
|
||||||
|
"5": "_",
|
||||||
|
}
|
||||||
|
for k, v in tones.items():
|
||||||
|
phonemes = phonemes.replace(k, v)
|
||||||
|
|
||||||
|
return phonemes
|
||||||
|
|
||||||
# Helper function to debug phonemizer
|
# Helper function to debug phonemizer
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
Loading…
Reference in New Issue
Block a user