diff --git a/data/tokenizer.json b/data/tokenizer.json
index ad1f0cc..3370336 100644
--- a/data/tokenizer.json
+++ b/data/tokenizer.json
@@ -129,7 +129,18 @@
       ",": 9,
       "-": 10,
       ".": 11,
-      "1": 12,
+
+      "1": 211,
+      "—": 10,
+      "“": 6,
+      "”": 81,
+
+      "ˇ": 6,
+      "ˉ": 12,
+      "ˊ": 79,
+      "ˋ": 80,
+      "_": 81,
+
       ":": 13,
       ";": 14,
       "?": 15,
@@ -212,9 +223,7 @@
       "̩": 76,
       "θ": 77,
       "ᵻ": 78,
-      "—": 79,
-      "“": 80,
-      "”": 81,
+
       "…": 82,
       "ˈɛ": 83,
       "iː": 84,
diff --git a/docs/README.md b/docs/README.md
index 5a7331f..2387ca9 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -19,6 +19,20 @@ Unlike the paper, this VALL-E aims to:
 
 However, at this point and time, the implementation is rather divorced from VALL-E and its derivating papers, but the core principle is still followed.
 
+## Model Specifications
+
+The reference model (`ar+nar-llama-8`/`ar+nar-len-llama-8`):
+* boasts 220M parameters
+* supports English, German, French, and Japanese
+  * support for Korean and Chinese (Mandarin?) soon™
+* has several inference modalities:
+  * the primary audio level (RVQ level 0) can be inferenced either autoregressively (`AR`) or non-autoregressively (`NAR-len`)
+    * pure-NAR can yield faster-than-realtime output
+  * supports predicting the duration of an input
+  * supports Speech-to-Text (although it's a second-class feature)
+  * additional tasks such as noise reduction, speech removal, editing, and voice conversion eventually™ (just need to train on it)
+* trained on `?` samples / `?` hours of EnCodec-quantized audio at 24kHz
+
 ## To-Do
 
 * [x] train and release a serviceable model for finetuning against.
diff --git a/docs/emb.md b/docs/emb.md
index 1a88e13..a82ef25 100644
--- a/docs/emb.md
+++ b/docs/emb.md
@@ -9,15 +9,18 @@ The `emb` name is a relic of the original implementation used.
 
 This script handles taking text of a given language, and phonemizing into IPAs.
 * This is mainly an abstraction to `phonemizer`.
 
-For Japanese, text is coerced through `pykakasi` into kana, then phonemized, as `phonemizer` does not like kanji.
+Some additional languages receive preprocessing/postprocessing:
+* For Japanese, text is coerced through `pykakasi` into hiragana, then phonemized, as `phonemizer` does not like kanji.
+* For Chinese (Mandarin), the numeric tone markers emitted by the phonemizer are replaced with equivalent non-numeric tone markers, so they are not confused with actual numbers.
 
-By default, `espeak` is used as the backend, but other *backends* can be passed through `encode`.
-
-By default, punctuation, stress markers, and stripping are enabled by default, but *can* be disabled.
+By default:
+* `espeak` is used as the backend, but other *backends* can be passed through `encode`.
+* punctuation, stress markers, and stripping are enabled, but *can* be disabled.
+* the language for a given text is automatically deduced with `langdetect`, if a language is not provided (or `auto` is passed).
 
 To avoid memory leaking through `phonemizer`, backends and instances are cached for further reuse.
 
-The language for a given text can be automatically deduced with `langdetect` by passing `auto` as a language.
+Unfortunately, there are some quirks with this method, as heteronyms like "read" and "lead" are pronounced contextually; for example, the past tense "read" rhymes with "said", but the phonemizer cannot infer that from spelling alone.
 
 ### Text Tokens
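As an aside, here is a minimal sketch of the preprocessing path that the `docs/emb.md` changes above describe, assuming `pykakasi` and `langdetect` are installed. The helper `to_hiragana` is a hypothetical stand-in for the repo's own helper (renamed to `coerce_to_hiragana` in the `vall_e/emb/g2p.py` diff below), not the library's actual API:

```python
# Minimal sketch of the preprocessing described in docs/emb.md above.
# Assumes `pykakasi` and `langdetect` are installed; `to_hiragana` is a
# hypothetical stand-in for the repo's own helper.
import pykakasi
from langdetect import detect

def to_hiragana(text: str) -> str:
    # pykakasi converts kanji/katakana into their hiragana readings
    kks = pykakasi.kakasi()
    return "".join(res["hira"] for res in kks.convert(text))

text = "音声合成は楽しいです"
lang = detect(text)           # deduced language, e.g. "ja"
if lang == "ja":
    text = to_hiragana(text)  # espeak does not like kanji, so feed it kana
```

Note that the function always returned the `hira` readings, never romaji, so the rename from `romanize()` to `coerce_to_hiragana()` below simply makes the name match the behavior.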
diff --git a/vall_e/data.py b/vall_e/data.py
index 2995723..1ff2610 100755
--- a/vall_e/data.py
+++ b/vall_e/data.py
@@ -521,13 +521,19 @@ def get_lang_symmap():
 		"ja": 1,
 		"de": 2,
 		"fr": 3,
+		"zh": 4, # mandarin I presume
+		"ko": 5,
 	}
 
 def get_tone_symmap():
 	return {
 		"neutral": 0,
+		# could use 4 instead of 8 basic emotions
+		# "joy": 1,
+		# "fear": 2,
+		# "surprise": 3,
+		# "anger": 4,
 	}
-	return symmap
 
 def get_task_symmap():
 	return {
diff --git a/vall_e/emb/g2p.py b/vall_e/emb/g2p.py
index 568da26..7e1f5cb 100755
--- a/vall_e/emb/g2p.py
+++ b/vall_e/emb/g2p.py
@@ -35,7 +35,7 @@ def _get_graphs(path):
 	return graphs
 
 @cache
-def romanize( runes, sep="" ):
+def coerce_to_hiragana( runes, sep="" ):
 	if pykakasi is None:
 		raise Exception('pykakasi is not installed.')
 
@@ -43,13 +43,23 @@
 	result = kks.convert( runes )
 	return sep.join([ res['hira'] for res in result ])
 
-# to-do: fill out this table
-# although desu the only thing that might be needed are en-uk/en-gb, es-la, pt-br, and pt-pt
 def coerce_language( lang ):
+	# bottle of water vs bo'oh'o'wa'er
 	if lang == "en":
 		lang = "en-us"
+	# quebec probably
 	if lang == "fr":
 		return "fr-fr"
+	# phonemizer/espeak used to have zh refer to mandarin, but was renamed to cmn
+	# cmn outputs cringe, but not cmn-latn-pinyin
+	if lang == "zh":
+		return "cmn-latn-pinyin"
+	"""
+	things to consider in the future
+		en-uk or en-gb
+		es-la vs es-es
+		pt-br vs pt-pt
+	"""
 	return lang
 
 cached_backends = {}
@@ -75,23 +85,45 @@ def encode(text: str, language="auto", backend="auto", punctuation=True, stress=
 
 	language = coerce_language( language )
 
-	# Convert to kana because espeak does not like kanji...
-	if language[:2] == "ja" and backend == "auto":
-		text = romanize( text )
+	#
+	if backend == "auto":
+		# Convert to hiragana, as espeak does not like kanji
+		if language[:2] == "ja":
+			text = coerce_to_hiragana( text )
+
+		# "zh" => "cmn-latn-pinyin"
+		elif language == "zh":
+			language = "cmn-latn-pinyin"
+
 
 	if not backend or backend == "auto":
 		backend = "espeak" # if language[:2] != "en" else "festival"
 
 	backend = _get_backend(language=language, backend=backend, stress=stress, strip=strip, punctuation=punctuation)
 	if backend is not None:
-		tokens = backend.phonemize( [ text ], strip=strip )
+		phonemes = backend.phonemize( [ text ], strip=strip )
 	else:
-		tokens = phonemize( [ text ], language=language, strip=strip, preserve_punctuation=punctuation, with_stress=stress )
+		phonemes = phonemize( [ text ], language=language, strip=strip, preserve_punctuation=punctuation, with_stress=stress )
 
-	if not len(tokens):
+	if not len(phonemes):
 		raise Exception(f"Failed to phonemize, received empty string: {text}")
 
-	return tokens[0]
+	phonemes = phonemes[0]
+
+	# remap tones
+	# technically they can be kept in place and just update the tokenizer, but this would be a bit confusing
+	if language == "cmn-latn-pinyin":
+		tones = {
+			"1": "ˇ",
+			"2": "ˉ",
+			"3": "ˊ",
+			"4": "ˋ",
+			"5": "_",
+		}
+		for k, v in tones.items():
+			phonemes = phonemes.replace(k, v)
+
+	return phonemes
 
 # Helper function to debug phonemizer
 if __name__ == "__main__":
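For reference, a hedged usage sketch of the Mandarin path introduced in the `encode()` changes above; the exact phoneme string depends on the installed espeak-ng voice, so the commented output is illustrative only:

```python
# Usage sketch of the Mandarin handling introduced in this diff; the output
# noted in comments is illustrative, as exact phonemes vary by espeak-ng build.
from vall_e.emb.g2p import coerce_language, encode

# "zh" is coerced to espeak's "cmn-latn-pinyin" voice...
assert coerce_language("zh") == "cmn-latn-pinyin"

# ...whose numeric tone markers (1-5) are then remapped to ˇ ˉ ˊ ˋ _,
# matching the new entries added to data/tokenizer.json, so tone marks
# can never collide with actual digit tokens.
phonemes = encode("你好", language="zh")
print(phonemes)  # tones appear as e.g. "ˊ" instead of "3"
```

Remapping at the phoneme-string level (rather than keeping "1"-"5" and special-casing them in the tokenizer) keeps digits unambiguous in the vocabulary, at the cost of a small post-processing pass per utterance.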