some insanity for sanity checks (some phonemes from phonemizing japanese are not in my tokenizer...)
parent ad024f400f
commit 491ae2a684
@@ -115,6 +115,9 @@
   "</eos>": 2,
   "<mask>": 3,
   " ": 4,
+  "ᵝ": 0,
   "!": 5,
   "\"": 6,
   "(": 7,
@@ -127,6 +130,7 @@
   ";": 14,
   "?": 15,
   "a": 16,
+  "ä": 16,
   "b": 17,
   "c": 18,
   "d": 19,
@@ -134,20 +138,27 @@
   "f": 21,
   "h": 22,
   "i": 23,
+  "ĩ": 23,
   "j": 24,
   "k": 25,
   "l": 26,
   "m": 27,
   "n": 28,
+  "ɴ": 28,
   "o": 29,
+  "̞": 29,
   "p": 30,
+  "ɸ": 30,
   "q": 31,
   "r": 32,
+  "ɽ": 32,
   "s": 33,
   "t": 34,
   "u": 35,
+  "ũ": 35,
   "v": 36,
   "w": 37,
+  "ʍ": 37,
   "x": 38,
   "z": 39,
   "¡": 40,
@@ -184,6 +195,7 @@
   "ʲ": 71,
   "ˈ": 72,
   "ˌ": 73,
+  "ˌ": 73,
   "ː": 74,
   "̃": 75,
   "̩": 76,
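The vocab hunks above alias the Japanese-specific IPA variants that phonemizing Japanese produces (ä, ĩ, ɴ, ɸ, ɽ, ũ, ʍ, and the lowering diacritic) to the IDs of their nearest existing symbols, while ᵝ is pinned to ID 0. A minimal sketch of the effect, assuming the tokenizer resolves one phoneme symbol at a time through this flat JSON map (the helper below is hypothetical, not the repo's API):

    import json

    # a slice of the vocab after this commit: aliases share the base symbol's ID
    vocab = json.loads('{"ᵝ": 0, "a": 16, "ä": 16, "n": 28, "ɴ": 28, "u": 35, "ũ": 35}')

    def encode_phonemes(phonemes: list[str], unk_id: int = 0) -> list[int]:
        # unknown symbols fall back to unk_id instead of surfacing as <unk>
        return [ vocab.get(p, unk_id) for p in phonemes ]

    print(encode_phonemes(["ũ", "ɴ"]))  # [35, 28] — same IDs as "u" and "n"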
@@ -1505,7 +1505,7 @@ if __name__ == "__main__":
         samples = {
             "training": [ next(iter(train_dl)), next(iter(train_dl)) ],
             "evaluation": [ next(iter(subtrain_dl)), next(iter(subtrain_dl)) ],
-            "validation": [ next(iter(val_dl)), next(iter(val_dl)) ],
+            #"validation": [ next(iter(val_dl)), next(iter(val_dl)) ],
         }

         Path("./data/sample-test/").mkdir(parents=True, exist_ok=True)
@@ -1529,6 +1529,30 @@ if __name__ == "__main__":
         for k, v in samples.items():
             for i in range(len(v)):
                 print(f'{k}[{i}]:', v[i])
+    elif args.action == "validate":
+        train_dl, subtrain_dl, val_dl = create_train_val_dataloader()
+
+        missing = set()
+
+        for i in range(len( train_dl.dataset )):
+            batch = train_dl.dataset[i]
+
+            text = batch['text']
+            phonemes = batch['metadata']['phonemes']
+
+            # decode each token ID back to its symbol, skipping <bos>/<eos>
+            decoded = [ cfg.tokenizer.decode(token) for token in text[1:-1] ]
+            for j, token in enumerate(decoded):
+                if token != "<unk>":
+                    continue
+
+                phone = phonemes[j]
+
+                print( batch['text'], batch['metadata']['phonemes'] )
+
+                missing.add( phone )
+
+        print( "Missing tokens:", missing )
+
     elif args.action == "tasks":
         index = 0
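The new validate action is essentially an <unk> audit over the training set. The same check can be expressed as a small pure function; a sketch assuming phoneme sequences align one-to-one with a flat phoneme→ID vocab like the JSON above (names here are illustrative, not the repo's):

    def find_missing(vocab: dict[str, int], samples: list[list[str]]) -> set[str]:
        # collect every phoneme with no vocab entry, i.e. one that would encode as <unk>
        missing = set()
        for phonemes in samples:
            missing |= { p for p in phonemes if p not in vocab }
        return missing

    # before this commit, Japanese samples would report symbols like ɴ:
    print(find_missing({"a": 16, "n": 28}, [["a", "ɴ"]]))  # {'ɴ'}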
@@ -27,23 +27,23 @@ def romanize( runes, sep="" ):
     return sep.join([ res['hira'] for res in result ])

 cached_backends = {}
-def _get_backend( language="en-us", backend="espeak" ):
+def _get_backend( language="en-us", backend="espeak", punctuation=True, stress=True, strip=True ):
     key = f'{language}_{backend}'
     if key in cached_backends:
         return cached_backends[key]

     if backend == 'espeak':
-        phonemizer = BACKENDS[backend]( language, preserve_punctuation=True, with_stress=True)
+        phonemizer = BACKENDS[backend]( language, preserve_punctuation=punctuation, with_stress=stress)
     elif backend == 'espeak-mbrola':
         phonemizer = BACKENDS[backend]( language )
     else:
-        phonemizer = BACKENDS[backend]( language, preserve_punctuation=True )
+        phonemizer = BACKENDS[backend]( language, preserve_punctuation=punctuation )

     cached_backends[key] = phonemizer
     return phonemizer


-def encode(text: str, language="en-us", backend="auto") -> list[str]:
+def encode(text: str, language="en-us", backend="auto", punctuation=True, stress=True, strip=True) -> list[str]:
     if language == "en":
         language = "en-us"
@@ -56,13 +56,15 @@ def encode(text: str, language="en-us", backend="auto") -> list[str]:
         text = [ text ]

     backend = _get_backend(language=language, backend=backend, stress=stress, strip=strip, punctuation=punctuation)
     if backend is not None:
-        tokens = backend.phonemize( text, strip=True )
+        tokens = backend.phonemize( text, strip=strip )
     else:
-        tokens = phonemize( text, language=language, strip=True, preserve_punctuation=True, with_stress=True )
+        tokens = phonemize( text, language=language, strip=strip, preserve_punctuation=punctuation, with_stress=stress )

-    tokens = list(tokens[0])
+    # phonemization can return an empty list, so guard the [0] before indexing
+    if not len(tokens):
+        tokens = []
+    else:
+        tokens = list(tokens[0])

     return tokens
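With these changes, callers can disable punctuation preservation, stress marks, and stripping per call instead of having them hard-coded. Illustrative call sites (the Japanese language code is an assumption; only encode and its parameters come from the diff above):

    # default behaviour is unchanged
    tokens = encode("Hello, world.", language="en-us", backend="espeak")

    # phonemize Japanese without stress or punctuation marks
    tokens = encode("こんにちは", language="ja", backend="espeak", stress=False, punctuation=False)

One caveat worth noting: the cache key in _get_backend is still only f'{language}_{backend}', so whichever flag values are passed on the first call for a given pair are the ones the cached backend keeps for all later calls.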