Use a regex to strip the (lang) markers from the espeak phonemes; update the tokenizer vocab as lazily as possible to avoid unk tokens
This commit is contained in:
parent
769f67dcfe
commit
d31f27119a
|
@ -116,12 +116,16 @@
|
|||
"<mask>": 3,
|
||||
" ": 4,
|
||||
|
||||
"ᵝ": 0,
|
||||
"ᵝ": 4,
|
||||
|
||||
"!": 5,
|
||||
"\"": 6,
|
||||
"(": 7,
|
||||
"{": 4,
|
||||
"[": 4,
|
||||
")": 8,
|
||||
"}": 8,
|
||||
"]": 8,
|
||||
",": 9,
|
||||
"-": 10,
|
||||
".": 11,
|
||||
|
@ -131,6 +135,7 @@
|
|||
"?": 15,
|
||||
"a": 16,
|
||||
"ä": 16,
|
||||
"ɒ": 16,
|
||||
"b": 17,
|
||||
"c": 18,
|
||||
"d": 19,
|
||||
|
@ -145,6 +150,7 @@
|
|||
"m": 27,
|
||||
"n": 28,
|
||||
"ɴ": 28,
|
||||
"ɲ": 28,
|
||||
"o": 29,
|
||||
"̞": 29,
|
||||
"p": 30,
|
||||
|
@ -152,9 +158,14 @@
|
|||
"q": 31,
|
||||
"r": 32,
|
||||
"ɽ": 32,
|
||||
"ʁ": 32,
|
||||
"s": 33,
|
||||
"t": 34,
|
||||
"u": 35,
|
||||
"ø": 35,
|
||||
"œ": 35,
|
||||
"y": 35,
|
||||
"ɣ": 35,
|
||||
"ũ": 35,
|
||||
"v": 36,
|
||||
"w": 37,
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
import copy
|
||||
import h5py
|
||||
import json
|
||||
import re
|
||||
import logging
|
||||
import numpy as np
|
||||
import os
|
||||
|
@ -1370,8 +1371,9 @@ def process_artifact_metadata( artifact ):
|
|||
# to-do: regex replace /([a-z]{2})/ to ""
|
||||
if "phonemes" in metadata:
|
||||
metadata["phonemes"] = metadata["phonemes"].replace("(en)", "")
|
||||
if "phonemes" in metadata and "language" in metadata:
|
||||
metadata["phonemes"] = metadata["phonemes"].replace(f"({metadata['language']})", "")
|
||||
if "language" in metadata:
|
||||
metadata["phonemes"] = metadata["phonemes"].replace(f"({metadata['language']})", "")
|
||||
metadata["phonemes"] = re.sub(r'\([a-z]{2}\)', "", metadata["phonemes"])
|
||||
|
||||
return metadata
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user