Regex-replace the (lang) markers out of espeak's output; update the tokenizer vocab as lazily as possible so that there are no unk tokens

This commit is contained in:
mrq 2024-09-21 12:29:28 -05:00
parent 769f67dcfe
commit d31f27119a
2 changed files with 16 additions and 3 deletions

View File

@ -116,12 +116,16 @@
"<mask>": 3,
" ": 4,
"ᵝ": 0,
"ᵝ": 4,
"!": 5,
"\"": 6,
"(": 7,
"{": 4,
"[": 4,
")": 8,
"}": 8,
"]": 8,
",": 9,
"-": 10,
".": 11,
@ -131,6 +135,7 @@
"?": 15,
"a": 16,
"ä": 16,
"ɒ": 16,
"b": 17,
"c": 18,
"d": 19,
@ -145,6 +150,7 @@
"m": 27,
"n": 28,
"ɴ": 28,
"ɲ": 28,
"o": 29,
"̞": 29,
"p": 30,
@ -152,9 +158,14 @@
"q": 31,
"r": 32,
"ɽ": 32,
"ʁ": 32,
"s": 33,
"t": 34,
"u": 35,
"ø": 35,
"œ": 35,
"y": 35,
"ɣ": 35,
"ũ": 35,
"v": 36,
"w": 37,

View File

@ -3,6 +3,7 @@
import copy
import h5py
import json
import re
import logging
import numpy as np
import os
@ -1370,8 +1371,9 @@ def process_artifact_metadata( artifact ):
# to-do: regex replace /([a-z]{2})/ to ""
if "phonemes" in metadata:
metadata["phonemes"] = metadata["phonemes"].replace("(en)", "")
if "phonemes" in metadata and "language" in metadata:
metadata["phonemes"] = metadata["phonemes"].replace(f"({metadata['language']})", "")
if "language" in metadata:
metadata["phonemes"] = metadata["phonemes"].replace(f"({metadata['language']})", "")
metadata["phonemes"] = re.sub(r'\([a-z]{2}\)', "", metadata["phonemes"])
return metadata