Actually pass language into the dataset process script; coerce Japanese into hiragana because espeak does not like kanji

This commit is contained in:
mrq 2024-07-21 23:21:37 -05:00
parent 3e5ca3a201
commit ad024f400f
4 changed files with 23 additions and 13 deletions

View File

@ -185,7 +185,7 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
try:
outpath, text, language, waveform, sample_rate = job
phones = valle_phonemize(text)
phones = valle_phonemize( text, language=language )
qnt = valle_quantize(waveform, sr=sample_rate, device=device)
if cfg.audio_backend == "dac":

View File

@ -25,7 +25,7 @@ def main():
parser.add_argument("--input-prompt-length", type=float, default=3.0)
parser.add_argument("--top-p", type=float, default=1.0)
parser.add_argument("--top-k", type=int, default=16)
parser.add_argument("--top-k", type=int, default=0)
parser.add_argument("--repetition-penalty", type=float, default=1.0)
parser.add_argument("--repetition-penalty-decay", type=float, default=0.0)
parser.add_argument("--length-penalty", type=float, default=0.0)

View File

@ -56,7 +56,7 @@ def main():
parser.add_argument("--input-prompt-length", type=float, default=0.0)
parser.add_argument("--top-p", type=float, default=1.0)
parser.add_argument("--top-k", type=int, default=16)
parser.add_argument("--top-k", type=int, default=0)
parser.add_argument("--repetition-penalty", type=float, default=1.0)
parser.add_argument("--repetition-penalty-decay", type=float, default=0.0)
parser.add_argument("--length-penalty", type=float, default=0.0)
@ -108,6 +108,8 @@ def main():
# pull from dataset samples
if args.sample_from_dataset:
cfg.dataset.cache = False
samples_dirs["dataset"] = args.demo_dir / "dataset"
print("Loading dataloader...")
@ -157,13 +159,14 @@ def main():
text = open(dir / "prompt.txt").read()
language = open(dir / "language.txt").read() if (dir / "language.txt").exists() else "en"
prompt = dir / "prompt.wav"
reference = dir / "reference.wav"
out_path = dir / "out" / "ours.wav"
extra_sources = [ dir / "out" / f"{source}.wav" for source in sources ] if k == "librispeech" else []
samples.append((
text,
[ prompt, dir / "reference.wav", out_path ] + extra_sources
[ prompt, reference, out_path ] + extra_sources
))
if args.skip_existing and out_path.exists():

View File

@ -10,12 +10,22 @@ from phonemizer.backend import BACKENDS
from tqdm import tqdm
try:
import pykakasi
except Exception as e:
pass
@cache
def _get_graphs(path):
with open(path, "r") as f:
graphs = f.read()
return graphs
def romanize( runes, sep="" ):
    """Convert Japanese text (*runes*) to hiragana via pykakasi.

    Each converted segment's 'hira' reading is joined with *sep*
    (empty string by default). Used so espeak receives kana instead
    of kanji it cannot phonemize.
    """
    converter = pykakasi.kakasi()
    segments = converter.convert( runes )
    readings = []
    for segment in segments:
        readings.append( segment['hira'] )
    return sep.join( readings )
cached_backends = {}
def _get_backend( language="en-us", backend="espeak" ):
key = f'{language}_{backend}'
@ -37,6 +47,10 @@ def encode(text: str, language="en-us", backend="auto") -> list[str]:
if language == "en":
language = "en-us"
# Convert to kana because espeak does not like kanji...
if language[:2] == "ja" and backend == "auto":
text = romanize( text )
if not backend or backend == "auto":
backend = "espeak" # if language[:2] != "en" else "festival"
@ -48,14 +62,7 @@ def encode(text: str, language="en-us", backend="auto") -> list[str]:
else:
tokens = phonemize( text, language=language, strip=True, preserve_punctuation=True, with_stress=True )
tokens = list(tokens[0])
return tokens
"""
tokenized = " ".join( tokens )
merges = [ "\u02C8", "\u02CC", "\u02D0" ]
for merge in merges:
tokenized = tokenized.replace( f' {merge}', merge )
return tokenized.split(" ")
"""
return tokens