actually pass language into dataset process script, fix coercing japanese into hiragana because espeak does not like kanji
parent 3e5ca3a201
commit ad024f400f
@@ -185,7 +185,7 @@ for dataset_name in sorted(os.listdir(f'./{input_audio}/')):
     try:
         outpath, text, language, waveform, sample_rate = job

-        phones = valle_phonemize(text)
+        phones = valle_phonemize( text, language=language )
         qnt = valle_quantize(waveform, sr=sample_rate, device=device)

         if cfg.audio_backend == "dac":
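With this change the language code carried in each job tuple actually reaches the phonemizer instead of being dropped. A minimal sketch of the call as it behaves after the commit, using placeholder values for the job fields (the other tuple fields are omitted here):

# hypothetical job fields; outpath, waveform and sample_rate omitted for brevity
text, language = "こんにちは、世界。", "ja"

# before: valle_phonemize(text) fell back to the phonemizer's default voice (presumably en-us)
# after:  the per-sample language code is forwarded explicitly
phones = valle_phonemize( text, language=language )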
@@ -25,7 +25,7 @@ def main():
     parser.add_argument("--input-prompt-length", type=float, default=3.0)

     parser.add_argument("--top-p", type=float, default=1.0)
-    parser.add_argument("--top-k", type=int, default=16)
+    parser.add_argument("--top-k", type=int, default=0)
     parser.add_argument("--repetition-penalty", type=float, default=1.0)
     parser.add_argument("--repetition-penalty-decay", type=float, default=0.0)
     parser.add_argument("--length-penalty", type=float, default=0.0)
@@ -56,7 +56,7 @@ def main():
     parser.add_argument("--input-prompt-length", type=float, default=0.0)

     parser.add_argument("--top-p", type=float, default=1.0)
-    parser.add_argument("--top-k", type=int, default=16)
+    parser.add_argument("--top-k", type=int, default=0)
     parser.add_argument("--repetition-penalty", type=float, default=1.0)
     parser.add_argument("--repetition-penalty-decay", type=float, default=0.0)
     parser.add_argument("--length-penalty", type=float, default=0.0)
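Both entry points now default --top-k to 0 instead of 16. Assuming the sampler follows the common convention that a non-positive k means "no top-k truncation" (the sampling code itself is not shown in this diff), the effect is roughly:

import torch

def top_k_filter(logits: torch.Tensor, k: int) -> torch.Tensor:
    # convention assumed here: k <= 0 keeps the full distribution untouched
    if k <= 0:
        return logits
    # otherwise mask everything below the k-th largest logit
    kth = torch.topk(logits, k).values[..., -1, None]
    return logits.masked_fill(logits < kth, float("-inf"))

So the new default leaves the output distribution untruncated unless --top-k is set explicitly.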
@@ -108,6 +108,8 @@ def main():

     # pull from dataset samples
     if args.sample_from_dataset:
+        cfg.dataset.cache = False
+
         samples_dirs["dataset"] = args.demo_dir / "dataset"

         print("Loading dataloader...")
@@ -157,13 +159,14 @@ def main():
         text = open(dir / "prompt.txt").read()
         language = open(dir / "language.txt").read() if (dir / "language.txt").exists() else "en"
         prompt = dir / "prompt.wav"
+        reference = dir / "reference.wav"
         out_path = dir / "out" / "ours.wav"

         extra_sources = [ dir / "out" / f"{source}.wav" for source in sources ] if k == "librispeech" else []

         samples.append((
             text,
-            [ prompt, dir / "reference.wav", out_path ] + extra_sources
+            [ prompt, reference, out_path ] + extra_sources
         ))

         if args.skip_existing and out_path.exists():
@@ -10,12 +10,22 @@ from phonemizer.backend import BACKENDS

 from tqdm import tqdm

+try:
+    import pykakasi
+except Exception as e:
+    pass
+
 @cache
 def _get_graphs(path):
     with open(path, "r") as f:
         graphs = f.read()
     return graphs

+def romanize( runes, sep="" ):
+    kks = pykakasi.kakasi()
+    result = kks.convert( runes )
+    return sep.join([ res['hira'] for res in result ])
+
 cached_backends = {}
 def _get_backend( language="en-us", backend="espeak" ):
     key = f'{language}_{backend}'
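The new romanize() helper relies on pykakasi's convert(), which splits the input into segments and returns a dict per segment holding the original text and its readings. A quick sketch of what it yields (assumes pykakasi 2.x is installed; outputs indicated only in comments):

import pykakasi

kks = pykakasi.kakasi()
for res in kks.convert("音声合成"):
    # each segment exposes the original text plus hiragana / romaji readings
    print(res['orig'], res['hira'], res['hepburn'])

romanize() above keeps only the 'hira' field and joins the segments, so any kanji in the transcript reaches espeak as plain hiragana.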
@@ -37,6 +47,10 @@ def encode(text: str, language="en-us", backend="auto") -> list[str]:
     if language == "en":
         language = "en-us"

+    # Convert to kana because espeak does not like kanji...
+    if language[:2] == "ja" and backend == "auto":
+        text = romanize( text )
+
     if not backend or backend == "auto":
         backend = "espeak" # if language[:2] != "en" else "festival"

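With the kana coercion in place, Japanese input is routed through romanize() before espeak ever sees it. A hedged usage sketch (the import path is an assumption, since file names are not visible in this diff, and an espeak voice for "ja" must be installed):

from vall_e.emb.g2p import encode  # import path assumed

# kanji is first coerced to hiragana, then espeak emits the phoneme sequence
phones = encode("音声合成は楽しい。", language="ja")
print(phones)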
@@ -48,14 +62,7 @@ def encode(text: str, language="en-us", backend="auto") -> list[str]:
     else:
         tokens = phonemize( text, language=language, strip=True, preserve_punctuation=True, with_stress=True )

+
     tokens = list(tokens[0])
-    return tokens
-    """
-    tokenized = " ".join( tokens )

-    merges = [ "\u02C8", "\u02CC", "\u02D0" ]
-    for merge in merges:
-        tokenized = tokenized.replace( f' {merge}', merge )
-
-    return tokenized.split(" ")
-    """
+    return tokens