instead just compute a bunch of stuff on the transcriptions to store later in different names so I can just retrieve what I want, also added tongue twisters for nefarious reasons
This commit is contained in:
parent
4775edaa41
commit
7617b6485f
23
data/tongue_twisters.txt
Normal file
23
data/tongue_twisters.txt
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
Six sick hicks nick six slick bricks with picks and sticks.
|
||||||
|
Fresh French fried fly fritters.
|
||||||
|
Rory the warrior and Roger the worrier were reared wrongly in a rural brewery.
|
||||||
|
Which wrist watches are Swiss wrist watches?
|
||||||
|
Fred fed Ted bread and Ted fed Fred bread.
|
||||||
|
The 33 thieves thought that they thrilled the throne throughout Thursday.
|
||||||
|
You know New York, you need New York, you know you need unique New York.
|
||||||
|
Lesser leather never weathered wetter weather better.
|
||||||
|
The sixth sick sheikh’s sixth sheep’s sick.
|
||||||
|
A skunk sat on a stump and thunk the stump stunk, but the stump thunk the skunk stunk.
|
||||||
|
Thirty-three thirsty, thundering thoroughbreds thumped Mr. Thurber on Thursday.
|
||||||
|
Wayne went to Wales to watch walruses.
|
||||||
|
Seventy-seven benevolent elephants.
|
||||||
|
Send toast to ten tense stout saints’ ten tall tents.
|
||||||
|
I slit the sheet, the sheet I slit, and on the slitted sheet I sit.
|
||||||
|
Give papa a cup of proper coffee in a copper coffee cup.
|
||||||
|
She sells seashells by the seashore.
|
||||||
|
Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?
|
||||||
|
Pad kid poured curd pulled cod.
|
||||||
|
Fuzzy Wuzzy was a bear. Fuzzy Wuzzy had no hair. Fuzzy Wuzzy wasn’t very fuzzy, was he?
|
||||||
|
Supercalifragilisticexpialidocious.
|
||||||
|
How much wood would a woodchuck chuck if a woodchuck could chuck wood? He would chuck, he would, as much as he could, and chuck as much wood as a woodchuck would if a woodchuck could chuck wood.
|
||||||
|
Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo.
|
|
@ -192,7 +192,7 @@ def normalize_text(text, language="auto", full=True):
|
||||||
return text
|
return text
|
||||||
|
|
||||||
@cache
|
@cache
|
||||||
def get_random_prompts( validation=False, min_length=0, tokenized=False ):
|
def get_random_prompts( validation=False, min_length=0, tokenized=False, source_path=Path("./data/harvard_sentences.txt") ):
|
||||||
duration_range = [ 5.5, 12.0 ] # to-do: pull from cfg.dataset.duration_range
|
duration_range = [ 5.5, 12.0 ] # to-do: pull from cfg.dataset.duration_range
|
||||||
sentences = [
|
sentences = [
|
||||||
"The birch canoe slid on the smooth planks.",
|
"The birch canoe slid on the smooth planks.",
|
||||||
|
@ -228,9 +228,8 @@ def get_random_prompts( validation=False, min_length=0, tokenized=False ):
|
||||||
"Perfect. Please move quickly to the chamber lock, as the effect of prolonged exposure to the button are not part of this test.",
|
"Perfect. Please move quickly to the chamber lock, as the effect of prolonged exposure to the button are not part of this test.",
|
||||||
]
|
]
|
||||||
|
|
||||||
harvard_sentences_path = Path("./data/harvard_sentences.txt")
|
if source_path.exists():
|
||||||
if harvard_sentences_path.exists():
|
sentences = open( source_path, "r", encoding="utf-8" ).read().split("\n")
|
||||||
sentences = open( harvard_sentences_path, "r", encoding="utf-8" ).read().split("\n")
|
|
||||||
|
|
||||||
# Pull from validation dataset if existing + requested
|
# Pull from validation dataset if existing + requested
|
||||||
if validation and cfg.dataset.validation:
|
if validation and cfg.dataset.validation:
|
||||||
|
|
|
@ -426,15 +426,20 @@ def main():
|
||||||
calculate = not metrics_path.exists() or (metrics_path.stat().st_mtime < out_path.stat().st_mtime)
|
calculate = not metrics_path.exists() or (metrics_path.stat().st_mtime < out_path.stat().st_mtime)
|
||||||
|
|
||||||
if calculate:
|
if calculate:
|
||||||
wer_score, cer_score = wer( out_path, text, language=language, device=tts.device, dtype=tts.dtype, model_name=args.transcription_model, phonemize=True )
|
# computes based on word transcriptions outright
|
||||||
#wer_score, cer_score = wer( out_path, reference_path, language=language, device=tts.device, dtype=tts.dtype, model_name=args.transcription_model, phonemize=False )
|
wer_score, cer_score = wer( out_path, text, language=language, device=tts.device, dtype=tts.dtype, model_name=args.transcription_model, phonemize=False )
|
||||||
|
# compute on words as well, but does not normalize
|
||||||
|
wer_un_score, cer_un_score = wer( out_path, text, language=language, device=tts.device, dtype=tts.dtype, model_name=args.transcription_model, phonemize=False, normalize=False )
|
||||||
|
# computes on phonemes instead
|
||||||
|
pwer_score, per_score = wer( out_path, text, language=language, device=tts.device, dtype=tts.dtype, model_name=args.transcription_model, phonemize=True )
|
||||||
|
|
||||||
sim_o_score = sim_o( out_path, prompt_path, device=tts.device, dtype=tts.dtype, model_name=args.speaker_similarity_model )
|
sim_o_score = sim_o( out_path, prompt_path, device=tts.device, dtype=tts.dtype, model_name=args.speaker_similarity_model )
|
||||||
|
|
||||||
metrics = {"wer": wer_score, "cer": cer_score, "sim-o": sim_o_score}
|
metrics = {"wer": wer_score, "cer": cer_score, "sim-o": sim_o_score, "per": per_score, "pwer": pwer_score, "wer_un": wer_un_score, "cer_un": cer_un_score }
|
||||||
json_write( metrics, metrics_path )
|
json_write( metrics, metrics_path )
|
||||||
else:
|
else:
|
||||||
metrics = json_read( metrics_path )
|
metrics = json_read( metrics_path )
|
||||||
wer_score, cer_score, sim_o_score = metrics["wer"], metrics["cer"], metrics["sim-o"]
|
wer_score, cer_score, per_score, sim_o_score = metrics["wer"], metrics["cer"], metrics["per"], metrics["sim-o"]
|
||||||
|
|
||||||
if dataset_name not in metrics_map:
|
if dataset_name not in metrics_map:
|
||||||
metrics_map[dataset_name] = {}
|
metrics_map[dataset_name] = {}
|
||||||
|
@ -444,11 +449,11 @@ def main():
|
||||||
# collate entries into HTML
|
# collate entries into HTML
|
||||||
tables = []
|
tables = []
|
||||||
for dataset_name, samples in outputs:
|
for dataset_name, samples in outputs:
|
||||||
table = "\t\t<h3>${DATASET_NAME}</h3>\n\t\t<p><b>Average WER:</b> ${WER}<br><b>Average CER:</b> ${CER}<br><b>Average SIM-O:</b> ${SIM-O}<br></p>\n\t\t<table>\n\t\t\t<thead>\n\t\t\t\t<tr>\n\t\t\t\t\t<th>Text</th>\n\t\t\t\t\t<th>WER↓</th>\n\t\t\t\t\t<th>CER↓</th>\n\t\t\t\t\t<th>SIM-O↑</th>\n\t\t\t\t\t<th>Prompt</th>\n\t\t\t\t\t<th>Our VALL-E</th>\n\t\t\t\t\t<!--th>Original VALL-E</th-->\n\t\t\t\t\t<!--th>F5-TTS</th-->\n\t\t\t\t\t<th>Ground Truth</th>\n\t\t\t\t</tr>\n\t\t\t</thead>\n\t\t\t<tbody>${SAMPLES}</tbody>\n\t\t</table>"
|
table = "\t\t<h3>${DATASET_NAME}</h3>\n\t\t<p><b>Average WER:</b> ${WER}<br><b>Average CER:</b> ${CER}<br><b>Average PER:</b> ${PER}<br><b>Average SIM-O:</b> ${SIM-O}<br></p>\n\t\t<table>\n\t\t\t<thead>\n\t\t\t\t<tr>\n\t\t\t\t\t<th>Text</th>\n\t\t\t\t\t<th>WER↓</th>\n\t\t\t\t\t<th>CER↓</th>\n\t\t\t\t\t<th>SIM-O↑</th>\n\t\t\t\t\t<th>Prompt</th>\n\t\t\t\t\t<th>Our VALL-E</th>\n\t\t\t\t\t<!--th>Original VALL-E</th-->\n\t\t\t\t\t<!--th>F5-TTS</th-->\n\t\t\t\t\t<th>Ground Truth</th>\n\t\t\t\t</tr>\n\t\t\t</thead>\n\t\t\t<tbody>${SAMPLES}</tbody>\n\t\t</table>"
|
||||||
samples = [
|
samples = [
|
||||||
f'\n\t\t\t<tr>\n\t\t\t\t<td>{text}</td>'+
|
f'\n\t\t\t<tr>\n\t\t\t\t<td>{text}</td>'+
|
||||||
"".join([
|
"".join([
|
||||||
f'\n\t\t\t\t<td>{metrics_map[dataset_name][audios[1]][0]:.3f}</td><td>{metrics_map[dataset_name][audios[1]][1]:.3f}</td><td>{metrics_map[dataset_name][audios[1]][2]:.3f}</td>'
|
f'\n\t\t\t\t<td>{metrics_map[dataset_name][audios[1]][0]:.3f}</td><td>{metrics_map[dataset_name][audios[1]][1]:.3f}</td><td>{metrics_map[dataset_name][audios[1]][2]:.3f}</td><td>{metrics_map[dataset_name][audios[1]][3]:.3f}</td>'
|
||||||
] ) +
|
] ) +
|
||||||
"".join( [
|
"".join( [
|
||||||
f'\n\t\t\t\t<td><audio controls="controls" preload="none"><source src="{str(audio).replace(str(args.demo_dir), args.audio_path_root) if args.audio_path_root else encode(audio)}"/></audio></td>'
|
f'\n\t\t\t\t<td><audio controls="controls" preload="none"><source src="{str(audio).replace(str(args.demo_dir), args.audio_path_root) if args.audio_path_root else encode(audio)}"/></audio></td>'
|
||||||
|
@ -461,7 +466,8 @@ def main():
|
||||||
# write audio into template
|
# write audio into template
|
||||||
table = table.replace("${WER}", f'{mean([ metrics[0] for metrics in metrics_map[dataset_name].values() ]):.3f}' )
|
table = table.replace("${WER}", f'{mean([ metrics[0] for metrics in metrics_map[dataset_name].values() ]):.3f}' )
|
||||||
table = table.replace("${CER}", f'{mean([ metrics[1] for metrics in metrics_map[dataset_name].values() ]):.3f}' )
|
table = table.replace("${CER}", f'{mean([ metrics[1] for metrics in metrics_map[dataset_name].values() ]):.3f}' )
|
||||||
table = table.replace("${SIM-O}", f'{mean([ metrics[2] for metrics in metrics_map[dataset_name].values() ]):.3f}' )
|
table = table.replace("${PER}", f'{mean([ metrics[2] for metrics in metrics_map[dataset_name].values() ]):.3f}' )
|
||||||
|
table = table.replace("${SIM-O}", f'{mean([ metrics[3] for metrics in metrics_map[dataset_name].values() ]):.3f}' )
|
||||||
|
|
||||||
table = table.replace("${DATASET_NAME}", dataset_name)
|
table = table.replace("${DATASET_NAME}", dataset_name)
|
||||||
table = table.replace("${SAMPLES}", "\n".join( samples ) )
|
table = table.replace("${SAMPLES}", "\n".join( samples ) )
|
||||||
|
|
|
@ -16,7 +16,7 @@ import warnings
|
||||||
warnings.simplefilter(action='ignore', category=FutureWarning)
|
warnings.simplefilter(action='ignore', category=FutureWarning)
|
||||||
warnings.simplefilter(action='ignore', category=UserWarning)
|
warnings.simplefilter(action='ignore', category=UserWarning)
|
||||||
|
|
||||||
def wer( audio, reference, language="auto", phonemize=True, **transcription_kwargs ):
|
def wer( audio, reference, language="auto", phonemize=True, normalize=True, **transcription_kwargs ):
|
||||||
if language == "auto":
|
if language == "auto":
|
||||||
language = detect_language( reference )
|
language = detect_language( reference )
|
||||||
|
|
||||||
|
@ -38,7 +38,7 @@ def wer( audio, reference, language="auto", phonemize=True, **transcription_kwar
|
||||||
if phonemize:
|
if phonemize:
|
||||||
transcription = encode( transcription, language=language )
|
transcription = encode( transcription, language=language )
|
||||||
reference = encode( reference, language=language )
|
reference = encode( reference, language=language )
|
||||||
else:
|
elif normalize:
|
||||||
transcription = normalize_text( transcription, language=language )
|
transcription = normalize_text( transcription, language=language )
|
||||||
reference = normalize_text( reference, language=language )
|
reference = normalize_text( reference, language=language )
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user