instead, just compute a bunch of stuff on the transcriptions and store it under different names so I can retrieve whichever I want later; also added tongue twisters for nefarious reasons

This commit is contained in:
mrq 2024-12-18 23:43:11 -06:00
parent 4775edaa41
commit 7617b6485f
4 changed files with 41 additions and 13 deletions

data/tongue_twisters.txt (new file, 23 lines added)

@@ -0,0 +1,23 @@
Six sick hicks nick six slick bricks with picks and sticks.
Fresh French fried fly fritters.
Rory the warrior and Roger the worrier were reared wrongly in a rural brewery.
Which wrist watches are Swiss wrist watches?
Fred fed Ted bread and Ted fed Fred bread.
The 33 thieves thought that they thrilled the throne throughout Thursday.
You know New York, you need New York, you know you need unique New York.
Lesser leather never weathered wetter weather better.
The sixth sick sheikh's sixth sheep's sick.
A skunk sat on a stump and thunk the stump stunk, but the stump thunk the skunk stunk.
Thirty-three thirsty, thundering thoroughbreds thumped Mr. Thurber on Thursday.
Wayne went to Wales to watch walruses.
Seventy-seven benevolent elephants.
Send toast to ten tense stout saints' ten tall tents.
I slit the sheet, the sheet I slit, and on the slitted sheet I sit.
Give papa a cup of proper coffee in a copper coffee cup.
She sells seashells by the seashore.
Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?
Pad kid poured curd pulled cod.
Fuzzy Wuzzy was a bear. Fuzzy Wuzzy had no hair. Fuzzy Wuzzy wasn't very fuzzy, was he?
Supercalifragilisticexpialidocious.
How much wood would a woodchuck chuck if a woodchuck could chuck wood? He would chuck, he would, as much as he could, and chuck as much wood as a woodchuck would if a woodchuck could chuck wood.
Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo.

View File

@@ -192,7 +192,7 @@ def normalize_text(text, language="auto", full=True):
return text
@cache
def get_random_prompts( validation=False, min_length=0, tokenized=False ):
def get_random_prompts( validation=False, min_length=0, tokenized=False, source_path=Path("./data/harvard_sentences.txt") ):
duration_range = [ 5.5, 12.0 ] # to-do: pull from cfg.dataset.duration_range
sentences = [
"The birch canoe slid on the smooth planks.",
@@ -228,9 +228,8 @@ def get_random_prompts( validation=False, min_length=0, tokenized=False ):
"Perfect. Please move quickly to the chamber lock, as the effect of prolonged exposure to the button are not part of this test.",
]
harvard_sentences_path = Path("./data/harvard_sentences.txt")
if harvard_sentences_path.exists():
sentences = open( harvard_sentences_path, "r", encoding="utf-8" ).read().split("\n")
if source_path.exists():
sentences = open( source_path, "r", encoding="utf-8" ).read().split("\n")
# Pull from validation dataset if existing + requested
if validation and cfg.dataset.validation:
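
With the new source_path argument, the same helper can pull from the freshly added tongue-twister list instead of the Harvard sentences. A minimal usage sketch (the paths match this commit's data/ directory; the variable names are illustrative):

from pathlib import Path

# Within the module that defines get_random_prompts():
# default behaviour is unchanged -- harvard_sentences.txt is used when it exists.
prompts = get_random_prompts()

# Hypothetical: point the same helper at the new tongue twisters for harder test lines.
twisters = get_random_prompts(source_path=Path("./data/tongue_twisters.txt"))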

View File

@@ -426,15 +426,20 @@ def main():
calculate = not metrics_path.exists() or (metrics_path.stat().st_mtime < out_path.stat().st_mtime)
if calculate:
wer_score, cer_score = wer( out_path, text, language=language, device=tts.device, dtype=tts.dtype, model_name=args.transcription_model, phonemize=True )
#wer_score, cer_score = wer( out_path, reference_path, language=language, device=tts.device, dtype=tts.dtype, model_name=args.transcription_model, phonemize=False )
# compute WER/CER directly on the (normalized) word transcriptions
wer_score, cer_score = wer( out_path, text, language=language, device=tts.device, dtype=tts.dtype, model_name=args.transcription_model, phonemize=False )
# compute on words as well, but without text normalization
wer_un_score, cer_un_score = wer( out_path, text, language=language, device=tts.device, dtype=tts.dtype, model_name=args.transcription_model, phonemize=False, normalize=False )
# compute on phonemes instead
pwer_score, per_score = wer( out_path, text, language=language, device=tts.device, dtype=tts.dtype, model_name=args.transcription_model, phonemize=True )
sim_o_score = sim_o( out_path, prompt_path, device=tts.device, dtype=tts.dtype, model_name=args.speaker_similarity_model )
metrics = {"wer": wer_score, "cer": cer_score, "sim-o": sim_o_score}
metrics = {"wer": wer_score, "cer": cer_score, "sim-o": sim_o_score, "per": per_score, "pwer": pwer_score, "wer_un": wer_un_score, "cer_un": cer_un_score }
json_write( metrics, metrics_path )
else:
metrics = json_read( metrics_path )
wer_score, cer_score, sim_o_score = metrics["wer"], metrics["cer"], metrics["sim-o"]
wer_score, cer_score, per_score, sim_o_score = metrics["wer"], metrics["cer"], metrics["per"], metrics["sim-o"]
if dataset_name not in metrics_map:
metrics_map[dataset_name] = {}
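
Because the recalculation check only compares file mtimes, a metrics JSON written before this commit can be read back without the new keys. A hypothetical defensive-read sketch (the diff uses the repo's json_read helper; this sketch uses plain json for self-containment, and the NaN fallbacks are an assumption, not part of this commit):

import json
from pathlib import Path

def read_cached_metrics(metrics_path: Path) -> dict:
    # Load a per-sample metrics JSON written by the demo script.
    metrics = json.loads(metrics_path.read_text(encoding="utf-8"))
    # Caches written before this commit lack the new keys, so fall back to NaN
    # instead of raising a KeyError when indexing them.
    keys = ("wer", "cer", "wer_un", "cer_un", "pwer", "per", "sim-o")
    return {key: metrics.get(key, float("nan")) for key in keys}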
@@ -444,11 +449,11 @@ def main():
# collate entries into HTML
tables = []
for dataset_name, samples in outputs:
table = "\t\t<h3>${DATASET_NAME}</h3>\n\t\t<p><b>Average WER:</b> ${WER}<br><b>Average CER:</b> ${CER}<br><b>Average SIM-O:</b> ${SIM-O}<br></p>\n\t\t<table>\n\t\t\t<thead>\n\t\t\t\t<tr>\n\t\t\t\t\t<th>Text</th>\n\t\t\t\t\t<th>WER↓</th>\n\t\t\t\t\t<th>CER↓</th>\n\t\t\t\t\t<th>SIM-O↑</th>\n\t\t\t\t\t<th>Prompt</th>\n\t\t\t\t\t<th>Our VALL-E</th>\n\t\t\t\t\t<!--th>Original VALL-E</th-->\n\t\t\t\t\t<!--th>F5-TTS</th-->\n\t\t\t\t\t<th>Ground Truth</th>\n\t\t\t\t</tr>\n\t\t\t</thead>\n\t\t\t<tbody>${SAMPLES}</tbody>\n\t\t</table>"
table = "\t\t<h3>${DATASET_NAME}</h3>\n\t\t<p><b>Average WER:</b> ${WER}<br><b>Average CER:</b> ${CER}<br><b>Average PER:</b> ${PER}<br><b>Average SIM-O:</b> ${SIM-O}<br></p>\n\t\t<table>\n\t\t\t<thead>\n\t\t\t\t<tr>\n\t\t\t\t\t<th>Text</th>\n\t\t\t\t\t<th>WER↓</th>\n\t\t\t\t\t<th>CER↓</th>\n\t\t\t\t\t<th>SIM-O↑</th>\n\t\t\t\t\t<th>Prompt</th>\n\t\t\t\t\t<th>Our VALL-E</th>\n\t\t\t\t\t<!--th>Original VALL-E</th-->\n\t\t\t\t\t<!--th>F5-TTS</th-->\n\t\t\t\t\t<th>Ground Truth</th>\n\t\t\t\t</tr>\n\t\t\t</thead>\n\t\t\t<tbody>${SAMPLES}</tbody>\n\t\t</table>"
samples = [
f'\n\t\t\t<tr>\n\t\t\t\t<td>{text}</td>'+
"".join([
f'\n\t\t\t\t<td>{metrics_map[dataset_name][audios[1]][0]:.3f}</td><td>{metrics_map[dataset_name][audios[1]][1]:.3f}</td><td>{metrics_map[dataset_name][audios[1]][2]:.3f}</td>'
f'\n\t\t\t\t<td>{metrics_map[dataset_name][audios[1]][0]:.3f}</td><td>{metrics_map[dataset_name][audios[1]][1]:.3f}</td><td>{metrics_map[dataset_name][audios[1]][2]:.3f}</td><td>{metrics_map[dataset_name][audios[1]][3]:.3f}</td>'
] ) +
"".join( [
f'\n\t\t\t\t<td><audio controls="controls" preload="none"><source src="{str(audio).replace(str(args.demo_dir), args.audio_path_root) if args.audio_path_root else encode(audio)}"/></audio></td>'
@@ -461,7 +466,8 @@ def main():
# write metrics and audio into the template
table = table.replace("${WER}", f'{mean([ metrics[0] for metrics in metrics_map[dataset_name].values() ]):.3f}' )
table = table.replace("${CER}", f'{mean([ metrics[1] for metrics in metrics_map[dataset_name].values() ]):.3f}' )
table = table.replace("${SIM-O}", f'{mean([ metrics[2] for metrics in metrics_map[dataset_name].values() ]):.3f}' )
table = table.replace("${PER}", f'{mean([ metrics[2] for metrics in metrics_map[dataset_name].values() ]):.3f}' )
table = table.replace("${SIM-O}", f'{mean([ metrics[3] for metrics in metrics_map[dataset_name].values() ]):.3f}' )
table = table.replace("${DATASET_NAME}", dataset_name)
table = table.replace("${SAMPLES}", "\n".join( samples ) )

View File

@@ -16,7 +16,7 @@ import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
def wer( audio, reference, language="auto", phonemize=True, **transcription_kwargs ):
def wer( audio, reference, language="auto", phonemize=True, normalize=True, **transcription_kwargs ):
if language == "auto":
language = detect_language( reference )
@@ -38,7 +38,7 @@ def wer( audio, reference, language="auto", phonemize=True, **transcription_kwargs ):
if phonemize:
transcription = encode( transcription, language=language )
reference = encode( reference, language=language )
else:
elif normalize:
transcription = normalize_text( transcription, language=language )
reference = normalize_text( reference, language=language )
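
For reference, the three flag combinations used in the demo script map onto wer() like this. A minimal sketch (the audio path and reference string are placeholders; the language/device/dtype/model_name kwargs are omitted for brevity):

# Word/character error rates on normalized transcriptions.
wer_score, cer_score = wer("out.wav", "the reference text", phonemize=False)

# The same comparison, but with text normalization skipped.
wer_un_score, cer_un_score = wer("out.wav", "the reference text", phonemize=False, normalize=False)

# Error rates over phonemized text: word-level (pwer) and phoneme-level (per).
pwer_score, per_score = wer("out.wav", "the reference text", phonemize=True)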