instead just compute a bunch of stuff on the transcriptions to store later in different names so I can just retrieve what I want, also added tongue twisters for nefarious reasons
This commit is contained in:
parent
4775edaa41
commit
7617b6485f
23
data/tongue_twisters.txt
Normal file
23
data/tongue_twisters.txt
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
Six sick hicks nick six slick bricks with picks and sticks.
|
||||||
|
Fresh French fried fly fritters.
|
||||||
|
Rory the warrior and Roger the worrier were reared wrongly in a rural brewery.
|
||||||
|
Which wrist watches are Swiss wrist watches?
|
||||||
|
Fred fed Ted bread and Ted fed Fred bread.
|
||||||
|
The 33 thieves thought that they thrilled the throne throughout Thursday.
|
||||||
|
You know New York, you need New York, you know you need unique New York.
|
||||||
|
Lesser leather never weathered wetter weather better.
|
||||||
|
The sixth sick sheikh’s sixth sheep’s sick.
|
||||||
|
A skunk sat on a stump and thunk the stump stunk, but the stump thunk the skunk stunk.
|
||||||
|
Thirty-three thirsty, thundering thoroughbreds thumped Mr. Thurber on Thursday.
|
||||||
|
Wayne went to Wales to watch walruses.
|
||||||
|
Seventy-seven benevolent elephants.
|
||||||
|
Send toast to ten tense stout saints’ ten tall tents.
|
||||||
|
I slit the sheet, the sheet I slit, and on the slitted sheet I sit.
|
||||||
|
Give papa a cup of proper coffee in a copper coffee cup.
|
||||||
|
She sells seashells by the seashore.
|
||||||
|
Peter Piper picked a peck of pickled peppers. How many pickled peppers did Peter Piper pick?
|
||||||
|
Pad kid poured curd pulled cod.
|
||||||
|
Fuzzy Wuzzy was a bear. Fuzzy Wuzzy had no hair. Fuzzy Wuzzy wasn’t very fuzzy, was he?
|
||||||
|
Supercalifragilisticexpialidocious.
|
||||||
|
How much wood would a woodchuck chuck if a woodchuck could chuck wood? He would chuck, he would, as much as he could, and chuck as much wood as a woodchuck would if a woodchuck could chuck wood.
|
||||||
|
Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo.
|
|
@ -192,7 +192,7 @@ def normalize_text(text, language="auto", full=True):
|
||||||
return text
|
return text
|
||||||
|
|
||||||
@cache
|
@cache
|
||||||
def get_random_prompts( validation=False, min_length=0, tokenized=False ):
|
def get_random_prompts( validation=False, min_length=0, tokenized=False, source_path=Path("./data/harvard_sentences.txt") ):
|
||||||
duration_range = [ 5.5, 12.0 ] # to-do: pull from cfg.dataset.duration_range
|
duration_range = [ 5.5, 12.0 ] # to-do: pull from cfg.dataset.duration_range
|
||||||
sentences = [
|
sentences = [
|
||||||
"The birch canoe slid on the smooth planks.",
|
"The birch canoe slid on the smooth planks.",
|
||||||
|
@ -228,9 +228,8 @@ def get_random_prompts( validation=False, min_length=0, tokenized=False ):
|
||||||
"Perfect. Please move quickly to the chamber lock, as the effect of prolonged exposure to the button are not part of this test.",
|
"Perfect. Please move quickly to the chamber lock, as the effect of prolonged exposure to the button are not part of this test.",
|
||||||
]
|
]
|
||||||
|
|
||||||
harvard_sentences_path = Path("./data/harvard_sentences.txt")
|
if source_path.exists():
|
||||||
if harvard_sentences_path.exists():
|
sentences = open( source_path, "r", encoding="utf-8" ).read().split("\n")
|
||||||
sentences = open( harvard_sentences_path, "r", encoding="utf-8" ).read().split("\n")
|
|
||||||
|
|
||||||
# Pull from validation dataset if existing + requested
|
# Pull from validation dataset if existing + requested
|
||||||
if validation and cfg.dataset.validation:
|
if validation and cfg.dataset.validation:
|
||||||
|
|
|
@ -426,15 +426,20 @@ def main():
|
||||||
calculate = not metrics_path.exists() or (metrics_path.stat().st_mtime < out_path.stat().st_mtime)
|
calculate = not metrics_path.exists() or (metrics_path.stat().st_mtime < out_path.stat().st_mtime)
|
||||||
|
|
||||||
if calculate:
|
if calculate:
|
||||||
wer_score, cer_score = wer( out_path, text, language=language, device=tts.device, dtype=tts.dtype, model_name=args.transcription_model, phonemize=True )
|
# computes based on word transcriptions outright
|
||||||
#wer_score, cer_score = wer( out_path, reference_path, language=language, device=tts.device, dtype=tts.dtype, model_name=args.transcription_model, phonemize=False )
|
wer_score, cer_score = wer( out_path, text, language=language, device=tts.device, dtype=tts.dtype, model_name=args.transcription_model, phonemize=False )
|
||||||
|
# compute on words as well, but does not normalize
|
||||||
|
wer_un_score, cer_un_score = wer( out_path, text, language=language, device=tts.device, dtype=tts.dtype, model_name=args.transcription_model, phonemize=False, normalize=False )
|
||||||
|
# computes on phonemes instead
|
||||||
|
pwer_score, per_score = wer( out_path, text, language=language, device=tts.device, dtype=tts.dtype, model_name=args.transcription_model, phonemize=True )
|
||||||
|
|
||||||
sim_o_score = sim_o( out_path, prompt_path, device=tts.device, dtype=tts.dtype, model_name=args.speaker_similarity_model )
|
sim_o_score = sim_o( out_path, prompt_path, device=tts.device, dtype=tts.dtype, model_name=args.speaker_similarity_model )
|
||||||
|
|
||||||
metrics = {"wer": wer_score, "cer": cer_score, "sim-o": sim_o_score}
|
metrics = {"wer": wer_score, "cer": cer_score, "sim-o": sim_o_score, "per": per_score, "pwer": pwer_score, "wer_un": wer_un_score, "cer_un": cer_un_score }
|
||||||
json_write( metrics, metrics_path )
|
json_write( metrics, metrics_path )
|
||||||
else:
|
else:
|
||||||
metrics = json_read( metrics_path )
|
metrics = json_read( metrics_path )
|
||||||
wer_score, cer_score, sim_o_score = metrics["wer"], metrics["cer"], metrics["sim-o"]
|
wer_score, cer_score, per_score, sim_o_score = metrics["wer"], metrics["cer"], metrics["per"], metrics["sim-o"]
|
||||||
|
|
||||||
if dataset_name not in metrics_map:
|
if dataset_name not in metrics_map:
|
||||||
metrics_map[dataset_name] = {}
|
metrics_map[dataset_name] = {}
|
||||||
|
@ -444,11 +449,11 @@ def main():
|
||||||
# collate entries into HTML
|
# collate entries into HTML
|
||||||
tables = []
|
tables = []
|
||||||
for dataset_name, samples in outputs:
|
for dataset_name, samples in outputs:
|
||||||
table = "\t\t<h3>${DATASET_NAME}</h3>\n\t\t<p><b>Average WER:</b> ${WER}<br><b>Average CER:</b> ${CER}<br><b>Average SIM-O:</b> ${SIM-O}<br></p>\n\t\t<table>\n\t\t\t<thead>\n\t\t\t\t<tr>\n\t\t\t\t\t<th>Text</th>\n\t\t\t\t\t<th>WER↓</th>\n\t\t\t\t\t<th>CER↓</th>\n\t\t\t\t\t<th>SIM-O↑</th>\n\t\t\t\t\t<th>Prompt</th>\n\t\t\t\t\t<th>Our VALL-E</th>\n\t\t\t\t\t<!--th>Original VALL-E</th-->\n\t\t\t\t\t<!--th>F5-TTS</th-->\n\t\t\t\t\t<th>Ground Truth</th>\n\t\t\t\t</tr>\n\t\t\t</thead>\n\t\t\t<tbody>${SAMPLES}</tbody>\n\t\t</table>"
|
table = "\t\t<h3>${DATASET_NAME}</h3>\n\t\t<p><b>Average WER:</b> ${WER}<br><b>Average CER:</b> ${CER}<br><b>Average PER:</b> ${PER}<br><b>Average SIM-O:</b> ${SIM-O}<br></p>\n\t\t<table>\n\t\t\t<thead>\n\t\t\t\t<tr>\n\t\t\t\t\t<th>Text</th>\n\t\t\t\t\t<th>WER↓</th>\n\t\t\t\t\t<th>CER↓</th>\n\t\t\t\t\t<th>SIM-O↑</th>\n\t\t\t\t\t<th>Prompt</th>\n\t\t\t\t\t<th>Our VALL-E</th>\n\t\t\t\t\t<!--th>Original VALL-E</th-->\n\t\t\t\t\t<!--th>F5-TTS</th-->\n\t\t\t\t\t<th>Ground Truth</th>\n\t\t\t\t</tr>\n\t\t\t</thead>\n\t\t\t<tbody>${SAMPLES}</tbody>\n\t\t</table>"
|
||||||
samples = [
|
samples = [
|
||||||
f'\n\t\t\t<tr>\n\t\t\t\t<td>{text}</td>'+
|
f'\n\t\t\t<tr>\n\t\t\t\t<td>{text}</td>'+
|
||||||
"".join([
|
"".join([
|
||||||
f'\n\t\t\t\t<td>{metrics_map[dataset_name][audios[1]][0]:.3f}</td><td>{metrics_map[dataset_name][audios[1]][1]:.3f}</td><td>{metrics_map[dataset_name][audios[1]][2]:.3f}</td>'
|
f'\n\t\t\t\t<td>{metrics_map[dataset_name][audios[1]][0]:.3f}</td><td>{metrics_map[dataset_name][audios[1]][1]:.3f}</td><td>{metrics_map[dataset_name][audios[1]][2]:.3f}</td><td>{metrics_map[dataset_name][audios[1]][3]:.3f}</td>'
|
||||||
] ) +
|
] ) +
|
||||||
"".join( [
|
"".join( [
|
||||||
f'\n\t\t\t\t<td><audio controls="controls" preload="none"><source src="{str(audio).replace(str(args.demo_dir), args.audio_path_root) if args.audio_path_root else encode(audio)}"/></audio></td>'
|
f'\n\t\t\t\t<td><audio controls="controls" preload="none"><source src="{str(audio).replace(str(args.demo_dir), args.audio_path_root) if args.audio_path_root else encode(audio)}"/></audio></td>'
|
||||||
|
@ -461,7 +466,8 @@ def main():
|
||||||
# write audio into template
|
# write audio into template
|
||||||
table = table.replace("${WER}", f'{mean([ metrics[0] for metrics in metrics_map[dataset_name].values() ]):.3f}' )
|
table = table.replace("${WER}", f'{mean([ metrics[0] for metrics in metrics_map[dataset_name].values() ]):.3f}' )
|
||||||
table = table.replace("${CER}", f'{mean([ metrics[1] for metrics in metrics_map[dataset_name].values() ]):.3f}' )
|
table = table.replace("${CER}", f'{mean([ metrics[1] for metrics in metrics_map[dataset_name].values() ]):.3f}' )
|
||||||
table = table.replace("${SIM-O}", f'{mean([ metrics[2] for metrics in metrics_map[dataset_name].values() ]):.3f}' )
|
table = table.replace("${PER}", f'{mean([ metrics[2] for metrics in metrics_map[dataset_name].values() ]):.3f}' )
|
||||||
|
table = table.replace("${SIM-O}", f'{mean([ metrics[3] for metrics in metrics_map[dataset_name].values() ]):.3f}' )
|
||||||
|
|
||||||
table = table.replace("${DATASET_NAME}", dataset_name)
|
table = table.replace("${DATASET_NAME}", dataset_name)
|
||||||
table = table.replace("${SAMPLES}", "\n".join( samples ) )
|
table = table.replace("${SAMPLES}", "\n".join( samples ) )
|
||||||
|
|
|
@ -16,7 +16,7 @@ import warnings
|
||||||
warnings.simplefilter(action='ignore', category=FutureWarning)
|
warnings.simplefilter(action='ignore', category=FutureWarning)
|
||||||
warnings.simplefilter(action='ignore', category=UserWarning)
|
warnings.simplefilter(action='ignore', category=UserWarning)
|
||||||
|
|
||||||
def wer( audio, reference, language="auto", phonemize=True, **transcription_kwargs ):
|
def wer( audio, reference, language="auto", phonemize=True, normalize=True, **transcription_kwargs ):
|
||||||
if language == "auto":
|
if language == "auto":
|
||||||
language = detect_language( reference )
|
language = detect_language( reference )
|
||||||
|
|
||||||
|
@ -38,7 +38,7 @@ def wer( audio, reference, language="auto", phonemize=True, **transcription_kwar
|
||||||
if phonemize:
|
if phonemize:
|
||||||
transcription = encode( transcription, language=language )
|
transcription = encode( transcription, language=language )
|
||||||
reference = encode( reference, language=language )
|
reference = encode( reference, language=language )
|
||||||
else:
|
elif normalize:
|
||||||
transcription = normalize_text( transcription, language=language )
|
transcription = normalize_text( transcription, language=language )
|
||||||
reference = normalize_text( reference, language=language )
|
reference = normalize_text( reference, language=language )
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user