template cleanup

2024-12-11 20:06:55 -06:00 · 2024-12-11 20:06:55 -06:00 · 0c69e798f7
commit 0c69e798f7
parent 7e54e897f7
3 changed files with 10 additions and 4 deletions
--- a/data/demo/index.template.html
+++ b/data/demo/index.template.html
@@ -11,6 +11,7 @@
 				<tr>
 					<th>Text</th>
 					<th>WER↓</th>
+					<th>CER↓</th>
 					<th>SIM-O↑</th>
 					<th>Prompt</th>
 					<th>Our VALL-E</th>
@@ -27,6 +28,7 @@
 				<tr>
 					<th>Text</th>
 					<th>WER↓</th>
+					<th>CER↓</th>
 					<th>SIM-O↑</th>
 					<th>Prompt</th>
 					<th>Our VALL-E</th>
--- a/vall_e/demo.py
+++ b/vall_e/demo.py
@@ -155,9 +155,9 @@ def main():
 			'Below are some samples from my VALL-E implementation: <a href="https://git.ecker.tech/mrq/vall-e/">https://git.ecker.tech/mrq/vall-e/</a>.',
 			'Unlike the original VALL-E demo page, I\'m placing emphasis on the input prompt, as the model adheres to it stronger than others.',
 			f'Objective metrics are computed by transcribing ({args.transcription_model}) then comparing the word error rate on transcriptions (WER/CER), and computing the cosine similarities on embeddings through a speaker feature extraction model ({args.speaker_similarity_model}) (SIM-O)',
-			'<b>Total WER:</b> ${WER}'
-			'<b>Total CER:</b> ${CER}'
-			'<b>Total SIM-O:</b> ${SIM-O}'
+			'<b>Total WER:</b> ${WER}<br>'
+			'<b>Total CER:</b> ${CER}<br>'
+			'<b>Total SIM-O:</b> ${SIM-O}<br>'
 		])

 	# comparison kwargs
--- a/vall_e/emb/transcribe.py
+++ b/vall_e/emb/transcribe.py
@@ -9,12 +9,14 @@ import argparse
 import torch
 import torchaudio

+"""
 try:
 	import whisperx
 except Exception as e:
 	whisperx = None
 	print(f"Error while querying for whisperx: {str(e)}")
 	pass
+"""

 from transformers import pipeline

@@ -193,6 +195,7 @@ def transcribe(
 	return metadata

 # for backwards compat since it also handles some other things for me
+"""
 def transcribe_whisperx(
 	audio,
 	language = "auto",
@@ -248,6 +251,7 @@ def transcribe_whisperx(
 	metadata["end"] = end

 	return metadata
+"""

 def transcribe_batch(
 	input_audio = "voices",
@@ -315,7 +319,7 @@ def transcribe_batch(
 				if os.path.isdir(inpath):
 					continue

-				metadata[filename] = transcribe_whisperx( inpath, model_name=model_name, diarize=diarize, device=device, dtype=dtype )
+				metadata[filename] = transcribe( inpath, model_name=model_name, diarize=diarize, device=device, dtype=dtype )

 				open(outpath, 'w', encoding='utf-8').write(json.dumps(metadata))