# vall-e/vall_e/metrics.py

# handles objective metric calculations, such as WER and SIM-O

#from .emb.transcribe import transcribe
from .emb.similar import speaker_similarity_embedding
from .emb.transcribe import transcribe
from .emb.g2p import detect_language, coerce_to_hiragana, encode
from .data import normalize_text

import torch.nn.functional as F

from pathlib import Path
from torcheval.metrics.functional import word_error_rate
from torchmetrics.functional.text import char_error_rate

import warnings
# Ignore every FutureWarning and UserWarning from this point on. simplefilter
# installs process-wide filters, so this affects all code running in the
# interpreter once this module is imported, not just this module.
# NOTE(review): presumably meant to quiet noisy third-party dependencies — confirm.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

def wer( audio, reference, language="auto", phonemize=True, normalize=True, **transcription_kwargs ):
	"""
	Scores the transcription of `audio` against `reference`, returning error counts rather than rates.

	Args:
		audio: the audio to transcribe; passed straight to `transcribe`.
		reference: the expected text, or a `Path` to reference audio, which is transcribed to obtain the expected text.
		language: a language code, or "auto" to detect it from the reference text (or, when the reference is audio, to take it from the transcription result).
		phonemize: if true, both strings are passed through `encode` before scoring.
		normalize: if true and `phonemize` is false, both strings are passed through `normalize_text` before scoring.
		**transcription_kwargs: forwarded to every `transcribe` call.

	Returns:
		A `(wer_score, cer_score)` tuple of floats: the word error rate and character error rate, each multiplied back by the reference length (in whitespace-separated words and in characters respectively).
	"""
	# a Path reference is audio, not text, so there is no text to run language detection on yet
	reference_is_audio = isinstance( reference, Path )

	if language == "auto" and not reference_is_audio:
		language = detect_language( reference )

	transcription = transcribe( audio, language=language, align=False, **transcription_kwargs )

	# still undecided (e.g. the reference is audio): use the language reported with the transcription
	if language == "auto":
		language = transcription["language"]

	transcription = transcription["text"]

	# reference audio needs transcribing too
	if reference_is_audio:
		reference = transcribe( reference, language=language, align=False, **transcription_kwargs )["text"]

	if language == "ja":
		transcription = coerce_to_hiragana( transcription )
		reference = coerce_to_hiragana( reference )

	if phonemize:
		transcription = encode( transcription, language=language )
		reference = encode( reference, language=language )
	elif normalize:
		transcription = normalize_text( transcription, language=language )
		reference = normalize_text( reference, language=language )

	# the metric functions return rates (errors / reference length);
	# multiplying by the reference length un-normalizes them back into error counts
	reference_words = len( reference.split() )
	if reference_words > 0:
		wer_score = word_error_rate([transcription], [reference]).item() * reference_words
	else:
		# the rate is undefined against an empty reference (division by zero, then NaN after scaling);
		# the edit distance is simply every transcribed word counted as an insertion
		wer_score = float( len( transcription.split() ) )

	reference_chars = len( reference )
	if reference_chars > 0:
		cer_score = char_error_rate([transcription], [reference]).item() * reference_chars
	else:
		# same reasoning as above, per character
		cer_score = float( len( transcription ) )

	return wer_score, cer_score

def sim_o( audio, reference, **kwargs ):
	"""
	Returns the cosine similarity between the speaker embeddings of `audio` and `reference`.

	Both inputs are embedded with `speaker_similarity_embedding`, each call receiving the same `**kwargs`.
	The similarity is taken over the last dimension and returned as a Python scalar via `.item()`,
	which requires the resulting tensor to hold exactly one element.
	"""
	# embed in the same order as the arguments: generated audio first, then the reference
	embeddings = [ speaker_similarity_embedding( source, **kwargs ) for source in ( audio, reference ) ]
	similarity = F.cosine_similarity( embeddings[0], embeddings[1], dim=-1 )

	return similarity.item()
added WER/SIM-O metrics, added APOLLO but I need to test it 2024-12-11 02:13:21 +00:00			`# handles objective metric calculations, such as WER and SIM-O`

			`#from .emb.transcribe import transcribe`
			`from .emb.similar import speaker_similarity_embedding`
			`from .emb.transcribe import transcribe`
Added CER, transcription/similarity model args in demo 2024-12-11 03:00:51 +00:00			`from .emb.g2p import detect_language, coerce_to_hiragana, encode`
added WER/SIM-O metrics, added APOLLO but I need to test it 2024-12-11 02:13:21 +00:00			`from .data import normalize_text`

			`import torch.nn.functional as F`

			`from pathlib import Path`
			`from torcheval.metrics.functional import word_error_rate`
actually do proper wer/cer calculation by un-normalizing the scores 2024-12-17 20:22:30 +00:00			`from torchmetrics.functional.text import char_error_rate`
added WER/SIM-O metrics, added APOLLO but I need to test it 2024-12-11 02:13:21 +00:00
added text cleaning/normalization for wer purposes but it amounts to nothing desu 2024-12-19 01:58:53 +00:00			`import warnings`
			`warnings.simplefilter(action='ignore', category=FutureWarning)`
			`warnings.simplefilter(action='ignore', category=UserWarning)`

instead just compute a bunch of stuff on the transcriptions to store later in different names so I can just retrieve what I want, also added tongue twisters for nefarious reasons 2024-12-19 05:43:11 +00:00			`def wer( audio, reference, language="auto", phonemize=True, normalize=True, **transcription_kwargs ):`
added WER/SIM-O metrics, added APOLLO but I need to test it 2024-12-11 02:13:21 +00:00			`if language == "auto":`
			`language = detect_language( reference )`

Added CER, transcription/similarity model args in demo 2024-12-11 03:00:51 +00:00			`transcription = transcribe( audio, language=language, align=False, **transcription_kwargs )`
added text cleaning/normalization for wer purposes but it amounts to nothing desu 2024-12-19 01:58:53 +00:00
Added CER, transcription/similarity model args in demo 2024-12-11 03:00:51 +00:00			`if language == "auto":`
			`language = transcription["language"]`
added text cleaning/normalization for wer purposes but it amounts to nothing desu 2024-12-19 01:58:53 +00:00
Added CER, transcription/similarity model args in demo 2024-12-11 03:00:51 +00:00			`transcription = transcription["text"]`
added WER/SIM-O metrics, added APOLLO but I need to test it 2024-12-11 02:13:21 +00:00
			`# reference audio needs transcribing too`
			`if isinstance( reference, Path ):`
			`reference = transcribe( reference, language=language, align=False, **transcription_kwargs )["text"]`

Added CER, transcription/similarity model args in demo 2024-12-11 03:00:51 +00:00			`if language == "ja":`
			`transcription = coerce_to_hiragana( transcription )`
			`reference = coerce_to_hiragana( reference )`

			`if phonemize:`
			`transcription = encode( transcription, language=language )`
			`reference = encode( reference, language=language )`
instead just compute a bunch of stuff on the transcriptions to store later in different names so I can just retrieve what I want, also added tongue twisters for nefarious reasons 2024-12-19 05:43:11 +00:00			`elif normalize:`
added text cleaning/normalization for wer purposes but it amounts to nothing desu 2024-12-19 01:58:53 +00:00			`transcription = normalize_text( transcription, language=language )`
			`reference = normalize_text( reference, language=language )`
added WER/SIM-O metrics, added APOLLO but I need to test it 2024-12-11 02:13:21 +00:00
Added CER, transcription/similarity model args in demo 2024-12-11 03:00:51 +00:00			`wer_score = word_error_rate([transcription], [reference]).item()`
actually do proper wer/cer calculation by un-normalizing the scores 2024-12-17 20:22:30 +00:00			`# un-normalize`
			`wer_score *= len(reference.split())`

			`cer_score = char_error_rate([transcription], [reference]).item()`
			`# un-normalize`
			`cer_score *= len(reference)`

Added CER, transcription/similarity model args in demo 2024-12-11 03:00:51 +00:00			`return wer_score, cer_score`
added WER/SIM-O metrics, added APOLLO but I need to test it 2024-12-11 02:13:21 +00:00
			`def sim_o( audio, reference, **kwargs ):`
			`audio_emb = speaker_similarity_embedding( audio, **kwargs )`
			`reference_emb = speaker_similarity_embedding( reference, **kwargs )`

uplifting transformer's WavLM stuff to do speaker verification instead 2024-12-12 01:30:05 +00:00			`return F.cosine_similarity( audio_emb, reference_emb, dim=-1 ).item()`