added text cleaning/normalization for wer purposes but it amounts to nothing desu

2024-12-18 19:58:53 -06:00 · 2024-12-18 19:58:53 -06:00 · 4775edaa41
commit 4775edaa41
parent 9f2bd7f6e4
3 changed files with 140 additions and 13 deletions
--- a/vall_e/data.py
+++ b/vall_e/data.py
@ -63,12 +63,133 @@ def sentence_split( s, split_by="sentences", quote_placeholder="<QUOTE>" ):
 	sentences = nltk.sent_tokenize(s)
 	return [ sentence.replace(quote_placeholder, '"') for sentence in sentences if sentence ]
-# to-do: improve upon this since it's kind of ass
+# normalization code borrowed from TorToiSe TTS
-# this might be better to live in emb.g2p
+# (it's not perfect but it works)
-def normalize_text( s ):
+
-	s = s.lower()
+try:
-	s = re.sub(r'[^\w\s]', '', s)
+	from tokenizers.normalizers import Lowercase, NFD, StripAccents
-	return s
+	
 	normalizer = tokenizers.normalizers.Sequence([Lowercase(), NFD(), StripAccents()])
 except Exception as e:
 	normalizer = None
 # List of (compiled regular expression, replacement) pairs for abbreviations.
 # Each pattern matches the abbreviation at a word boundary, followed by a literal
 # period, ignoring case (so "Dr." and "dr." both match, but "dr" without a period does not).
 _abbreviations = [(re.compile(r'\b%s\.' % abbreviation, re.IGNORECASE), expansion) for abbreviation, expansion in [
 	('mrs', 'missus'), # was misspelled 'misess'
 	('mr', 'mister'),
 	('dr', 'doctor'),
 	('st', 'saint'),
 	('co', 'company'),
 	('jr', 'junior'),
 	('maj', 'major'),
 	('gen', 'general'),
 	('drs', 'doctors'),
 	('rev', 'reverend'),
 	('lt', 'lieutenant'),
 	('hon', 'honorable'),
 	('sgt', 'sergeant'),
 	('capt', 'captain'),
 	('esq', 'esquire'),
 	('ltd', 'limited'),
 	('col', 'colonel'),
 	('ft', 'fort'),
 ]]
 def normalize_abbreviations(text):
 	"""Expand every known abbreviation in `text` to its spoken form.
 	Each (pattern, replacement) pair of `_abbreviations` is applied in order;
 	the trailing period of the abbreviation is consumed by the replacement.
 	Returns the rewritten string; text without a listed abbreviation is returned unchanged.
 	"""
 	for regex, replacement in _abbreviations:
 		text = regex.sub(replacement, text)
 	return text
 def _remove_commas(m):
 	"""Return group 1 of match `m` with every comma dropped."""
 	pieces = m.group(1).split(',')
 	return ''.join(pieces)
 def _expand_decimal_point(m):
 	"""Return group 1 of match `m` with each "." spelled out as " point "."""
 	pieces = m.group(1).split('.')
 	return ' point '.join(pieces)
 def _expand_dollars(m):
 	"""Rewrite the dollar amount captured in group 1 of match `m` as "<n> dollar(s), <n> cent(s)".
 	The amount is split on "." into a dollars part and a cents part. A zero or
 	missing part is omitted; if both are zero the result is 'zero dollars'.
 	An amount with more than one "." is returned verbatim with ' dollars' appended.
 	NOTE: the cents digits are read literally, so "1.5" yields 5 cents, not 50.
 	"""
 	raw = m.group(1)
 	# The capturing pattern admits commas; strip them so int() below cannot raise
 	# ValueError on input such as "$,5" or "$1,000".
 	parts = raw.replace(',', '').split('.')
 	if len(parts) > 2:
 		return raw + ' dollars' # Unexpected format
 	dollars = int(parts[0]) if parts[0] else 0
 	cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
 	phrases = []
 	if dollars:
 		phrases.append('%s %s' % (dollars, 'dollar' if dollars == 1 else 'dollars'))
 	if cents:
 		phrases.append('%s %s' % (cents, 'cent' if cents == 1 else 'cents'))
 	if not phrases:
 		return 'zero dollars'
 	return ', '.join(phrases)
 # inflect is optional: when it cannot be set up, `_inflect` is None and the
 # expanders below are simply not defined, so it is not a hard dependency
 try:
 	import inflect
 	_inflect = inflect.engine()
 	def _expand_ordinal(m):
 		"""Return inflect's spelled-out form of the whole text matched by `m`."""
 		matched = m.group(0)
 		return _inflect.number_to_words(matched)
 	def _expand_number(m):
 		"""Return the spelled-out form of the integer matched by `m`.
 		Values strictly between 1000 and 3000 get special handling (2000,
 		2001-2009, exact hundreds, otherwise inflect's group=2 reading with
 		'oh' for zero); everything else is spelled out without "and".
 		"""
 		num = int(m.group(0))
 		if not (1000 < num < 3000):
 			return _inflect.number_to_words(num, andword='')
 		if num == 2000:
 			return 'two thousand'
 		if 2000 < num < 2010:
 			return 'two thousand ' + _inflect.number_to_words(num % 100)
 		if num % 100 == 0:
 			return _inflect.number_to_words(num // 100) + ' hundred'
 		paired = _inflect.number_to_words(num, andword='', zero='oh', group=2)
 		return paired.replace(', ', ' ')
 except Exception:
 	_inflect = None
 # A digit, then one or more digits/commas, then a digit (e.g. "1,234"); group 1 is the whole run.
 _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
 # Digits, a literal ".", then digits (e.g. "3.14"); group 1 is the whole number.
 _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
 # "£" followed by digits/commas ending in a digit; group 1 is the amount without the sign.
 _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
 # "$" followed by digits/dots/commas ending in a digit; group 1 is the amount without the sign.
 _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
 # Digits immediately followed by one of the suffixes st/nd/rd/th.
 _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
 # Any run of one or more digits.
 _number_re = re.compile(r'[0-9]+')
 # Any run of one or more whitespace characters.
 _whitespace_re = re.compile(r'\s+')
 # A single ".", "?" or "!" anchored at the end of the text.
 _end_punct_re = re.compile(r'[\.\?\!]$')
 # Any single one of the characters , ; : ? . ! -
 _aux_punct_re = re.compile(r'[,;:\?\.\!-]')
 def normalize_numbers(text):
 	"""Rewrite numeric expressions in `text` and return the result.
 	In order: commas inside digit runs are removed, pound amounts get a
 	' pounds' suffix, dollar amounts and decimal points are expanded. When
 	`_inflect` is available, ordinals and then remaining digit runs are
 	replaced via `_expand_ordinal` / `_expand_number`; otherwise they are left as is.
 	"""
 	always_applied = (
 		(_comma_number_re, _remove_commas),
 		(_pounds_re, r'\1 pounds'),
 		(_dollars_re, _expand_dollars),
 		(_decimal_number_re, _expand_decimal_point),
 	)
 	for pattern, replacement in always_applied:
 		text = pattern.sub(replacement, text)
 	if _inflect is None:
 		return text
 	# these expanders only exist when inflect was set up, hence the guard above
 	text = _ordinal_re.sub(_expand_ordinal, text)
 	return _number_re.sub(_expand_number, text)
 # full will do aggressive normalization, perfect for WER/CER
 # not full will do basic cleaning
 def normalize_text(text, language="auto", full=True):
 	"""Normalize `text` for comparison or display and return the result.
 	With `full=True` the text is first passed through the module-level
 	`normalizer` when one is available (otherwise just lower-cased), numbers
 	and abbreviations are expanded, the characters , ; : ? . ! - and double
 	quotes are removed, and whitespace is collapsed and trimmed so that
 	spacing differences do not count as errors.
 	With `full=False` only numbers and abbreviations are expanded and runs of
 	whitespace are collapsed to a single space.
 	`language` is accepted but not used yet.
 	"""
 	if full:
 		if normalizer is not None:
 			text = normalizer.normalize_str( text )
 		else:
 			text = text.lower()
 		text = normalize_numbers(text) # expand numbers
 		text = normalize_abbreviations(text) # expand abbreviations
 		text = re.sub(_aux_punct_re, '', text) # remove punctuation
 		text = text.replace('"', '') # remove quotation marks
 		# removing punctuation can leave doubled or dangling spaces; clean them up
 		text = re.sub(_whitespace_re, ' ', text).strip() # collapse whitespace
 	else:
 		text = normalize_numbers(text) # expand numbers
 		text = normalize_abbreviations(text) # expand abbreviations
 		text = re.sub(_whitespace_re, ' ', text) # collapse whitespace
 	# to-do: other languages
 	return text
@cache
 def get_random_prompts( validation=False, min_length=0, tokenized=False ):
--- a/vall_e/demo.py
+++ b/vall_e/demo.py
@ -135,7 +135,7 @@ def main():
 	parser.add_argument("--lora", action="store_true")
 	parser.add_argument("--comparison", type=str, default=None)
-	parser.add_argument("--transcription-model", type=str, default="openai/whisper-base")
+	parser.add_argument("--transcription-model", type=str, default="openai/whisper-large-v3")
 	parser.add_argument("--speaker-similarity-model", type=str, default="microsoft/wavlm-large")
 	args = parser.parse_args()
@ -426,7 +426,8 @@ def main():
 		calculate = not metrics_path.exists() or (metrics_path.stat().st_mtime < out_path.stat().st_mtime)
 		if calculate:
-			wer_score, cer_score = wer( out_path, text, language=language, device=tts.device, dtype=tts.dtype, model_name=args.transcription_model )
+			wer_score, cer_score = wer( out_path, text, language=language, device=tts.device, dtype=tts.dtype, model_name=args.transcription_model, phonemize=True )
 			#wer_score, cer_score = wer( out_path, reference_path, language=language, device=tts.device, dtype=tts.dtype, model_name=args.transcription_model, phonemize=False )
 			sim_o_score = sim_o( out_path, prompt_path, device=tts.device, dtype=tts.dtype, model_name=args.speaker_similarity_model )
 			metrics = {"wer": wer_score, "cer": cer_score, "sim-o": sim_o_score}
--- a/vall_e/metrics.py
+++ b/vall_e/metrics.py
@ -12,13 +12,19 @@ from pathlib import Path
 from torcheval.metrics.functional import word_error_rate
 from torchmetrics.functional.text import char_error_rate
-def wer( audio, reference, language="auto", normalize=True, phonemize=True, **transcription_kwargs ):
+import warnings
 warnings.simplefilter(action='ignore', category=FutureWarning)
 warnings.simplefilter(action='ignore', category=UserWarning)
 def wer( audio, reference, language="auto", phonemize=True, **transcription_kwargs ):
 	if language == "auto":
 		language = detect_language( reference )
 	transcription = transcribe( audio, language=language, align=False, **transcription_kwargs )
 	if language == "auto":
 		language = transcription["language"]
 	transcription = transcription["text"]
 	# reference audio needs transcribing too
@ -29,13 +35,12 @@ def wer( audio, reference, language="auto", normalize=True, phonemize=True, **tr
 		transcription = coerce_to_hiragana( transcription )
 		reference = coerce_to_hiragana( reference )
 	if normalize:
 		transcription = normalize_text( transcription )
 		reference = normalize_text( reference )
 	if phonemize:
 		transcription = encode( transcription, language=language )
 		reference = encode( reference, language=language )
 	else:
 		transcription = normalize_text( transcription, language=language )
 		reference = normalize_text( reference, language=language )
 	wer_score = word_error_rate([transcription], [reference]).item()
 	# un-normalize