2022-01-14 00:08:49 +00:00
|
|
|
import Levenshtein
|
|
|
|
from jiwer import wer, compute_measures
|
2021-10-26 19:30:29 +00:00
|
|
|
import torch
|
|
|
|
from tqdm import tqdm
|
|
|
|
|
2021-12-31 23:21:39 +00:00
|
|
|
from data.audio.voice_tokenizer import VoiceBpeTokenizer
|
2021-10-26 19:30:29 +00:00
|
|
|
|
|
|
|
|
2021-12-31 23:21:39 +00:00
|
|
|
def load_truths(file):
    """Build a lookup of reference transcripts from a ``path|text`` listing.

    Each line of ``file`` (read as UTF-8) is split on ``'|'``. Lines that
    yield exactly two fields contribute one entry: the first field, used
    verbatim as the key, mapped to the second field after it has been passed
    through ``VoiceBpeTokenizer.preprocess_text``. Lines with any other field
    count are echoed to stdout (as the split list) and skipped.

    Returns:
        dict mapping each path string to its preprocessed transcript.
    """
    tokenizer = VoiceBpeTokenizer(None)
    truths_by_path = {}
    with open(file, 'r', encoding='utf-8') as listing:
        for raw_line in listing:
            fields = raw_line.split('|')
            if len(fields) == 2:
                path, transcript = fields
                # The path is kept exactly as written in the listing (no
                # 'wav/' prefix stripping).
                # Reference text goes through the same preprocessing as the
                # training data: punctuation removed, lowercased, redundant
                # whitespace dropped, and "english cleaners" applied (e.g.
                # "mrs" -> "missus").
                truths_by_path[path] = tokenizer.preprocess_text(transcript)
            else:
                # Surface malformed lines so they can be inspected, then move on.
                print(fields)
    return truths_by_path
|
|
|
|
|
|
|
|
|
2021-10-26 19:30:29 +00:00
|
|
|
if __name__ == '__main__':
    # Script entry point: compute the word error rate (as a percentage) of the
    # predictions in `inference_tsv` against the reference transcripts listed
    # in `libri_base`, and print it.
    inference_tsv = 'results.tsv'
    libri_base = 'y:\\bigasr_dataset/librispeech/test_clean/test_clean.txt'

    # Pre-process truth values
    truths = load_truths(libri_base)

    niltok = VoiceBpeTokenizer(None)
    ground_truths = []
    hypotheses = []

    # NOTE(review): this file is opened with the platform default encoding,
    # whereas load_truths() reads its listing as utf-8 -- confirm which
    # encoding the inference output is actually written in.
    with open(inference_tsv, 'r') as tsv_file:
        tsv = tsv_file.read().splitlines()

    # Each row is "<predicted sentence>\t<wav key>"; the wav key must match a
    # key produced by load_truths(), otherwise the lookup raises KeyError.
    for line in tqdm(tsv):
        if not line:
            # An empty row carries no prediction and cannot be unpacked.
            continue
        sentence_pred, wav = line.split('\t')
        # Predictions get the same text preprocessing as the references so
        # the comparison is like-for-like.
        hypotheses.append(niltok.preprocess_text(sentence_pred))
        ground_truths.append(truths[wav])

    # Keep the result under its own name: assigning it back to `wer` would
    # replace the imported jiwer function with a float.
    wer_percent = wer(ground_truths, hypotheses) * 100
    print(f"WER: {wer_percent}")
|