# DL-Art-School/codes/scripts/audio/word_error_rate.py
import Levenshtein
from jiwer import wer, compute_measures
import torch
from tqdm import tqdm
from data.audio.voice_tokenizer import VoiceBpeTokenizer
def load_truths(file):
    """Load ground-truth transcripts from a pipe-delimited truth file.

    Each line is expected to look like ``<audio_path>|<transcript>``. Lines
    that do not split into exactly two fields are printed and skipped so a
    few malformed rows do not abort the whole evaluation.

    Args:
        file: Path to the UTF-8 encoded truth file.

    Returns:
        dict mapping audio path -> normalized transcript text.
    """
    niltok = VoiceBpeTokenizer(None)
    out = {}
    with open(file, 'r', encoding='utf-8') as f:
        # Iterate the file object directly rather than f.readlines() so the
        # entire file is not materialized in memory at once.
        for line in f:
            spl = line.split('|')
            if len(spl) != 2:
                print(spl)
                continue
            path, truth = spl
            # This preprocesses the truth data in the same way that training
            # data is processed: removing punctuation, all lowercase, removing
            # unnecessary whitespace, and applying "english cleaners", which
            # convert words like "mrs" to "missus" and such.
            truth = niltok.preprocess_text(truth)
            out[path] = truth
    return out
2021-10-26 19:30:29 +00:00
if __name__ == '__main__':
    # TSV produced by an inference script; each row is
    # "<predicted_sentence>\t<wav_path>".
    inference_tsv = 'results.tsv'
    libri_base = 'y:\\bigasr_dataset/librispeech/test_clean/test_clean.txt'

    # Pre-process truth values.
    truths = load_truths(libri_base)

    niltok = VoiceBpeTokenizer(None)
    ground_truths = []
    hypotheses = []
    # Use explicit utf-8 to match how the truth file is read, rather than
    # relying on the platform default encoding.
    with open(inference_tsv, 'r', encoding='utf-8') as tsv_file:
        tsv = tsv_file.read().splitlines()
        for line in tqdm(tsv):
            sentence_pred, wav = line.split('\t')
            # Apply the same normalization to predictions as to the truths so
            # the WER comparison is apples-to-apples.
            hypotheses.append(niltok.preprocess_text(sentence_pred))
            # Intentionally raises KeyError if a prediction has no matching
            # truth entry — a silent skip would bias the reported WER.
            ground_truths.append(truths[wav])
    # Store the result under a new name: the original `wer = wer(...)`
    # shadowed the imported jiwer.wer function with a float.
    word_error_rate = wer(ground_truths, hypotheses) * 100
    print(f"WER: {word_error_rate}")