mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-06-30 06:20:07 +02:00
tests : add script to benchmark whisper.cpp on LibriSpeech corpus (#2999)
* tests : add script to benchmark whisper.cpp on LibriSpeech corpus. LibriSpeech is a widely used benchmark dataset for training and testing speech recognition models. This adds a set of scripts to measure the recognition accuracy of whisper.cpp models, following the common benchmark standards. Signed-off-by: Fujimoto Seiji <fujimoto@ceptord.net> * Document how to prepare `whisper-cli` and model files. Feedback from Daniel Bevenius. This adds a short code example showing how to prepare the `whisper-cli` command, to make the initial setup step a little bit clearer. Signed-off-by: Fujimoto Seiji <fujimoto@ceptord.net> * tests : Simplify how to set up Python environment. Based on feedback from Georgi Gerganov. Instead of setting up a virtual environment in the Makefile, let users set up the Python environment. This is better since users may have their own preferred workflow/toolkit. Signed-off-by: Fujimoto Seiji <fujimoto@ceptord.net> --------- Signed-off-by: Fujimoto Seiji <fujimoto@ceptord.net>
This commit is contained in:
47
tests/librispeech/eval.py
Normal file
47
tests/librispeech/eval.py
Normal file
@ -0,0 +1,47 @@
|
||||
import os
|
||||
import glob
|
||||
import jiwer
|
||||
from normalizers import EnglishTextNormalizer
|
||||
|
||||
def get_reference():
    """Load the reference transcripts found under ``LibriSpeech/``.

    Every file matching ``LibriSpeech/*/*/*/*.trans.txt`` (relative to the
    current working directory) is read line by line. Each non-blank line is
    split at its first space into an utterance code and the transcript text.

    Returns:
        dict: utterance code -> reference transcript text.

    Raises:
        ValueError: if a non-blank line contains no space, i.e. it has a
            code but no transcript to split off.
    """
    ref = {}
    for path in glob.glob('LibriSpeech/*/*/*/*.trans.txt'):
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        with open(path, encoding='utf-8') as fp:
            for line in fp:
                line = line.strip()
                if not line:
                    # A blank (or whitespace-only) line carries no
                    # utterance; unpacking it below would raise.
                    continue
                code, text = line.split(" ", maxsplit=1)
                ref[code] = text
    return ref
|
||||
|
||||
def get_hypothesis():
    """Load the hypothesis transcripts found under ``LibriSpeech/``.

    Every file matching ``LibriSpeech/*/*/*/*.flac.txt`` (relative to the
    current working directory) is read in full; its content, stripped of
    leading and trailing whitespace, is the hypothesis for one utterance.
    The utterance code is the file's base name with ``.flac.txt`` removed.

    Returns:
        dict: utterance code -> hypothesis transcript text.
    """
    hyp = {}
    for path in glob.glob('LibriSpeech/*/*/*/*.flac.txt'):
        # Decode explicitly as UTF-8: the locale default (e.g. cp1252) can
        # mis-decode or reject non-ASCII characters in a transcript.
        # NOTE(review): assumes the transcripts are written as UTF-8 - confirm
        # against the tool that produces the *.flac.txt files.
        with open(path, encoding='utf-8') as fp:
            text = fp.read().strip()
        code = os.path.basename(path).replace('.flac.txt', '')
        hyp[code] = text
    return hyp
|
||||
|
||||
def get_codes():
    """Return the sorted utterance codes of all audio files.

    An utterance code is the base name of a file matching
    ``LibriSpeech/*/*/*/*.flac`` (relative to the current working
    directory) with ``.flac`` removed.

    Returns:
        list[str]: utterance codes in ascending order.
    """
    return sorted(
        os.path.basename(path).replace('.flac', '')
        for path in glob.glob('LibriSpeech/*/*/*/*.flac')
    )
|
||||
|
||||
def main():
    """Compute and print the word error rate over the LibriSpeech files.

    For every utterance code returned by ``get_codes()``, the reference and
    hypothesis transcripts are passed through ``EnglishTextNormalizer`` and
    the two resulting lists are handed to ``jiwer.wer``. The result is
    printed as a percentage with two decimals.

    Raises:
        KeyError: if an audio file has no reference or no hypothesis
            transcript.
    """
    normalizer = EnglishTextNormalizer()

    ref_orig = get_reference()
    hyp_orig = get_hypothesis()
    codes = get_codes()

    # Fail with an actionable message instead of a bare KeyError(code) when
    # a transcript is absent for one of the audio files.
    missing = [
        code for code in codes
        if code not in ref_orig or code not in hyp_orig
    ]
    if missing:
        raise KeyError(
            f"{len(missing)} utterance(s) lack a reference or hypothesis "
            f"transcript, e.g. {missing[0]}"
        )

    # Both lists follow the order of `codes`, so entries pair up by index.
    ref_clean = [normalizer(ref_orig[code]) for code in codes]
    hyp_clean = [normalizer(hyp_orig[code]) for code in codes]

    wer = jiwer.wer(ref_clean, hyp_clean)
    print(f"WER: {wer * 100:.2f}%")
|
||||
|
||||
# Run the evaluation only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|
Reference in New Issue
Block a user