mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-05-30 14:38:53 +02:00
* tests : add script to benchmark whisper.cpp on LibriSpeech corpus LibriSpeech is a widely used benchmark dataset for training and testing speech recognition models. This adds a set of scripts to measure the recognition accuracy of whisper.cpp models, following the common benchmark standards. Signed-off-by: Fujimoto Seiji <fujimoto@ceptord.net> * Document how to prepare `whisper-cli` and model files Feedback from Daniel Bevenius. This adds a short code example showing how to prepare the `whisper-cli` command, to make the initial setup step a little bit clearer. Signed-off-by: Fujimoto Seiji <fujimoto@ceptord.net> * tests : Simplify how to set up Python environment Based on feedback from Georgi Gerganov. Instead of setting up a virtual environment in the Makefile, let users set up the Python environment. This is better since users may have their own preferred workflow/toolkit. Signed-off-by: Fujimoto Seiji <fujimoto@ceptord.net> --------- Signed-off-by: Fujimoto Seiji <fujimoto@ceptord.net>
81 lines
2.0 KiB
Python
81 lines
2.0 KiB
Python
import re
|
|
import unicodedata
|
|
|
|
import regex
|
|
|
|
# Non-ASCII letters that are not separated by "NFKD" normalization, mapped by
# hand to an ASCII replacement. Upper-case letters map to upper-case
# replacements and lower-case letters to lower-case ones.
ADDITIONAL_DIACRITICS = {
    "œ": "oe",
    "Œ": "OE",
    "ø": "o",
    "Ø": "O",
    "æ": "ae",
    "Æ": "AE",
    "ß": "ss",
    "ẞ": "SS",
    "đ": "d",
    "Đ": "D",
    "ð": "d",
    "Ð": "D",
    "þ": "th",
    "Þ": "TH",  # upper-case thorn: keep the case, like every other capital here
    "ł": "l",
    "Ł": "L",
}
|
|
|
|
|
|
def remove_symbols_and_diacritics(s: str, keep=""):
    """
    Strip diacritics and turn markers, symbols and punctuation into spaces.

    The input is decomposed with "NFKD" normalization and then rewritten one
    character at a time, using the first rule that applies:

    - a character contained in `keep` is passed through untouched (the check
      is made on the decomposed characters, not on the original input);
    - a character listed in ADDITIONAL_DIACRITICS is replaced by its mapping;
    - a non-spacing mark (category "Mn") is dropped;
    - any other mark, symbol or punctuation character (major category "M",
      "S" or "P") becomes a single space;
    - everything else is kept as is.
    """

    def convert(ch):
        # Explicitly protected characters win over every other rule.
        if ch in keep:
            return ch
        # Letters that NFKD leaves intact get their hand-written replacement.
        if ch in ADDITIONAL_DIACRITICS:
            return ADDITIONAL_DIACRITICS[ch]
        category = unicodedata.category(ch)
        # Combining accents produced by the decomposition simply vanish.
        if category == "Mn":
            return ""
        if category[0] in "MSP":
            return " "
        return ch

    return "".join(map(convert, unicodedata.normalize("NFKD", s)))
|
|
|
|
|
|
def remove_symbols(s: str):
    """
    Turn markers, symbols and punctuation into spaces, keeping diacritics.

    The input is composed with "NFKC" normalization first; every resulting
    character whose Unicode major category is "M" (mark), "S" (symbol) or
    "P" (punctuation) is replaced by one space, and all other characters are
    copied through unchanged.
    """
    pieces = []
    for ch in unicodedata.normalize("NFKC", s):
        major = unicodedata.category(ch)[0]
        pieces.append(" " if major in "MSP" else ch)
    return "".join(pieces)
|
|
|
|
|
|
class BasicTextNormalizer:
    """
    Callable that lower-cases a string and reduces it to plain words.

    Calling an instance lower-cases the text, deletes bracketed and
    parenthesised spans, runs the configured cleaning function, optionally
    separates the text into space-delimited grapheme clusters, and finally
    collapses whitespace runs into single spaces.
    """

    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
        # Pick the cleaning function once so that __call__ only has to invoke it.
        if remove_diacritics:
            self.clean = remove_symbols_and_diacritics
        else:
            self.clean = remove_symbols
        self.split_letters = split_letters

    def __call__(self, s: str):
        text = s.lower()
        # Delete spans opened by "<" or "[" and closed by ">" or "]",
        # delimiters included (the span may be empty).
        text = re.sub(r"[<\[][^>\]]*[>\]]", "", text)
        # Delete non-empty spans enclosed in parentheses, delimiters included.
        text = re.sub(r"\(([^)]+?)\)", "", text)
        # Lower-case again after cleaning, since cleaning normalizes the text.
        text = self.clean(text).lower()

        if self.split_letters:
            # \X is the `regex` module's grapheme-cluster matcher; joining the
            # matches puts a space between every user-perceived character.
            graphemes = regex.findall(r"\X", text, regex.U)
            text = " ".join(graphemes)

        # Replace any successive whitespace characters with a single space.
        return re.sub(r"\s+", " ", text)
|