mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-05-30 22:47:22 +02:00
* tests : add a new benchmark test for long-form audio Based on "Earnings-21" corpus by Del Rio et al. Earnings-21: A Practical Benchmark for ASR in the Wild (2021) https://arxiv.org/abs/2104.11348 This dataset contains 39 hours of long-form speech, sourced from public earnings calls. Each recording contains roughly 50 minutes of English dialogues between multiple speakers (2-20 persons). This benchmark suite should allow us to evaluate the performance of whisper.cpp on long-form audio data. Signed-off-by: Fujimoto Seiji <fujimoto@ceptord.net> * tests : apply PR feedback to 'earnings21/README.md' Based on feedback from Daniel Bevenius. - Simplify how to download & prepare a Silero VAD model. - Fix typo: inferece -> inference Signed-off-by: Fujimoto Seiji <fujimoto@ceptord.net> * tests : avoid crashing on non-UTF-8 characters Based on feedback from Daniel Bevenius. Add 'errors' parameter to open() in order to avoid unhandled exception on invalid UTF-8 bytes. Signed-off-by: Fujimoto Seiji <fujimoto@ceptord.net> * tests : try to interpret the hypothesis as Windows-1252 Based on the discussion in PR#3185. Evidently Whisper.cpp can represent a quotation mark as '0x93', which implies Windows-1252 (Microsoft's ASCII extension), and cannot be decoded by UTF-8. Add an explicit decoding loop to address the issue. Signed-off-by: Fujimoto Seiji <fujimoto@ceptord.net> --------- Signed-off-by: Fujimoto Seiji <fujimoto@ceptord.net>
81 lines
2.0 KiB
Python
81 lines
2.0 KiB
Python
import re
|
|
import unicodedata
|
|
|
|
import regex
|
|
|
|
# non-ASCII letters that are not separated by "NFKD" normalization
ADDITIONAL_DIACRITICS = {
    "œ": "oe",
    "Œ": "OE",
    "ø": "o",
    "Ø": "O",
    "æ": "ae",
    "Æ": "AE",
    "ß": "ss",
    "ẞ": "SS",
    "đ": "d",
    "Đ": "D",
    "ð": "d",
    "Ð": "D",
    "þ": "th",
    "Þ": "th",
    "ł": "l",
    "Ł": "L",
}


def remove_symbols_and_diacritics(s: str, keep=""):
    """
    Replace any other markers, symbols, and punctuations with a space,
    and drop any diacritics (category 'Mn' and some manual mappings)
    """

    def _clean_char(ch):
        # Characters the caller explicitly wants to keep pass through untouched.
        if ch in keep:
            return ch
        # Manual mappings for letters that NFKD cannot split into
        # base character + combining mark.
        if ch in ADDITIONAL_DIACRITICS:
            return ADDITIONAL_DIACRITICS[ch]
        category = unicodedata.category(ch)
        # Combining marks (the diacritics themselves) are dropped entirely.
        if category == "Mn":
            return ""
        # Remaining marks, symbols, and punctuation become spaces.
        if category[0] in "MSP":
            return " "
        return ch

    # NFKD first, so each accented letter decomposes into its base letter
    # followed by separate combining characters.
    return "".join(_clean_char(ch) for ch in unicodedata.normalize("NFKD", s))
|
|
|
|
|
|
def remove_symbols(s: str):
    """
    Replace any other markers, symbols, punctuations with a space, keeping diacritics
    """
    # NFKC keeps accented letters composed, so diacritics survive intact.
    chars = []
    for ch in unicodedata.normalize("NFKC", s):
        if unicodedata.category(ch)[0] in "MSP":
            # Marks, symbols, and punctuation are replaced by spaces.
            chars.append(" ")
        else:
            chars.append(ch)
    return "".join(chars)
|
|
|
|
|
|
class BasicTextNormalizer:
    """Normalize a transcript for text comparison: lowercase, strip
    bracketed/parenthesized spans, remove symbols (optionally diacritics
    too), and collapse whitespace."""

    def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
        # Pick the symbol-cleaning strategy once, at construction time.
        if remove_diacritics:
            self.clean = remove_symbols_and_diacritics
        else:
            self.clean = remove_symbols
        self.split_letters = split_letters

    def __call__(self, s: str):
        text = s.lower()
        # remove words between brackets
        text = re.sub(r"[<\[][^>\]]*[>\]]", "", text)
        # remove words between parenthesis
        text = re.sub(r"\(([^)]+?)\)", "", text)
        text = self.clean(text).lower()

        if self.split_letters:
            # \X matches extended grapheme clusters; join them with spaces
            # so every letter becomes its own token.
            text = " ".join(regex.findall(r"\X", text, regex.U))

        # replace any successive whitespace characters with a space
        text = re.sub(r"\s+", " ", text)

        return text
|