mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-01-14 18:08:38 +01:00
qual-bench.sh : add quality comparison tool, and update main.cpp to allow using a font file (#569)
This commit is contained in:
parent
a3fb6c507f
commit
b597c5a779
@ -80,6 +80,7 @@ struct whisper_params {
|
||||
|
||||
std::string language = "en";
|
||||
std::string prompt;
|
||||
std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
|
||||
std::string model = "models/ggml-base.en.bin";
|
||||
|
||||
std::vector<std::string> fname_inp = {};
|
||||
@ -127,6 +128,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; }
|
||||
else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
|
||||
else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
|
||||
else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; }
|
||||
else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
|
||||
else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
|
||||
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
||||
@ -174,6 +176,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
||||
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
|
||||
fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
|
||||
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
|
||||
fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str());
|
||||
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
|
||||
fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
|
||||
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
||||
@ -368,13 +371,18 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
|
||||
// karaoke video generation
|
||||
// outputs a bash script that uses ffmpeg to generate a video with the subtitles
|
||||
// TODO: font parameter adjustments
|
||||
bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & /*params*/, float t_sec) {
|
||||
bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec) {
|
||||
std::ofstream fout(fname);
|
||||
|
||||
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
|
||||
|
||||
// TODO: become parameter
|
||||
static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
|
||||
static const char * font = params.font_path.c_str();
|
||||
|
||||
std::ifstream fin(font);
|
||||
if (!fin.is_open()) {
|
||||
fprintf(stderr, "%s: font not found at '%s', please specify a monospace font with -fp\n", __func__, font);
|
||||
return false;
|
||||
}
|
||||
|
||||
fout << "#!/bin/bash" << "\n";
|
||||
fout << "\n";
|
||||
|
67
extra/qual-bench.sh
Normal file
67
extra/qual-bench.sh
Normal file
@ -0,0 +1,67 @@
|
||||
#!/usr/bin/env bash
#
# qual-bench.sh : compare transcription speed and output across several models
#
# This script takes two arguments
# - an audio file
# - [optional] path to a font file
#
# I'm using "/usr/share/fonts/truetype/freefont/FreeMono.ttf" on Ubuntu
#
# For every model listed in `models` the script:
#   1. times `./main ... -owts` on the audio file,
#   2. runs the generated .wts script to render the karaoke video,
#   3. renders a caption strip with the model name and timing.
# Finally all strips and videos are stacked vertically into "<audio file>.all.mp4".
#
# Requires: ffmpeg, ffprobe, bc, and ./main with models/ggml-<model>.bin present.

if [ -z "$1" ]; then
    echo "Usage: $0 <audio file> [font file]"
    exit 1
fi

#TODO: Make this a command line parameter
# An array (rather than a space-separated string) iterates the same way in
# bash and zsh; zsh does not word-split unquoted scalars by default.
models=(base small large)
#models=(tiny.en tiny base.en base small.en small medium.en medium large-v1 large)

DURATION=$(ffprobe -i "$1" -show_entries format=duration -v quiet -of csv="p=0")
if [ -z "$DURATION" ]; then
    echo "Error: could not read the duration of '$1' (is ffprobe installed and the file valid?)" >&2
    exit 1
fi
DURATION=$(printf "%.2f" "$DURATION")
echo "Input file duration: ${DURATION}s"

for model in "${models[@]}"; do
    echo "Running $model"

    # Build the command as an array so file names containing spaces survive intact
    cmd=(./main -m "models/ggml-$model.bin" -owts -f "$1" -of "$1.$model")
    if [ -n "$2" ]; then
        cmd+=(-fp "$2")
    fi

    # Drop results of a previous run so a failure below cannot silently reuse them
    rm -f "$1.$model.wts" "$1.$model.mp4"

    # TIMEFMT is for zsh, TIMEFORMAT is for bash
    EXECTIME=$({ TIMEFMT="%E";TIMEFORMAT=%E; time "${cmd[@]}" >/dev/null 2>&1; } 2>&1)

    # Slightly different formats between zsh and bash: zsh appends a trailing "s"
    EXECTIME=${EXECTIME%s}

    if [ ! -f "$1.$model.wts" ]; then
        echo "Error: '${cmd[*]}' did not produce '$1.$model.wts'" >&2
        exit 1
    fi

    RATIO=$(echo "$DURATION / $EXECTIME" | bc -l)
    RATIO=$(printf "%.2f" "$RATIO")

    echo "Execution time: ${EXECTIME}s (${RATIO}x realtime)"

    # The .wts script renders to "<audio file>.mp4"; rename it per model
    bash "$1.$model.wts" >/dev/null 2>&1
    mv "$1.mp4" "$1.$model.mp4"

    # Caption strip; only pass fontfile when a font was actually supplied
    drawtext="fontsize=36:x=10:y=(h-text_h)/2:text='ggml-$model - ${EXECTIME}s (${RATIO}x realtime)':fontcolor=lightgrey"
    if [ -n "$2" ]; then
        drawtext="fontfile='$2':$drawtext"
    fi
    ffmpeg -y -f lavfi -i "color=c=black:s=1200x50:d=$DURATION" -vf "drawtext=$drawtext" "$1.$model.info.mp4" >/dev/null 2>&1
done

# Stack every (caption strip, karaoke video) pair vertically into one file
inputs=()
filter=""
COUNT=0
for model in "${models[@]}"; do
    inputs+=(-i "$1.$model.info.mp4" -i "$1.$model.mp4")
    filter="$filter[${COUNT}:v][$((COUNT + 1)):v]"
    COUNT=$((COUNT + 2))
done
filter="$filter vstack=inputs=${COUNT}[v]"

# Audio is taken from input #1, i.e. the first model's karaoke video
final_cmd=(ffmpeg -y "${inputs[@]}" -filter_complex "$filter" -map "[v]" -map 1:a "$1.all.mp4")

echo "${final_cmd[*]}"

# Run the command directly (no eval) so file names are never re-parsed by the shell
"${final_cmd[@]}" >/dev/null 2>&1
|
Loading…
Reference in New Issue
Block a user