From b597c5a779e1086bded7d06e0f46112b2f688989 Mon Sep 17 00:00:00 2001 From: venkr Date: Mon, 6 Mar 2023 09:18:11 -0800 Subject: [PATCH] qual-bench.sh : add quality comparison tool, and update main.cpp to allow using a font file (#569) --- examples/main/main.cpp | 14 +++++++-- extra/qual-bench.sh | 67 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 3 deletions(-) create mode 100644 extra/qual-bench.sh diff --git a/examples/main/main.cpp b/examples/main/main.cpp index cd7d9283..4118989b 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -80,6 +80,7 @@ struct whisper_params { std::string language = "en"; std::string prompt; + std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf"; std::string model = "models/ggml-base.en.bin"; std::vector fname_inp = {}; @@ -127,6 +128,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; } else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; } else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; } + else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; } else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; } else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); } else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } @@ -174,6 +176,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false"); fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false"); fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false"); + fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str()); fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false"); fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", ""); fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false"); @@ -368,13 +371,18 @@ bool output_csv(struct whisper_context * ctx, const char * fname) { // karaoke video generation // outputs a bash script that uses ffmpeg to generate a video with the subtitles // TODO: font parameter adjustments -bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & /*params*/, float t_sec) { +bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec) { std::ofstream fout(fname); fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname); - // TODO: become parameter - static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf"; + static const char * font = params.font_path.c_str(); + + std::ifstream fin(font); + if (!fin.is_open()) { + fprintf(stderr, "%s: font not found at '%s', please specify a monospace font with -fp\n", __func__, font); + return false; + } fout << "#!/bin/bash" << "\n"; fout << "\n"; diff --git a/extra/qual-bench.sh b/extra/qual-bench.sh new file mode 100644 index 00000000..afb4ea5c --- /dev/null +++ b/extra/qual-bench.sh @@ -0,0 +1,67 @@ +# This script takes two arguments +# - an audio file +# - [optional] path to a font file + +# I'm using "/usr/share/fonts/truetype/freefont/FreeMono.ttf" on Ubuntu + +if [ -z "$1" ]; then + echo "Usage: $0