forked from extern/whisper.cpp
Compare commits
11 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
2c3f50a021 | ||
|
9a65269a20 | ||
|
78f166174f | ||
|
21c569ba4a | ||
|
1a91c19af9 | ||
|
f583e2d2f5 | ||
|
206fc93396 | ||
|
a6cf6f4c4a | ||
|
472a473fd1 | ||
|
9ba66c2fad | ||
|
1ccb8a46a5 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -18,6 +18,7 @@ build-sanitize-thread/
|
||||
/talk
|
||||
/bench
|
||||
|
||||
arm_neon.h
|
||||
sync.sh
|
||||
libwhisper.a
|
||||
libwhisper.so
|
||||
|
@@ -1,6 +1,6 @@
|
||||
cmake_minimum_required (VERSION 3.0)
|
||||
|
||||
project(whisper.cpp VERSION 1.1.0)
|
||||
project(whisper.cpp VERSION 1.1.1)
|
||||
|
||||
# Add path to modules
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
|
||||
|
@@ -4,7 +4,7 @@
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://www.npmjs.com/package/whisper.cpp/)
|
||||
|
||||
Stable: [v1.0.4](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.0.4) / Beta: [v1.1.0](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.1.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
|
||||
Stable: [v1.1.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.1.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
|
||||
|
||||
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
|
||||
|
||||
|
Submodule bindings/ios updated: f6334b026f...9653b42eb4
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "whisper.cpp",
|
||||
"version": "1.1.0",
|
||||
"version": "1.1.1",
|
||||
"description": "Whisper speech recognition",
|
||||
"main": "whisper.js",
|
||||
"scripts": {
|
||||
|
File diff suppressed because one or more lines are too long
@@ -1,11 +1,8 @@
|
||||
#include "ggml.h"
|
||||
#include "whisper.h"
|
||||
|
||||
#include <cstdio>
|
||||
#include <cstring>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
// command-line parameters
|
||||
struct whisper_params {
|
||||
@@ -53,7 +50,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
int bench_whisper_encoder(const whisper_params & params) {
|
||||
int whisper_bench_encoder(const whisper_params & params) {
|
||||
// whisper init
|
||||
|
||||
struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
|
||||
@@ -96,132 +93,6 @@ int bench_whisper_encoder(const whisper_params & params) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bench_memcpy(const whisper_params & params) {
|
||||
size_t n = 50;
|
||||
size_t arr = params.what > 0 ? 1024 : params.what; // trick to avoid compiler optimizations
|
||||
|
||||
// 1 GB array
|
||||
const size_t size = arr*1024llu*1024llu;
|
||||
|
||||
char * src = (char *) malloc(size);
|
||||
char * dst = (char *) malloc(size);
|
||||
|
||||
for (size_t i = 0; i < size; i++) src[i] = i;
|
||||
|
||||
memcpy(dst, src, size); // heat-up
|
||||
|
||||
double tsum = 0.0;
|
||||
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
const int64_t t0 = ggml_time_us();
|
||||
|
||||
memcpy(dst, src, size);
|
||||
|
||||
const int64_t t1 = ggml_time_us();
|
||||
|
||||
tsum += (t1 - t0)*1e-6;
|
||||
|
||||
src[0] = rand();
|
||||
}
|
||||
|
||||
fprintf(stderr, "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
|
||||
|
||||
// needed to prevent the compile from optimizing the memcpy away
|
||||
{
|
||||
double sum = 0.0;
|
||||
|
||||
for (size_t i = 0; i < size; i++) sum += dst[i];
|
||||
|
||||
fprintf(stderr, "sum: %s\n", sum == -536870910.00 ? "ok" : "error");
|
||||
}
|
||||
|
||||
free(src);
|
||||
free(dst);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int bench_ggml_mul_mat(const whisper_params & params) {
|
||||
const int n_max = 128;
|
||||
|
||||
const std::vector<size_t> sizes = {
|
||||
64, 128, 256, 512, 1024, 2048, 4096,
|
||||
};
|
||||
|
||||
const size_t N_max = sizes.back();
|
||||
|
||||
// a: N*N*sizeof(float)
|
||||
// b: N*N*sizeof(float)
|
||||
// c: N*N*sizeof(float)
|
||||
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
|
||||
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
|
||||
|
||||
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
|
||||
|
||||
for (int j = 0; j < (int) sizes.size(); j++) {
|
||||
int n_fp16 = 0;
|
||||
int n_fp32 = 0;
|
||||
|
||||
// GFLOPS/s
|
||||
double s_fp16 = 0.0;
|
||||
double s_fp32 = 0.0;
|
||||
|
||||
const size_t N = sizes[j];
|
||||
|
||||
for (int k = 0; k < 2; ++k) {
|
||||
const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
||||
|
||||
double & s = k == 0 ? s_fp16 : s_fp32;
|
||||
int & n = k == 0 ? n_fp16 : n_fp32;
|
||||
|
||||
struct ggml_init_params gparams = {
|
||||
/*.mem_size =*/ buf.size(),
|
||||
/*.mem_buffer =*/ buf.data(),
|
||||
};
|
||||
|
||||
struct ggml_context * ctx0 = ggml_init(gparams);
|
||||
|
||||
struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype, N, N);
|
||||
struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N);
|
||||
|
||||
struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b);
|
||||
|
||||
struct ggml_cgraph gf = ggml_build_forward(c);
|
||||
|
||||
gf.n_threads = params.n_threads;
|
||||
|
||||
double tsum = 0.0;
|
||||
|
||||
// heat-up
|
||||
ggml_graph_compute(ctx0, &gf);
|
||||
|
||||
for (int i = 0; i < n_max; ++i) {
|
||||
const int64_t t0 = ggml_time_us();
|
||||
|
||||
ggml_graph_compute(ctx0, &gf);
|
||||
|
||||
const int64_t t1 = ggml_time_us();
|
||||
|
||||
tsum += (t1 - t0)*1e-6;
|
||||
n++;
|
||||
|
||||
if (tsum > 1.0 && n >= 3) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
ggml_free(ctx0);
|
||||
|
||||
s = ((2.0*N*N*N*n)/tsum)*1e-9;
|
||||
}
|
||||
|
||||
fprintf(stderr, "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
|
||||
N, N, s_fp16, n_fp16, s_fp32, n_fp32);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
whisper_params params;
|
||||
|
||||
@@ -229,14 +100,12 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
ggml_time_init();
|
||||
|
||||
int ret = -1;
|
||||
|
||||
switch (params.what) {
|
||||
case 0: ret = bench_whisper_encoder(params); break;
|
||||
case 1: ret = bench_memcpy(params); break;
|
||||
case 2: ret = bench_ggml_mul_mat(params); break;
|
||||
case 0: ret = whisper_bench_encoder(params); break;
|
||||
case 1: ret = whisper_bench_memcpy(params.n_threads); break;
|
||||
case 2: ret = whisper_bench_ggml_mul_mat(params.n_threads); break;
|
||||
default: fprintf(stderr, "error: unknown benchmark: %d\n", params.what); break;
|
||||
}
|
||||
|
||||
|
@@ -84,6 +84,7 @@ struct whisper_params {
|
||||
std::string model = "models/ggml-base.en.bin";
|
||||
|
||||
std::vector<std::string> fname_inp = {};
|
||||
std::vector<std::string> fname_outp = {};
|
||||
};
|
||||
|
||||
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
||||
@@ -121,6 +122,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
|
||||
else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
|
||||
else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
|
||||
else if (arg == "-of" || arg == "--output-file") { params.fname_outp.emplace_back(argv[++i]); }
|
||||
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
||||
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
|
||||
else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
|
||||
@@ -144,35 +146,36 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
||||
fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
||||
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
||||
fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors);
|
||||
fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms);
|
||||
fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n);
|
||||
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
|
||||
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
|
||||
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
|
||||
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
|
||||
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
|
||||
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
|
||||
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
|
||||
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
|
||||
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
||||
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
||||
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
|
||||
fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
|
||||
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
|
||||
fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
|
||||
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
|
||||
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
|
||||
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
||||
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
|
||||
fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
|
||||
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "false" : "true");
|
||||
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
|
||||
fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str());
|
||||
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
||||
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
|
||||
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
||||
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
||||
fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors);
|
||||
fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms);
|
||||
fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n);
|
||||
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
|
||||
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
|
||||
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
|
||||
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
|
||||
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
|
||||
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
|
||||
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
|
||||
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
|
||||
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
||||
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
||||
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
|
||||
fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
|
||||
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
|
||||
fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
|
||||
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
|
||||
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
|
||||
fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
|
||||
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
||||
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
|
||||
fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
|
||||
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "false" : "true");
|
||||
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
|
||||
fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str());
|
||||
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
||||
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
@@ -514,6 +517,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
|
||||
const auto fname_inp = params.fname_inp[f];
|
||||
const auto fname_outp = f < params.fname_outp.size() && !params.fname_outp[f].empty() ? params.fname_outp[f] : params.fname_inp[f];
|
||||
|
||||
std::vector<float> pcmf32; // mono-channel F32 PCM
|
||||
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
|
||||
@@ -654,7 +658,6 @@ int main(int argc, char ** argv) {
|
||||
|
||||
wparams.greedy.best_of = params.best_of;
|
||||
wparams.beam_search.beam_size = params.beam_size;
|
||||
wparams.temperature_inc = -1;
|
||||
|
||||
wparams.prompt_tokens = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
|
||||
wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
|
||||
@@ -692,31 +695,31 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// output to text file
|
||||
if (params.output_txt) {
|
||||
const auto fname_txt = fname_inp + ".txt";
|
||||
const auto fname_txt = fname_outp + ".txt";
|
||||
output_txt(ctx, fname_txt.c_str());
|
||||
}
|
||||
|
||||
// output to VTT file
|
||||
if (params.output_vtt) {
|
||||
const auto fname_vtt = fname_inp + ".vtt";
|
||||
const auto fname_vtt = fname_outp + ".vtt";
|
||||
output_vtt(ctx, fname_vtt.c_str());
|
||||
}
|
||||
|
||||
// output to SRT file
|
||||
if (params.output_srt) {
|
||||
const auto fname_srt = fname_inp + ".srt";
|
||||
const auto fname_srt = fname_outp + ".srt";
|
||||
output_srt(ctx, fname_srt.c_str(), params);
|
||||
}
|
||||
|
||||
// output to WTS file
|
||||
if (params.output_wts) {
|
||||
const auto fname_wts = fname_inp + ".wts";
|
||||
const auto fname_wts = fname_outp + ".wts";
|
||||
output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
|
||||
}
|
||||
|
||||
// output to CSV file
|
||||
if (params.output_csv) {
|
||||
const auto fname_csv = fname_inp + ".csv";
|
||||
const auto fname_csv = fname_outp + ".csv";
|
||||
output_csv(ctx, fname_csv.c_str());
|
||||
}
|
||||
|
||||
|
@@ -423,7 +423,8 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
params.keep_ms = std::min(params.keep_ms, params.step_ms); // cannot be more than step_ms
|
||||
params.keep_ms = std::min(params.keep_ms, params.step_ms);
|
||||
params.length_ms = std::max(params.length_ms, params.step_ms);
|
||||
|
||||
const int n_samples_step = (params.step_ms *1e-3)*WHISPER_SAMPLE_RATE;
|
||||
const int n_samples_len = (params.length_ms*1e-3)*WHISPER_SAMPLE_RATE;
|
||||
@@ -432,7 +433,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
|
||||
|
||||
const int n_new_line = !use_vad ? params.length_ms / params.step_ms - 1 : 1; // number of steps to print new line
|
||||
const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line
|
||||
|
||||
params.no_timestamps = !use_vad;
|
||||
params.no_context |= use_vad;
|
||||
|
@@ -32,8 +32,8 @@ set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
|
||||
--bind \
|
||||
-s USE_PTHREADS=1 \
|
||||
-s PTHREAD_POOL_SIZE=8 \
|
||||
-s INITIAL_MEMORY=1024MB \
|
||||
-s TOTAL_MEMORY=1024MB \
|
||||
-s INITIAL_MEMORY=1500MB \
|
||||
-s TOTAL_MEMORY=1500MB \
|
||||
-s FORCE_FILESYSTEM=1 \
|
||||
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
|
||||
${EXTRA_FLAGS} \
|
||||
|
@@ -46,10 +46,12 @@
|
||||
|
||||
<div id="model">
|
||||
Whisper model: <span id="model-whisper-status"></span>
|
||||
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
|
||||
<button id="fetch-whisper-tiny" onclick="loadWhisper('tiny')">tiny (75 MB)</button>
|
||||
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
|
||||
<button id="fetch-whisper-base" onclick="loadWhisper('base')">base (142 MB)</button>
|
||||
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
|
||||
<button id="fetch-whisper-tiny" onclick="loadWhisper('tiny')">tiny (75 MB)</button>
|
||||
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
|
||||
<button id="fetch-whisper-base" onclick="loadWhisper('base')">base (142 MB)</button>
|
||||
<button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
|
||||
<button id="fetch-whisper-small" onclick="loadWhisper('small')">small (466 MB)</button>
|
||||
<span id="fetch-whisper-progress"></span>
|
||||
|
||||
<input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
|
||||
@@ -284,27 +286,33 @@
|
||||
}
|
||||
reader.readAsArrayBuffer(file);
|
||||
|
||||
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
|
||||
document.getElementById('fetch-whisper-base-en').style.display = 'none';
|
||||
document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
|
||||
document.getElementById('fetch-whisper-base' ).style.display = 'none';
|
||||
document.getElementById('whisper-file' ).style.display = 'none';
|
||||
document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
|
||||
document.getElementById('fetch-whisper-tiny-en' ).style.display = 'none';
|
||||
document.getElementById('fetch-whisper-base-en' ).style.display = 'none';
|
||||
document.getElementById('fetch-whisper-small-en').style.display = 'none';
|
||||
document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
|
||||
document.getElementById('fetch-whisper-base' ).style.display = 'none';
|
||||
document.getElementById('fetch-whisper-small' ).style.display = 'none';
|
||||
document.getElementById('whisper-file' ).style.display = 'none';
|
||||
document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
|
||||
}
|
||||
|
||||
function loadWhisper(model) {
|
||||
let urls = {
|
||||
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
|
||||
'tiny': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.bin',
|
||||
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
|
||||
'base': 'https://whisper.ggerganov.com/ggml-model-whisper-base.bin',
|
||||
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
|
||||
'tiny': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.bin',
|
||||
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
|
||||
'base': 'https://whisper.ggerganov.com/ggml-model-whisper-base.bin',
|
||||
'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
|
||||
'small': 'https://whisper.ggerganov.com/ggml-model-whisper-small.bin',
|
||||
};
|
||||
|
||||
let sizes = {
|
||||
'tiny.en': 75,
|
||||
'tiny': 75,
|
||||
'base.en': 142,
|
||||
'base': 142,
|
||||
'tiny.en': 75,
|
||||
'tiny': 75,
|
||||
'base.en': 142,
|
||||
'base': 142,
|
||||
'small.en': 466,
|
||||
'small': 466,
|
||||
};
|
||||
|
||||
let url = urls[model];
|
||||
@@ -313,12 +321,14 @@
|
||||
|
||||
model_whisper = model;
|
||||
|
||||
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
|
||||
document.getElementById('fetch-whisper-base-en').style.display = 'none';
|
||||
document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
|
||||
document.getElementById('fetch-whisper-base' ).style.display = 'none';
|
||||
document.getElementById('whisper-file' ).style.display = 'none';
|
||||
document.getElementById('model-whisper-status' ).innerHTML = 'loading model: ' + model;
|
||||
document.getElementById('fetch-whisper-tiny-en' ).style.display = 'none';
|
||||
document.getElementById('fetch-whisper-base-en' ).style.display = 'none';
|
||||
document.getElementById('fetch-whisper-small-en').style.display = 'none';
|
||||
document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
|
||||
document.getElementById('fetch-whisper-base' ).style.display = 'none';
|
||||
document.getElementById('fetch-whisper-small' ).style.display = 'none';
|
||||
document.getElementById('whisper-file' ).style.display = 'none';
|
||||
document.getElementById('model-whisper-status' ).innerHTML = 'loading model: ' + model;
|
||||
|
||||
cbProgress = function(p) {
|
||||
let el = document.getElementById('fetch-whisper-progress');
|
||||
@@ -327,12 +337,14 @@
|
||||
|
||||
cbCancel = function() {
|
||||
var el;
|
||||
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
|
||||
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
|
||||
el = document.getElementById('fetch-whisper-tiny' ); if (el) el.style.display = 'inline-block';
|
||||
el = document.getElementById('fetch-whisper-base' ); if (el) el.style.display = 'inline-block';
|
||||
el = document.getElementById('whisper-file' ); if (el) el.style.display = 'inline-block';
|
||||
el = document.getElementById('model-whisper-status' ); if (el) el.innerHTML = '';
|
||||
el = document.getElementById('fetch-whisper-tiny-en' ); if (el) el.style.display = 'inline-block';
|
||||
el = document.getElementById('fetch-whisper-base-en' ); if (el) el.style.display = 'inline-block';
|
||||
el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
|
||||
el = document.getElementById('fetch-whisper-tiny' ); if (el) el.style.display = 'inline-block';
|
||||
el = document.getElementById('fetch-whisper-base' ); if (el) el.style.display = 'inline-block';
|
||||
el = document.getElementById('fetch-whisper-small' ); if (el) el.style.display = 'inline-block';
|
||||
el = document.getElementById('whisper-file' ); if (el) el.style.display = 'inline-block';
|
||||
el = document.getElementById('model-whisper-status' ); if (el) el.innerHTML = '';
|
||||
};
|
||||
|
||||
loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
|
||||
|
@@ -19,7 +19,7 @@ printf "\n"
|
||||
./bench -w 1 -t 1 2>&1
|
||||
|
||||
printf "\n"
|
||||
printf "Running ggml_mul_mat benchmark with " $n_threads " threads\n"
|
||||
printf "Running ggml_mul_mat benchmark with $n_threads threads\n"
|
||||
printf "\n"
|
||||
|
||||
./bench -w 2 -t $n_threads 2>&1
|
||||
|
171
whisper.cpp
171
whisper.cpp
@@ -474,6 +474,12 @@ struct whisper_context {
|
||||
int64_t t_decode_us = 0;
|
||||
int64_t t_start_us = 0;
|
||||
|
||||
int32_t n_sample = 0; // number of tokens sampled
|
||||
int32_t n_encode = 0; // number of encoder calls
|
||||
int32_t n_decode = 0; // number of decoder calls
|
||||
int32_t n_fail_p = 0; // number of logprob threshold failures
|
||||
int32_t n_fail_h = 0; // number of entropy threshold failures
|
||||
|
||||
ggml_type wtype; // weight type (FP32 or FP16)
|
||||
|
||||
whisper_mel mel;
|
||||
@@ -1620,6 +1626,7 @@ static bool whisper_encode(
|
||||
ggml_free(ctx0);
|
||||
|
||||
wctx.t_encode_us += ggml_time_us() - t_start_us;
|
||||
wctx.n_encode++;
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -1993,6 +2000,7 @@ static bool whisper_decode(
|
||||
ggml_free(ctx0);
|
||||
|
||||
wctx.t_decode_us += ggml_time_us() - t_start_us;
|
||||
wctx.n_decode++;
|
||||
|
||||
return true;
|
||||
}
|
||||
@@ -2644,12 +2652,17 @@ whisper_token whisper_token_transcribe(void) {
|
||||
void whisper_print_timings(struct whisper_context * ctx) {
|
||||
const int64_t t_end_us = ggml_time_us();
|
||||
|
||||
const int32_t n_sample = std::max(1, ctx->n_sample);
|
||||
const int32_t n_encode = std::max(1, ctx->n_encode);
|
||||
const int32_t n_decode = std::max(1, ctx->n_decode);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: fallbacks = %3d p / %3d h\n", __func__, ctx->n_fail_p, ctx->n_fail_h);
|
||||
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us/1000.0f);
|
||||
fprintf(stderr, "%s: mel time = %8.2f ms\n", __func__, ctx->t_mel_us/1000.0f);
|
||||
fprintf(stderr, "%s: sample time = %8.2f ms\n", __func__, ctx->t_sample_us/1000.0f);
|
||||
fprintf(stderr, "%s: encode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_encode_us/1000.0f, ctx->t_encode_us/1000.0f/ctx->model.hparams.n_audio_layer);
|
||||
fprintf(stderr, "%s: decode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_decode_us/1000.0f, ctx->t_decode_us/1000.0f/ctx->model.hparams.n_text_layer);
|
||||
fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f*ctx->t_sample_us, n_sample, 1e-3f*ctx->t_sample_us/n_sample);
|
||||
fprintf(stderr, "%s: encode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f*ctx->t_encode_us, n_encode, 1e-3f*ctx->t_encode_us/n_encode);
|
||||
fprintf(stderr, "%s: decode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f*ctx->t_decode_us, n_decode, 1e-3f*ctx->t_decode_us/n_decode);
|
||||
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
|
||||
}
|
||||
|
||||
@@ -3004,7 +3017,7 @@ static void whisper_process_logits(
|
||||
}
|
||||
|
||||
static whisper_token_data whisper_sample_token(
|
||||
const whisper_context & ctx,
|
||||
whisper_context & ctx,
|
||||
const whisper_decoder & decoder,
|
||||
bool best) {
|
||||
whisper_token_data result = {
|
||||
@@ -3059,6 +3072,8 @@ static whisper_token_data whisper_sample_token(
|
||||
result.pt = result.p;
|
||||
}
|
||||
|
||||
ctx.n_sample++;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -3127,6 +3142,8 @@ static std::vector<whisper_token_data> whisper_sample_token_topk(
|
||||
}
|
||||
}
|
||||
|
||||
ctx.n_sample++;
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@@ -3432,7 +3449,7 @@ int whisper_full(
|
||||
prompt.clear();
|
||||
|
||||
// if we have already generated some text, use it as a prompt to condition the next generation
|
||||
if (!prompt_past.empty() && t_cur > 0.5f) {
|
||||
if (!prompt_past.empty() && t_cur < 0.5f) {
|
||||
int n_take = std::min(std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2), int(prompt_past.size()));
|
||||
|
||||
prompt = { whisper_token_prev(ctx) };
|
||||
@@ -3721,11 +3738,12 @@ int whisper_full(
|
||||
WHISPER_PRINT_DEBUG("%s: decoder %2d: score = %8.5f, result_len = %3d, avg_logprobs = %8.5f, entropy = %8.5f\n",
|
||||
__func__, j, decoder.sequence.score, decoder.sequence.result_len, decoder.sequence.avg_logprobs, decoder.sequence.entropy);
|
||||
|
||||
if (decoder.sequence.result_len > 8 && decoder.sequence.entropy < params.entropy_thold) {
|
||||
if (decoder.sequence.result_len > 32 && decoder.sequence.entropy < params.entropy_thold) {
|
||||
WHISPER_PRINT_DEBUG("%s: decoder %2d: failed due to entropy %8.5f < %8.5f\n",
|
||||
__func__, j, decoder.sequence.entropy, params.entropy_thold);
|
||||
|
||||
decoder.failed = true;
|
||||
ctx->n_fail_h++;
|
||||
|
||||
continue;
|
||||
}
|
||||
@@ -3747,6 +3765,7 @@ int whisper_full(
|
||||
|
||||
if (decoder.failed || decoder.sequence.avg_logprobs < params.logprob_thold) {
|
||||
success = false;
|
||||
ctx->n_fail_p++;
|
||||
}
|
||||
|
||||
if (success) {
|
||||
@@ -3801,6 +3820,7 @@ int whisper_full(
|
||||
|
||||
if (tokens_cur[i].id > whisper_token_beg(ctx) && !params.single_segment) {
|
||||
const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
|
||||
|
||||
if (!text.empty()) {
|
||||
const auto tt0 = params.speed_up ? 2*t0 : t0;
|
||||
const auto tt1 = params.speed_up ? 2*t1 : t1;
|
||||
@@ -4059,6 +4079,145 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
//
|
||||
// Temporary interface needed for exposing ggml interface
|
||||
// Will be removed in the future when ggml becomes a separate library
|
||||
//
|
||||
|
||||
// Benchmarks memcpy throughput by repeatedly copying a 1 GB buffer and prints
// the bandwidth (GB/s) plus a checksum verdict to stderr.
//
// n_threads is only read so that the buffer size is derived from a runtime
// value; the copies are plain memcpy calls made from this function.
//
// Returns 0 on success, 1 if the buffer size is zero or an allocation fails.
WHISPER_API int whisper_bench_memcpy(int n_threads) {
    ggml_time_init();

    const size_t n = 50; // number of timed copies

    // trick to avoid compiler optimizations
    const size_t arr = n_threads > 0 ? 1024 : n_threads;

    // 1 GB array
    const size_t size = arr*1024llu*1024llu;

    // n_threads == 0 gives an empty buffer, but the loop below writes src[0]
    if (size == 0) {
        fprintf(stderr, "%s: buffer size is zero\n", __func__);
        return 1;
    }

    char * src = (char *) malloc(size);
    char * dst = (char *) malloc(size);

    // two 1 GB allocations can easily fail (and a negative n_threads wraps to a
    // huge size) - do not dereference null pointers
    if (src == nullptr || dst == nullptr) {
        fprintf(stderr, "%s: failed to allocate 2 x %zu bytes\n", __func__, size);
        free(src); // free(nullptr) is a no-op
        free(dst);
        return 1;
    }

    for (size_t i = 0; i < size; i++) src[i] = i;

    memcpy(dst, src, size); // heat-up

    double tsum = 0.0;

    for (size_t i = 0; i < n; i++) {
        const int64_t t0 = ggml_time_us();

        memcpy(dst, src, size);

        const int64_t t1 = ggml_time_us();

        tsum += (t1 - t0)*1e-6;

        // change the source between copies so that they cannot be merged
        src[0] = rand();
    }

    fprintf(stderr, "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));

    // needed to prevent the compiler from optimizing the memcpy away
    // NOTE(review): the expected constant appears to assume a signed `char` and a
    // particular rand() value having been copied into dst[0]; on other platforms
    // this may print "error" although the copy is fine - confirm before relying on it
    {
        double sum = 0.0;

        for (size_t i = 0; i < size; i++) sum += dst[i];

        fprintf(stderr, "sum: %s %f\n", sum == -536870910.00 ? "ok" : "error", sum);
    }

    free(src);
    free(dst);

    return 0;
}
|
||||
|
||||
// Benchmarks ggml_mul_mat on square N x N matrices for a fixed list of sizes,
// once with F16 and once with F32 as the type of the first operand, and prints
// one GFLOPS line per size to stderr. The graph is computed with n_threads.
// Always returns 0.
WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
    ggml_time_init();

    const int max_runs = 128;

    const std::vector<size_t> dims = { 64, 128, 256, 512, 1024, 2048, 4096 };

    const size_t dim_max = dims.back();

    // a: N*N*sizeof(float)
    // b: N*N*sizeof(float)
    // c: N*N*sizeof(float)
    // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
    std::vector<char> arena(4llu*dim_max*dim_max*sizeof(float) + 4*256);

    // fill the arena with a repeating byte pattern
    for (size_t pos = 0; pos < arena.size(); ++pos) {
        arena[pos] = pos;
    }

    // index 0 -> F16, index 1 -> F32
    const ggml_type weight_types[2] = { GGML_TYPE_F16, GGML_TYPE_F32 };

    for (const size_t N : dims) {
        int    runs[2]   = { 0, 0 };     // timed runs per type
        double gflops[2] = { 0.0, 0.0 }; // GFLOPS/s per type

        for (int t = 0; t < 2; ++t) {
            // the same arena is handed to a fresh context for every measurement
            struct ggml_init_params gparams = {
                /*.mem_size   =*/ arena.size(),
                /*.mem_buffer =*/ arena.data(),
            };

            struct ggml_context * ctx0 = ggml_init(gparams);

            struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, weight_types[t], N, N);
            struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32,   N, N);

            struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b);

            struct ggml_cgraph gf = ggml_build_forward(c);

            gf.n_threads = n_threads;

            double elapsed = 0.0;

            // heat-up
            ggml_graph_compute(ctx0, &gf);

            // run until at least 3 runs took more than a second in total, capped at max_runs
            while (runs[t] < max_runs) {
                const int64_t t0 = ggml_time_us();

                ggml_graph_compute(ctx0, &gf);

                const int64_t t1 = ggml_time_us();

                elapsed += (t1 - t0)*1e-6;
                runs[t]++;

                if (elapsed > 1.0 && runs[t] >= 3) {
                    break;
                }
            }

            ggml_free(ctx0);

            // 2*N^3 floating point operations per multiplication
            gflops[t] = ((2.0*N*N*N*runs[t])/elapsed)*1e-9;
        }

        fprintf(stderr, "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
                N, N, gflops[0], runs[0], gflops[1], runs[1]);
    }

    return 0;
}
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
// =================================================================================================
|
||||
|
||||
//
|
||||
// Experimental stuff below
|
||||
//
|
||||
|
@@ -350,6 +350,13 @@ extern "C" {
|
||||
// Get the probability of the specified token in the specified segment.
|
||||
WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Temporary helpers needed for exposing ggml interface
|
||||
|
||||
WHISPER_API int whisper_bench_memcpy(int n_threads);
|
||||
WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
Reference in New Issue
Block a user