Compare commits

...

11 Commits

Author SHA1 Message Date
Georgi Gerganov
2c3f50a021 release : v1.1.1 2023-01-23 20:23:44 +02:00
Georgi Gerganov
9a65269a20 .gitignore : add arm_neon.h 2023-01-23 20:19:04 +02:00
Georgi Gerganov
78f166174f whisper : fix condition for providing past prompt (critical)
This bug has been present since v1.1.0.

Effectively, the past transcribed text wasn't being used for following
transcriptions, which likely significantly reduces the transcription
quality.

Likely related to #419
2023-01-22 10:47:01 +02:00
Georgi Gerganov
21c569ba4a whisper : extend information in whisper_print_timings() 2023-01-19 18:50:33 +02:00
Georgi Gerganov
1a91c19af9 whisper : perform entropy check only when we have at least 32 tokens (#412) 2023-01-18 22:52:18 +02:00
Georgi Gerganov
f583e2d2f5 main : we had accidentally disabled the temperature fallback .. (#291) 2023-01-18 22:51:41 +02:00
Georgi Gerganov
206fc93396 whisper.wasm : add small and small.en models 2023-01-18 21:58:55 +02:00
Georgi Gerganov
a6cf6f4c4a bench : minor fixes 2023-01-18 21:40:10 +02:00
Chia-Hsiang Cheng
472a473fd1 main : add an option to accept optional output filenames (#424)
* Add an option to accept optional output filenames

* Format the file

Co-authored-by: Chia-Hsiang Cheng <gary.chiahsiang.cheng@gmail.com>
2023-01-18 21:26:31 +02:00
Georgi Gerganov
9ba66c2fad stream : fix handling of --step == --length (#416) 2023-01-18 21:22:52 +02:00
Georgi Gerganov
1ccb8a46a5 bench : fix Windows linkage by moving ggml benches in whisper lib .. 2023-01-18 21:19:50 +02:00
14 changed files with 268 additions and 216 deletions

1
.gitignore vendored
View File

@@ -18,6 +18,7 @@ build-sanitize-thread/
/talk
/bench
arm_neon.h
sync.sh
libwhisper.a
libwhisper.so

View File

@@ -1,6 +1,6 @@
cmake_minimum_required (VERSION 3.0)
project(whisper.cpp VERSION 1.1.0)
project(whisper.cpp VERSION 1.1.1)
# Add path to modules
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")

View File

@@ -4,7 +4,7 @@
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)
Stable: [v1.0.4](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.0.4) / Beta: [v1.1.0](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.1.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
Stable: [v1.1.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.1.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

View File

@@ -1,6 +1,6 @@
{
"name": "whisper.cpp",
"version": "1.1.0",
"version": "1.1.1",
"description": "Whisper speech recognition",
"main": "whisper.js",
"scripts": {

File diff suppressed because one or more lines are too long

View File

@@ -1,11 +1,8 @@
#include "ggml.h"
#include "whisper.h"
#include <cstdio>
#include <cstring>
#include <string>
#include <thread>
#include <vector>
// command-line parameters
struct whisper_params {
@@ -53,7 +50,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, "\n");
}
int bench_whisper_encoder(const whisper_params & params) {
int whisper_bench_encoder(const whisper_params & params) {
// whisper init
struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
@@ -96,132 +93,6 @@ int bench_whisper_encoder(const whisper_params & params) {
return 0;
}
int bench_memcpy(const whisper_params & params) {
size_t n = 50;
size_t arr = params.what > 0 ? 1024 : params.what; // trick to avoid compiler optimizations
// 1 GB array
const size_t size = arr*1024llu*1024llu;
char * src = (char *) malloc(size);
char * dst = (char *) malloc(size);
for (size_t i = 0; i < size; i++) src[i] = i;
memcpy(dst, src, size); // heat-up
double tsum = 0.0;
for (size_t i = 0; i < n; i++) {
const int64_t t0 = ggml_time_us();
memcpy(dst, src, size);
const int64_t t1 = ggml_time_us();
tsum += (t1 - t0)*1e-6;
src[0] = rand();
}
fprintf(stderr, "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
// needed to prevent the compile from optimizing the memcpy away
{
double sum = 0.0;
for (size_t i = 0; i < size; i++) sum += dst[i];
fprintf(stderr, "sum: %s\n", sum == -536870910.00 ? "ok" : "error");
}
free(src);
free(dst);
return 0;
}
int bench_ggml_mul_mat(const whisper_params & params) {
const int n_max = 128;
const std::vector<size_t> sizes = {
64, 128, 256, 512, 1024, 2048, 4096,
};
const size_t N_max = sizes.back();
// a: N*N*sizeof(float)
// b: N*N*sizeof(float)
// c: N*N*sizeof(float)
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
for (int j = 0; j < (int) sizes.size(); j++) {
int n_fp16 = 0;
int n_fp32 = 0;
// GFLOPS/s
double s_fp16 = 0.0;
double s_fp32 = 0.0;
const size_t N = sizes[j];
for (int k = 0; k < 2; ++k) {
const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32;
double & s = k == 0 ? s_fp16 : s_fp32;
int & n = k == 0 ? n_fp16 : n_fp32;
struct ggml_init_params gparams = {
/*.mem_size =*/ buf.size(),
/*.mem_buffer =*/ buf.data(),
};
struct ggml_context * ctx0 = ggml_init(gparams);
struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype, N, N);
struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N);
struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b);
struct ggml_cgraph gf = ggml_build_forward(c);
gf.n_threads = params.n_threads;
double tsum = 0.0;
// heat-up
ggml_graph_compute(ctx0, &gf);
for (int i = 0; i < n_max; ++i) {
const int64_t t0 = ggml_time_us();
ggml_graph_compute(ctx0, &gf);
const int64_t t1 = ggml_time_us();
tsum += (t1 - t0)*1e-6;
n++;
if (tsum > 1.0 && n >= 3) {
break;
}
}
ggml_free(ctx0);
s = ((2.0*N*N*N*n)/tsum)*1e-9;
}
fprintf(stderr, "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
N, N, s_fp16, n_fp16, s_fp32, n_fp32);
}
return 0;
}
int main(int argc, char ** argv) {
whisper_params params;
@@ -229,14 +100,12 @@ int main(int argc, char ** argv) {
return 1;
}
ggml_time_init();
int ret = -1;
switch (params.what) {
case 0: ret = bench_whisper_encoder(params); break;
case 1: ret = bench_memcpy(params); break;
case 2: ret = bench_ggml_mul_mat(params); break;
case 0: ret = whisper_bench_encoder(params); break;
case 1: ret = whisper_bench_memcpy(params.n_threads); break;
case 2: ret = whisper_bench_ggml_mul_mat(params.n_threads); break;
default: fprintf(stderr, "error: unknown benchmark: %d\n", params.what); break;
}

View File

@@ -84,6 +84,7 @@ struct whisper_params {
std::string model = "models/ggml-base.en.bin";
std::vector<std::string> fname_inp = {};
std::vector<std::string> fname_outp = {};
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@@ -121,6 +122,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
else if (arg == "-of" || arg == "--output-file") { params.fname_outp.emplace_back(argv[++i]); }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
@@ -144,35 +146,36 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors);
fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms);
fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n);
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "false" : "true");
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors);
fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms);
fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n);
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "false" : "true");
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
fprintf(stderr, "\n");
}
@@ -514,6 +517,7 @@ int main(int argc, char ** argv) {
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
const auto fname_inp = params.fname_inp[f];
const auto fname_outp = f < params.fname_outp.size() && !params.fname_outp[f].empty() ? params.fname_outp[f] : params.fname_inp[f];
std::vector<float> pcmf32; // mono-channel F32 PCM
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
@@ -654,7 +658,6 @@ int main(int argc, char ** argv) {
wparams.greedy.best_of = params.best_of;
wparams.beam_search.beam_size = params.beam_size;
wparams.temperature_inc = -1;
wparams.prompt_tokens = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
@@ -692,31 +695,31 @@ int main(int argc, char ** argv) {
// output to text file
if (params.output_txt) {
const auto fname_txt = fname_inp + ".txt";
const auto fname_txt = fname_outp + ".txt";
output_txt(ctx, fname_txt.c_str());
}
// output to VTT file
if (params.output_vtt) {
const auto fname_vtt = fname_inp + ".vtt";
const auto fname_vtt = fname_outp + ".vtt";
output_vtt(ctx, fname_vtt.c_str());
}
// output to SRT file
if (params.output_srt) {
const auto fname_srt = fname_inp + ".srt";
const auto fname_srt = fname_outp + ".srt";
output_srt(ctx, fname_srt.c_str(), params);
}
// output to WTS file
if (params.output_wts) {
const auto fname_wts = fname_inp + ".wts";
const auto fname_wts = fname_outp + ".wts";
output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
}
// output to CSV file
if (params.output_csv) {
const auto fname_csv = fname_inp + ".csv";
const auto fname_csv = fname_outp + ".csv";
output_csv(ctx, fname_csv.c_str());
}

View File

@@ -423,7 +423,8 @@ int main(int argc, char ** argv) {
return 1;
}
params.keep_ms = std::min(params.keep_ms, params.step_ms); // cannot be more than step_ms
params.keep_ms = std::min(params.keep_ms, params.step_ms);
params.length_ms = std::max(params.length_ms, params.step_ms);
const int n_samples_step = (params.step_ms *1e-3)*WHISPER_SAMPLE_RATE;
const int n_samples_len = (params.length_ms*1e-3)*WHISPER_SAMPLE_RATE;
@@ -432,7 +433,7 @@ int main(int argc, char ** argv) {
const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
const int n_new_line = !use_vad ? params.length_ms / params.step_ms - 1 : 1; // number of steps to print new line
const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line
params.no_timestamps = !use_vad;
params.no_context |= use_vad;

View File

@@ -32,8 +32,8 @@ set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
--bind \
-s USE_PTHREADS=1 \
-s PTHREAD_POOL_SIZE=8 \
-s INITIAL_MEMORY=1024MB \
-s TOTAL_MEMORY=1024MB \
-s INITIAL_MEMORY=1500MB \
-s TOTAL_MEMORY=1500MB \
-s FORCE_FILESYSTEM=1 \
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
${EXTRA_FLAGS} \

View File

@@ -46,10 +46,12 @@
<div id="model">
Whisper model: <span id="model-whisper-status"></span>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-tiny" onclick="loadWhisper('tiny')">tiny (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<button id="fetch-whisper-base" onclick="loadWhisper('base')">base (142 MB)</button>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-tiny" onclick="loadWhisper('tiny')">tiny (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<button id="fetch-whisper-base" onclick="loadWhisper('base')">base (142 MB)</button>
<button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
<button id="fetch-whisper-small" onclick="loadWhisper('small')">small (466 MB)</button>
<span id="fetch-whisper-progress"></span>
<input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
@@ -284,27 +286,33 @@
}
reader.readAsArrayBuffer(file);
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
document.getElementById('fetch-whisper-base' ).style.display = 'none';
document.getElementById('whisper-file' ).style.display = 'none';
document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
document.getElementById('fetch-whisper-tiny-en' ).style.display = 'none';
document.getElementById('fetch-whisper-base-en' ).style.display = 'none';
document.getElementById('fetch-whisper-small-en').style.display = 'none';
document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
document.getElementById('fetch-whisper-base' ).style.display = 'none';
document.getElementById('fetch-whisper-small' ).style.display = 'none';
document.getElementById('whisper-file' ).style.display = 'none';
document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
}
function loadWhisper(model) {
let urls = {
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
'tiny': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.bin',
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
'base': 'https://whisper.ggerganov.com/ggml-model-whisper-base.bin',
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
'tiny': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.bin',
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
'base': 'https://whisper.ggerganov.com/ggml-model-whisper-base.bin',
'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
'small': 'https://whisper.ggerganov.com/ggml-model-whisper-small.bin',
};
let sizes = {
'tiny.en': 75,
'tiny': 75,
'base.en': 142,
'base': 142,
'tiny.en': 75,
'tiny': 75,
'base.en': 142,
'base': 142,
'small.en': 466,
'small': 466,
};
let url = urls[model];
@@ -313,12 +321,14 @@
model_whisper = model;
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
document.getElementById('fetch-whisper-base' ).style.display = 'none';
document.getElementById('whisper-file' ).style.display = 'none';
document.getElementById('model-whisper-status' ).innerHTML = 'loading model: ' + model;
document.getElementById('fetch-whisper-tiny-en' ).style.display = 'none';
document.getElementById('fetch-whisper-base-en' ).style.display = 'none';
document.getElementById('fetch-whisper-small-en').style.display = 'none';
document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
document.getElementById('fetch-whisper-base' ).style.display = 'none';
document.getElementById('fetch-whisper-small' ).style.display = 'none';
document.getElementById('whisper-file' ).style.display = 'none';
document.getElementById('model-whisper-status' ).innerHTML = 'loading model: ' + model;
cbProgress = function(p) {
let el = document.getElementById('fetch-whisper-progress');
@@ -327,12 +337,14 @@
cbCancel = function() {
var el;
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-tiny' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('whisper-file' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('model-whisper-status' ); if (el) el.innerHTML = '';
el = document.getElementById('fetch-whisper-tiny-en' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-tiny' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-small' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('whisper-file' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('model-whisper-status' ); if (el) el.innerHTML = '';
};
loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);

View File

@@ -19,7 +19,7 @@ printf "\n"
./bench -w 1 -t 1 2>&1
printf "\n"
printf "Running ggml_mul_mat benchmark with " $n_threads " threads\n"
printf "Running ggml_mul_mat benchmark with $n_threads threads\n"
printf "\n"
./bench -w 2 -t $n_threads 2>&1

View File

@@ -474,6 +474,12 @@ struct whisper_context {
int64_t t_decode_us = 0;
int64_t t_start_us = 0;
int32_t n_sample = 0; // number of tokens sampled
int32_t n_encode = 0; // number of encoder calls
int32_t n_decode = 0; // number of decoder calls
int32_t n_fail_p = 0; // number of logprob threshold failures
int32_t n_fail_h = 0; // number of entropy threshold failures
ggml_type wtype; // weight type (FP32 or FP16)
whisper_mel mel;
@@ -1620,6 +1626,7 @@ static bool whisper_encode(
ggml_free(ctx0);
wctx.t_encode_us += ggml_time_us() - t_start_us;
wctx.n_encode++;
return true;
}
@@ -1993,6 +2000,7 @@ static bool whisper_decode(
ggml_free(ctx0);
wctx.t_decode_us += ggml_time_us() - t_start_us;
wctx.n_decode++;
return true;
}
@@ -2644,12 +2652,17 @@ whisper_token whisper_token_transcribe(void) {
void whisper_print_timings(struct whisper_context * ctx) {
const int64_t t_end_us = ggml_time_us();
const int32_t n_sample = std::max(1, ctx->n_sample);
const int32_t n_encode = std::max(1, ctx->n_encode);
const int32_t n_decode = std::max(1, ctx->n_decode);
fprintf(stderr, "\n");
fprintf(stderr, "%s: fallbacks = %3d p / %3d h\n", __func__, ctx->n_fail_p, ctx->n_fail_h);
fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us/1000.0f);
fprintf(stderr, "%s: mel time = %8.2f ms\n", __func__, ctx->t_mel_us/1000.0f);
fprintf(stderr, "%s: sample time = %8.2f ms\n", __func__, ctx->t_sample_us/1000.0f);
fprintf(stderr, "%s: encode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_encode_us/1000.0f, ctx->t_encode_us/1000.0f/ctx->model.hparams.n_audio_layer);
fprintf(stderr, "%s: decode time = %8.2f ms / %.2f ms per layer\n", __func__, ctx->t_decode_us/1000.0f, ctx->t_decode_us/1000.0f/ctx->model.hparams.n_text_layer);
fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f*ctx->t_sample_us, n_sample, 1e-3f*ctx->t_sample_us/n_sample);
fprintf(stderr, "%s: encode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f*ctx->t_encode_us, n_encode, 1e-3f*ctx->t_encode_us/n_encode);
fprintf(stderr, "%s: decode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f*ctx->t_decode_us, n_decode, 1e-3f*ctx->t_decode_us/n_decode);
fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
}
@@ -3004,7 +3017,7 @@ static void whisper_process_logits(
}
static whisper_token_data whisper_sample_token(
const whisper_context & ctx,
whisper_context & ctx,
const whisper_decoder & decoder,
bool best) {
whisper_token_data result = {
@@ -3059,6 +3072,8 @@ static whisper_token_data whisper_sample_token(
result.pt = result.p;
}
ctx.n_sample++;
return result;
}
@@ -3127,6 +3142,8 @@ static std::vector<whisper_token_data> whisper_sample_token_topk(
}
}
ctx.n_sample++;
return result;
}
@@ -3432,7 +3449,7 @@ int whisper_full(
prompt.clear();
// if we have already generated some text, use it as a prompt to condition the next generation
if (!prompt_past.empty() && t_cur > 0.5f) {
if (!prompt_past.empty() && t_cur < 0.5f) {
int n_take = std::min(std::min(params.n_max_text_ctx, whisper_n_text_ctx(ctx)/2), int(prompt_past.size()));
prompt = { whisper_token_prev(ctx) };
@@ -3721,11 +3738,12 @@ int whisper_full(
WHISPER_PRINT_DEBUG("%s: decoder %2d: score = %8.5f, result_len = %3d, avg_logprobs = %8.5f, entropy = %8.5f\n",
__func__, j, decoder.sequence.score, decoder.sequence.result_len, decoder.sequence.avg_logprobs, decoder.sequence.entropy);
if (decoder.sequence.result_len > 8 && decoder.sequence.entropy < params.entropy_thold) {
if (decoder.sequence.result_len > 32 && decoder.sequence.entropy < params.entropy_thold) {
WHISPER_PRINT_DEBUG("%s: decoder %2d: failed due to entropy %8.5f < %8.5f\n",
__func__, j, decoder.sequence.entropy, params.entropy_thold);
decoder.failed = true;
ctx->n_fail_h++;
continue;
}
@@ -3747,6 +3765,7 @@ int whisper_full(
if (decoder.failed || decoder.sequence.avg_logprobs < params.logprob_thold) {
success = false;
ctx->n_fail_p++;
}
if (success) {
@@ -3801,6 +3820,7 @@ int whisper_full(
if (tokens_cur[i].id > whisper_token_beg(ctx) && !params.single_segment) {
const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
if (!text.empty()) {
const auto tt0 = params.speed_up ? 2*t0 : t0;
const auto tt1 = params.speed_up ? 2*t1 : t1;
@@ -4059,6 +4079,145 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int
// =================================================================================================
//
// Temporary interface needed for exposing ggml interface
// Will be removed in the future when ggml becomes a separate library
//
WHISPER_API int whisper_bench_memcpy(int n_threads) {
ggml_time_init();
size_t n = 50;
size_t arr = n_threads > 0 ? 1024 : n_threads; // trick to avoid compiler optimizations
// 1 GB array
const size_t size = arr*1024llu*1024llu;
char * src = (char *) malloc(size);
char * dst = (char *) malloc(size);
for (size_t i = 0; i < size; i++) src[i] = i;
memcpy(dst, src, size); // heat-up
double tsum = 0.0;
for (size_t i = 0; i < n; i++) {
const int64_t t0 = ggml_time_us();
memcpy(dst, src, size);
const int64_t t1 = ggml_time_us();
tsum += (t1 - t0)*1e-6;
src[0] = rand();
}
fprintf(stderr, "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
// needed to prevent the compile from optimizing the memcpy away
{
double sum = 0.0;
for (size_t i = 0; i < size; i++) sum += dst[i];
fprintf(stderr, "sum: %s %f\n", sum == -536870910.00 ? "ok" : "error", sum);
}
free(src);
free(dst);
return 0;
}
WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
ggml_time_init();
const int n_max = 128;
const std::vector<size_t> sizes = {
64, 128, 256, 512, 1024, 2048, 4096,
};
const size_t N_max = sizes.back();
// a: N*N*sizeof(float)
// b: N*N*sizeof(float)
// c: N*N*sizeof(float)
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
for (int j = 0; j < (int) sizes.size(); j++) {
int n_fp16 = 0;
int n_fp32 = 0;
// GFLOPS/s
double s_fp16 = 0.0;
double s_fp32 = 0.0;
const size_t N = sizes[j];
for (int k = 0; k < 2; ++k) {
const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32;
double & s = k == 0 ? s_fp16 : s_fp32;
int & n = k == 0 ? n_fp16 : n_fp32;
struct ggml_init_params gparams = {
/*.mem_size =*/ buf.size(),
/*.mem_buffer =*/ buf.data(),
};
struct ggml_context * ctx0 = ggml_init(gparams);
struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype, N, N);
struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N);
struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b);
struct ggml_cgraph gf = ggml_build_forward(c);
gf.n_threads = n_threads;
double tsum = 0.0;
// heat-up
ggml_graph_compute(ctx0, &gf);
for (int i = 0; i < n_max; ++i) {
const int64_t t0 = ggml_time_us();
ggml_graph_compute(ctx0, &gf);
const int64_t t1 = ggml_time_us();
tsum += (t1 - t0)*1e-6;
n++;
if (tsum > 1.0 && n >= 3) {
break;
}
}
ggml_free(ctx0);
s = ((2.0*N*N*N*n)/tsum)*1e-9;
}
fprintf(stderr, "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
N, N, s_fp16, n_fp16, s_fp32, n_fp32);
}
return 0;
}
// =================================================================================================
// =================================================================================================
//
// Experimental stuff below
//

View File

@@ -350,6 +350,13 @@ extern "C" {
// Get the probability of the specified token in the specified segment.
WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
////////////////////////////////////////////////////////////////////////////
// Temporary helpers needed for exposing ggml interface
WHISPER_API int whisper_bench_memcpy(int n_threads);
WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);
#ifdef __cplusplus
}
#endif