Mirror of https://github.com/ggerganov/whisper.cpp.git (synced 2025-07-04 00:11:12 +02:00)

Compare commits: v1.5.0 ... quantize-e (6 commits)
Commits: ec96d68402, bebf0da983, 848e54f3ad, 7883d1cae4, ccc85b4ff8, c7606b47df
Submodule bindings/ios updated: b5a163decd...c9d5095f0c
```diff
@@ -9,6 +9,11 @@ static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
     {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
     {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
     {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
+    {"q2_k", GGML_FTYPE_MOSTLY_Q2_K},
+    {"q3_k", GGML_FTYPE_MOSTLY_Q3_K},
+    {"q4_k", GGML_FTYPE_MOSTLY_Q4_K},
+    {"q5_k", GGML_FTYPE_MOSTLY_Q5_K},
+    {"q6_k", GGML_FTYPE_MOSTLY_Q6_K},
 };
 
 void ggml_print_ftypes(FILE * fp) {
```
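With the five k-quant entries in the map, the existing string-based lookup picks them up automatically. Below is a minimal sketch of how a command-line type string reaches this map, assuming the `ggml_parse_ftype()` helper declared next to it in `common-ggml.h` and that it returns `GGML_FTYPE_UNKNOWN` on a miss; the `main` wrapper is illustrative, not part of the patch:

```cpp
#include <cstdio>
#include "common-ggml.h" // assumed to declare ggml_parse_ftype() / ggml_print_ftypes()

int main(int argc, char ** argv) {
    const char * type_str = argc > 1 ? argv[1] : "q5_k";

    // looks the string up in GGML_FTYPE_MAP; expected to return
    // GGML_FTYPE_UNKNOWN when the string is not in the map
    const enum ggml_ftype ftype = ggml_parse_ftype(type_str);
    if (ftype == GGML_FTYPE_UNKNOWN) {
        fprintf(stderr, "unknown ftype '%s', valid options:\n", type_str);
        ggml_print_ftypes(stderr);
        return 1;
    }

    printf("'%s' -> ggml_ftype %d\n", type_str, (int) ftype);
    return 0;
}
```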
```diff
@@ -48,15 +53,15 @@ bool ggml_common_quantize_0(
         case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
         case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
         case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
+        case GGML_FTYPE_MOSTLY_Q2_K: qtype = GGML_TYPE_Q2_K; break;
+        case GGML_FTYPE_MOSTLY_Q3_K: qtype = GGML_TYPE_Q3_K; break;
+        case GGML_FTYPE_MOSTLY_Q4_K: qtype = GGML_TYPE_Q4_K; break;
+        case GGML_FTYPE_MOSTLY_Q5_K: qtype = GGML_TYPE_Q5_K; break;
+        case GGML_FTYPE_MOSTLY_Q6_K: qtype = GGML_TYPE_Q6_K; break;
         case GGML_FTYPE_UNKNOWN:
         case GGML_FTYPE_ALL_F32:
         case GGML_FTYPE_MOSTLY_F16:
         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
-        case GGML_FTYPE_MOSTLY_Q2_K:
-        case GGML_FTYPE_MOSTLY_Q3_K:
-        case GGML_FTYPE_MOSTLY_Q4_K:
-        case GGML_FTYPE_MOSTLY_Q5_K:
-        case GGML_FTYPE_MOSTLY_Q6_K:
             {
                 fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
                 return false;
```
```diff
@@ -167,24 +172,17 @@ bool ggml_common_quantize_0(
 
         switch ((ggml_type) ttype) {
             case GGML_TYPE_Q4_0:
-                {
-                    cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
-                } break;
             case GGML_TYPE_Q4_1:
-                {
-                    cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
-                } break;
             case GGML_TYPE_Q5_0:
-                {
-                    cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
-                } break;
             case GGML_TYPE_Q5_1:
-                {
-                    cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
-                } break;
             case GGML_TYPE_Q8_0:
+            case GGML_TYPE_Q2_K:
+            case GGML_TYPE_Q3_K:
+            case GGML_TYPE_Q4_K:
+            case GGML_TYPE_Q5_K:
+            case GGML_TYPE_Q6_K:
                 {
-                    cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements, hist_cur.data());
                 } break;
             case GGML_TYPE_F32:
             case GGML_TYPE_F16:
```
```diff
@@ -192,11 +190,6 @@ bool ggml_common_quantize_0(
             case GGML_TYPE_I16:
             case GGML_TYPE_I32:
             case GGML_TYPE_Q8_1:
-            case GGML_TYPE_Q2_K:
-            case GGML_TYPE_Q3_K:
-            case GGML_TYPE_Q4_K:
-            case GGML_TYPE_Q5_K:
-            case GGML_TYPE_Q6_K:
             case GGML_TYPE_Q8_K:
             case GGML_TYPE_COUNT:
                 {
```
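The net effect of these two hunks: every row-quantizable type now funnels through a single `ggml_quantize_chunk()` call, with the signature visible in the diff, instead of one `ggml_quantize_qX_Y()` call per type, and the k-quant cases move out of the invalid-type branch. A sketch of the consolidated call path, with output-buffer sizing left to the caller as a stated assumption:

```cpp
#include <cstdint>
#include <vector>
#include "ggml.h"

// Quantize a full tensor in one go via the generic chunk API.
// 'work' must be preallocated large enough for the quantized output and
// 'hist' is the histogram buffer, matching the usage in the diff above.
static size_t quantize_tensor(enum ggml_type ttype,
                              const std::vector<float>   & data_f32,
                              std::vector<char>          & work,
                              std::vector<int64_t>       & hist) {
    // start = 0 and n = nelements quantizes the whole tensor; the chunk
    // API dispatches internally on 'ttype' (Q4_0 ... Q6_K)
    return ggml_quantize_chunk(ttype, data_f32.data(), work.data(),
                               0, (int) data_f32.size(), hist.data());
}
```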
```diff
@@ -162,6 +162,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
         "encoder.conv2.bias",
         "encoder.positional_embedding",
         "decoder.positional_embedding",
+        "decoder.*",
     };
 
     if (!ggml_common_quantize_0(finp, fout, ftype, { ".*" }, to_skip)) {
```
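Adding `"decoder.*"` to `to_skip` means every decoder tensor is now copied through unquantized, which lines up with the branch name `quantize-e` (encoder-only quantization). The skip list holds regex patterns matched against tensor names; a sketch of that matching logic, assuming `std::regex_match` semantics like those used inside `ggml_common_quantize_0`:

```cpp
#include <regex>
#include <string>
#include <vector>

// Returns true when 'name' matches any skip pattern, e.g.
// should_skip("decoder.token_embedding.weight", {"decoder.*"}) == true
static bool should_skip(const std::string & name,
                        const std::vector<std::string> & to_skip) {
    for (const auto & pat : to_skip) {
        if (std::regex_match(name, std::regex(pat))) {
            return true;
        }
    }
    return false;
}
```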
```diff
@@ -53,7 +53,7 @@ struct whisper_params {
     int32_t capture_id = -1;
     int32_t max_tokens = 32;
     int32_t audio_ctx = 0;
-    int32_t n_gpu_layers = 0;
+    int32_t n_gpu_layers = 999;
 
     float vad_thold = 0.6f;
     float freq_thold = 100.0f;
```
```diff
@@ -136,7 +136,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
     fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
     fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
-    fprintf(stderr, " -ngl N, --n-gpu-layers N [%-7s] number of layers to store in VRAM\n", params.n_gpu_layers);
+    fprintf(stderr, " -ngl N, --n-gpu-layers N [%-7d] number of layers to store in VRAM\n", params.n_gpu_layers);
     fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
     fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
     fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
```
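The `%-7s` to `%-7d` change also fixes undefined behavior: `params.n_gpu_layers` is an `int32_t`, and passing it where `printf` expects a `char *` for `%s` is UB that typically prints garbage or crashes.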
```diff
@@ -686,8 +686,8 @@ int main(int argc, char ** argv) {
                 }
             }
 
-            text_to_speak = ::replace(text_to_speak, "\"", "");
-            int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+            text_to_speak = ::replace(text_to_speak, "'", "'\"'\"'");
+            int ret = system((params.speak + " " + std::to_string(voice_id) + " '" + text_to_speak + "'").c_str());
             if (ret != 0) {
                 fprintf(stderr, "%s: failed to speak\n", __func__);
             }
```
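The quoting change is a small hardening of the `system()` call. Previously the text was stripped of double quotes and interpolated inside `"..."`, where the shell still expands `$`, backticks and `\`. Now the text is wrapped in single quotes, and any embedded `'` is rewritten as `'"'"'` (close the single-quoted string, emit a double-quoted `'`, reopen it). For example, `don't` becomes `'don'"'"'t'` on the command line.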
@@ -76,3 +76,27 @@ git clone https://huggingface.co/openai/whisper-medium

```bash
# convert the model to ggml
python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-medium/ ./whisper .
```

## Distilled models

Initial support for https://huggingface.co/distil-whisper is available.

Currently, the chunk-based transcription strategy is not implemented, so there can be sub-optimal quality when using the distilled models with `whisper.cpp`.

```bash
# clone OpenAI whisper and whisper.cpp
git clone https://github.com/openai/whisper
git clone https://github.com/ggerganov/whisper.cpp

# get the models
cd whisper.cpp/models
git clone https://huggingface.co/distil-whisper/distil-medium.en
git clone https://huggingface.co/distil-whisper/distil-large-v2

# convert to ggml
python3 ./convert-h5-to-ggml.py ./distil-medium.en/ ../../whisper .
mv ggml-model.bin ggml-medium.en-distil.bin

python3 ./convert-h5-to-ggml.py ./distil-large-v2/ ../../whisper .
mv ggml-model.bin ggml-large-v2-distil.bin
```
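Once converted, a distilled model should be usable like any other ggml model, e.g. `./main -m models/ggml-medium.en-distil.bin -f samples/jfk.wav`, where the paths assume the default whisper.cpp build and the bundled sample.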
whisper.cpp:
```diff
@@ -850,7 +850,8 @@ struct whisper_context {
     int64_t t_load_us = 0;
     int64_t t_start_us = 0;
 
-    ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX)
+    ggml_type wtype_e = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX) Encoder
+    ggml_type wtype_d = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX) Decoder
     ggml_type itype = ggml_type::GGML_TYPE_F16; // intermediate type (FP32 or FP16)
 
     whisper_context_params params;
```
```diff
@@ -1168,8 +1169,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
     // for the big tensors, we have the option to store the data in 16-bit floats or quantized
     // in order to save memory and also to speed up the computation
-    wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-    if (wctx.wtype == GGML_TYPE_COUNT) {
+    wctx.wtype_e = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+    if (wctx.wtype_e == GGML_TYPE_COUNT) {
         WHISPER_LOG_ERROR("%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
         return false;
     }
```
```diff
@@ -1290,8 +1291,9 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         WHISPER_LOG_INFO("%s: n_langs = %d\n", __func__, vocab.num_languages());
     }
 
-    const ggml_type wtype = wctx.wtype;
-    const ggml_type vtype = wctx.wtype == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv type
+    const ggml_type wtype_e = wctx.wtype_e;
+    const ggml_type wtype_d = wctx.wtype_d;
+    const ggml_type vtype = wctx.wtype_e == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv type
 
     // create the ggml context
     {
```
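Note that in the hunks shown, only `wtype_e` is derived from the file's `ftype`; `wtype_d` is never reassigned and keeps its `GGML_TYPE_F16` default from the struct, which is consistent with the quantizer now skipping `decoder.*` tensors.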
```diff
@@ -1367,24 +1369,24 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
             layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
 
-            layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state);
+            layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype_e, n_audio_state, 4*n_audio_state);
             layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_audio_state);
 
-            layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state);
+            layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype_e, 4*n_audio_state, n_audio_state);
             layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
 
             layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
             layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
 
-            layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
+            layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype_e, n_audio_state, n_audio_state);
             layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
 
-            layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
+            layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype_e, n_audio_state, n_audio_state);
 
-            layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
+            layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype_e, n_audio_state, n_audio_state);
             layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
 
-            layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
+            layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype_e, n_audio_state, n_audio_state);
             layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
 
             // map by name
```
```diff
@@ -1417,7 +1419,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         {
             model.d_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_text_state, n_text_ctx);
 
-            model.d_te = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab);
+            model.d_te = ggml_new_tensor_2d(ctx, wtype_d, n_text_state, n_vocab);
 
             model.d_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
             model.d_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
```
```diff
@@ -1436,38 +1438,38 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
             layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
 
-            layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state);
+            layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype_d, n_text_state, 4*n_text_state);
             layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_text_state);
 
-            layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state);
+            layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype_d, 4*n_text_state, n_text_state);
             layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
 
             layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
             layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
 
-            layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+            layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype_d, n_text_state, n_text_state);
             layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
 
-            layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+            layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype_d, n_text_state, n_text_state);
 
-            layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+            layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype_d, n_text_state, n_text_state);
             layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
 
-            layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+            layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype_d, n_text_state, n_text_state);
             layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
 
             layer.cross_attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
             layer.cross_attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
 
-            layer.cross_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+            layer.cross_attn_q_w = ggml_new_tensor_2d(ctx, wtype_d, n_text_state, n_text_state);
             layer.cross_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
 
-            layer.cross_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+            layer.cross_attn_k_w = ggml_new_tensor_2d(ctx, wtype_d, n_text_state, n_text_state);
 
-            layer.cross_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+            layer.cross_attn_v_w = ggml_new_tensor_2d(ctx, wtype_d, n_text_state, n_text_state);
             layer.cross_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
 
-            layer.cross_attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+            layer.cross_attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype_d, n_text_state, n_text_state);
             layer.cross_attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
 
             // map by name
```
```diff
@@ -6073,7 +6075,7 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
     size_t arr = n_threads > 0 ? 1024llu : n_threads; // trick to avoid compiler optimizations
 
     // 1GB array
-    const size_t size = arr*1e9;
+    const size_t size = arr*1e6;
 
     // single-thread
     {
```
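A quick arithmetic check on this fix: with `arr = 1024`, the new `arr*1e6` allocates about 1.02e9 bytes, roughly the 1 GB the comment promises, whereas the old `arr*1e9` would have attempted an allocation on the order of 1 TB.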