Mirror of https://github.com/ggerganov/whisper.cpp.git (synced 2025-07-01 15:00:31 +02:00)

Compare commits: batched...quantize-e (13 commits)

Commits:
ec96d68402
bebf0da983
848e54f3ad
7883d1cae4
ccc85b4ff8
c7606b47df
d38af151a1
94267df08e
8713c67133
57a60639bb
bfbaa4dce5
1d79e78402
b6c5f49b78
CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required (VERSION 3.5)

-project(whisper.cpp VERSION 1.4.3)
+project(whisper.cpp VERSION 1.5.0)

 # Add path to modules
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
Makefile (4 changed lines)
@@ -418,9 +418,9 @@ samples:
 .PHONY: medium
 .PHONY: large-v1
 .PHONY: large-v2
-.PHONY: large
+.PHONY: large-v3

-tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large: main
+tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3: main
 	bash ./models/download-ggml-model.sh $@
 	@echo ""
 	@echo "==============================================="
README.md (18 changed lines)
@@ -6,7 +6,7 @@
 [](https://opensource.org/licenses/MIT)
 [](https://www.npmjs.com/package/whisper.cpp/)

-Beta: [v1.4.3](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.4.3) / Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+Stable: [v1.5.0](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.5.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)

 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

@@ -231,18 +231,18 @@ make medium.en
 make medium
 make large-v1
 make large-v2
-make large
+make large-v3
 ```

 ## Memory usage

-| Model  | Disk   | Mem     | SHA                                        |
-| ---    | ---    | ---     | ---                                        |
-| tiny   | 75 MB  | ~125 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
-| base   | 142 MB | ~210 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
-| small  | 466 MB | ~600 MB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
-| medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
-| large  | 2.9 GB | ~3.3 GB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
+| Model  | Disk    | Mem     |
+| ---    | ---     | ---     |
+| tiny   | 75 MiB  | ~273 MB |
+| base   | 142 MiB | ~388 MB |
+| small  | 466 MiB | ~852 MB |
+| medium | 1.5 GiB | ~2.1 GB |
+| large  | 2.9 GiB | ~3.9 GB |

 ## Quantization
@@ -24,7 +24,7 @@ const (

 var (
 	// The models which will be downloaded, if no model is specified as an argument
-	modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large"}
+	modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large-v3"}
 )

 var (
Submodule bindings/ios updated: 9752de4100...c9d5095f0c
@@ -45,7 +45,7 @@ class WhisperCppTest {
         assertEquals(16384, params.n_max_text_ctx);
         assertFalse(params.translate);
         assertEquals(0.01f, params.thold_pt);
-        assertEquals(2, params.beam_search.beam_size);
+        assertEquals(5, params.beam_search.beam_size);
         assertEquals(-1.0f, params.beam_search.patience);
     }

@@ -58,7 +58,7 @@ class WhisperCppTest {
         assertEquals(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY.ordinal(), params.strategy);
         assertNotEquals(0, params.n_threads);
         assertEquals(16384, params.n_max_text_ctx);
-        assertEquals(2, params.greedy.best_of);
+        assertEquals(5, params.greedy.best_of);
     }

     @Test
@@ -1,6 +1,6 @@
 {
   "name": "whisper.cpp",
-  "version": "1.4.3",
+  "version": "1.5.0",
   "description": "Whisper speech recognition",
   "main": "whisper.js",
   "scripts": {
File diff suppressed because one or more lines are too long
@@ -9,6 +9,11 @@ static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
     {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
     {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
     {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
+    {"q2_k", GGML_FTYPE_MOSTLY_Q2_K},
+    {"q3_k", GGML_FTYPE_MOSTLY_Q3_K},
+    {"q4_k", GGML_FTYPE_MOSTLY_Q4_K},
+    {"q5_k", GGML_FTYPE_MOSTLY_Q5_K},
+    {"q6_k", GGML_FTYPE_MOSTLY_Q6_K},
 };

 void ggml_print_ftypes(FILE * fp) {

@@ -48,15 +53,15 @@ bool ggml_common_quantize_0(
         case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
         case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
         case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
+        case GGML_FTYPE_MOSTLY_Q2_K: qtype = GGML_TYPE_Q2_K; break;
+        case GGML_FTYPE_MOSTLY_Q3_K: qtype = GGML_TYPE_Q3_K; break;
+        case GGML_FTYPE_MOSTLY_Q4_K: qtype = GGML_TYPE_Q4_K; break;
+        case GGML_FTYPE_MOSTLY_Q5_K: qtype = GGML_TYPE_Q5_K; break;
+        case GGML_FTYPE_MOSTLY_Q6_K: qtype = GGML_TYPE_Q6_K; break;
        case GGML_FTYPE_UNKNOWN:
        case GGML_FTYPE_ALL_F32:
        case GGML_FTYPE_MOSTLY_F16:
        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
-       case GGML_FTYPE_MOSTLY_Q2_K:
-       case GGML_FTYPE_MOSTLY_Q3_K:
-       case GGML_FTYPE_MOSTLY_Q4_K:
-       case GGML_FTYPE_MOSTLY_Q5_K:
-       case GGML_FTYPE_MOSTLY_Q6_K:
            {
                fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
                return false;

@@ -167,24 +172,17 @@ bool ggml_common_quantize_0(

            switch ((ggml_type) ttype) {
                case GGML_TYPE_Q4_0:
-                   {
-                       cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
-                   } break;
                case GGML_TYPE_Q4_1:
-                   {
-                       cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
-                   } break;
                case GGML_TYPE_Q5_0:
-                   {
-                       cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
-                   } break;
                case GGML_TYPE_Q5_1:
-                   {
-                       cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
-                   } break;
                case GGML_TYPE_Q8_0:
+               case GGML_TYPE_Q2_K:
+               case GGML_TYPE_Q3_K:
+               case GGML_TYPE_Q4_K:
+               case GGML_TYPE_Q5_K:
+               case GGML_TYPE_Q6_K:
                    {
-                       cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                       cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements, hist_cur.data());
                    } break;
                case GGML_TYPE_F32:
                case GGML_TYPE_F16:

@@ -192,11 +190,6 @@ bool ggml_common_quantize_0(
                case GGML_TYPE_I16:
                case GGML_TYPE_I32:
                case GGML_TYPE_Q8_1:
-               case GGML_TYPE_Q2_K:
-               case GGML_TYPE_Q3_K:
-               case GGML_TYPE_Q4_K:
-               case GGML_TYPE_Q5_K:
-               case GGML_TYPE_Q6_K:
                case GGML_TYPE_Q8_K:
                case GGML_TYPE_COUNT:
                    {
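The hunks above extend the ftype map with the k-quant types and replace the per-type ggml_quantize_qX_Y() calls with a single ggml_quantize_chunk() dispatch. A minimal sketch of how the two pieces fit together, assuming only the signature visible in the diff; the helper below is illustrative, not part of the change:

```cpp
// Sketch only: quantize one row-major f32 buffer with the unified entry point.
// Assumes ggml.h declares ggml_quantize_chunk(type, src, dst, start, n, hist)
// with the argument order used in the hunk above.
#include <cstdint>
#include <vector>
#include "ggml.h"

static size_t quantize_tensor_data(enum ggml_type qtype,
                                   const std::vector<float> & src,
                                   std::vector<uint8_t> & dst) {
    std::vector<int64_t> hist(16, 0);       // per-bucket histogram, as in the example code
    dst.resize(src.size() * sizeof(float)); // worst-case size; quantized output is smaller
    // one call now covers q4_0 .. q8_0 as well as the newly mapped q2_k .. q6_k types
    return ggml_quantize_chunk(qtype, src.data(), dst.data(), 0, (int) src.size(), hist.data());
}
```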
@@ -48,7 +48,7 @@ if [ -n "$3" ]; then
 fi

 # Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )

 # list available models
 function list_models {
@@ -162,6 +162,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
         "encoder.conv2.bias",
         "encoder.positional_embedding",
         "decoder.positional_embedding",
+        "decoder.*",
     };

     if (!ggml_common_quantize_0(finp, fout, ftype, { ".*" }, to_skip)) {
@@ -53,7 +53,7 @@ struct whisper_params {
     int32_t capture_id = -1;
     int32_t max_tokens = 32;
     int32_t audio_ctx = 0;
-    int32_t n_gpu_layers = 0;
+    int32_t n_gpu_layers = 999;

     float vad_thold = 0.6f;
     float freq_thold = 100.0f;

@@ -136,7 +136,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
     fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
     fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
-    fprintf(stderr, " -ngl N, --n-gpu-layers N [%-7s] number of layers to store in VRAM\n", params.n_gpu_layers);
+    fprintf(stderr, " -ngl N, --n-gpu-layers N [%-7d] number of layers to store in VRAM\n", params.n_gpu_layers);
     fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
     fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
     fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");

@@ -686,8 +686,8 @@ int main(int argc, char ** argv) {
             }
         }

-        text_to_speak = ::replace(text_to_speak, "\"", "");
-        int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+        text_to_speak = ::replace(text_to_speak, "'", "'\"'\"'");
+        int ret = system((params.speak + " " + std::to_string(voice_id) + " '" + text_to_speak + "'").c_str());
         if (ret != 0) {
             fprintf(stderr, "%s: failed to speak\n", __func__);
         }
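The last hunk above switches the speak command from double quotes (with embedded double quotes stripped) to single quotes with the standard '"'"' escape for anything the user said. A small sketch of that quoting rule, assuming a POSIX shell behind system(); the helper name and command are illustrative, not from the diff:

```cpp
// Sketch: wrap text in single quotes for /bin/sh and escape embedded single
// quotes as '"'"' (close quote, double-quoted ', reopen quote), mirroring the
// ::replace() call in the hunk above.
#include <cstdlib>
#include <string>

static std::string sh_single_quote(const std::string & s) {
    std::string out = "'";
    for (const char c : s) {
        if (c == '\'') {
            out += "'\"'\"'";
        } else {
            out += c;
        }
    }
    out += "'";
    return out;
}

// usage (command name is a placeholder):
// int ret = system(("./speak-command 1 " + sh_single_quote(text_to_speak)).c_str());
```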
|
@ -21,7 +21,7 @@ help()
|
||||
echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]"
|
||||
echo "options:"
|
||||
echo "-s Step in seconds (default is $step)."
|
||||
echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large' (default is '$model')."
|
||||
echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large-v3' (default is '$model')."
|
||||
echo "-t Number of threads to use."
|
||||
echo "-h Print this help page."
|
||||
echo
|
||||
|
@ -17,12 +17,12 @@ else
|
||||
encoder_only=$2
|
||||
fi
|
||||
|
||||
models=( \
|
||||
"tiny" "tiny-q4_0" "tiny-q4_1" "tiny-q5_0" "tiny-q5_1" "tiny-q8_0" \
|
||||
"base" "base-q4_0" "base-q4_1" "base-q5_0" "base-q5_1" "base-q8_0" \
|
||||
"small" "small-q4_0" "small-q4_1" "small-q5_0" "small-q5_1" "small-q8_0" \
|
||||
"medium" "medium-q4_0" "medium-q4_1" "medium-q5_0" "medium-q5_1" "medium-q8_0" \
|
||||
"large" "large-q4_0" "large-q4_1" "large-q5_0" "large-q5_1" "large-q8_0" \
|
||||
models=( \
|
||||
"tiny" "tiny-q4_0" "tiny-q4_1" "tiny-q5_0" "tiny-q5_1" "tiny-q8_0" \
|
||||
"base" "base-q4_0" "base-q4_1" "base-q5_0" "base-q5_1" "base-q8_0" \
|
||||
"small" "small-q4_0" "small-q4_1" "small-q5_0" "small-q5_1" "small-q8_0" \
|
||||
"medium" "medium-q4_0" "medium-q4_1" "medium-q5_0" "medium-q5_1" "medium-q8_0" "medium-dis" \
|
||||
"large-v2" "large-v2-q4_0" "large-v2-q4_1" "large-v2-q5_0" "large-v2-q5_1" "large-v2-q8_0" "large-v2-dis" \
|
||||
)
|
||||
|
||||
if [ "$encoder_only" -eq 0 ]; then
|
||||
@ -44,8 +44,8 @@ if [ "$encoder_only" -eq 0 ]; then
|
||||
printf "\n"
|
||||
fi
|
||||
|
||||
printf "| %6s | %6s | %16s | %11s | %3s | %7s | %7s | %7s | %7s | %7s |\n" "CPU" "OS" "Config" "Model" "Th" "Enc." "Dec." "Bch5" "PP" "Commit"
|
||||
printf "| %6s | %6s | %16s | %11s | %3s | %7s | %7s | %7s | %7s | %7s |\n" "---" "---" "---" "---" "---" "---" "---" "---" "---" "---"
|
||||
printf "| %6s | %6s | %16s | %13s | %3s | %7s | %7s | %7s | %7s | %7s |\n" "CPU" "OS" "Config" "Model" "Th" "Enc." "Dec." "Bch5" "PP" "Commit"
|
||||
printf "| %6s | %6s | %16s | %13s | %3s | %7s | %7s | %7s | %7s | %7s |\n" "---" "---" "---" "---" "---" "---" "---" "---" "---" "---"
|
||||
|
||||
for model in "${models[@]}"; do
|
||||
# actual run
|
||||
@ -95,6 +95,6 @@ for model in "${models[@]}"; do
|
||||
commit=$(git rev-parse --short HEAD)
|
||||
|
||||
if [ $ret -eq 0 ]; then
|
||||
printf "| <todo> | <todo> | %16s | %11s | %3s | %7s | %7s | %7s | %7s | %7s |\n" "$config" "$model" "$n_threads" "$encode_time" "$decode_time" "$batchd_time" "$prompt_time" "$commit"
|
||||
printf "| <todo> | <todo> | %16s | %13s | %3s | %7s | %7s | %7s | %7s | %7s |\n" "$config" "$model" "$n_threads" "$encode_time" "$decode_time" "$batchd_time" "$prompt_time" "$commit"
|
||||
fi
|
||||
done
|
||||
|
@ -1,6 +1,6 @@
|
||||
#!/bin/bash
|
||||
|
||||
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
|
||||
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )
|
||||
|
||||
for model in "${models[@]}"; do
|
||||
python3 models/convert-pt-to-ggml.py ~/.cache/whisper/$model.pt ../whisper models/
|
||||
|
ggml-metal.m (18 changed lines)
@@ -346,9 +346,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
     }

     GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
-    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
     if (ctx->device.maxTransferRate != 0) {
-        GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
+        GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
     } else {
         GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__);
     }

@@ -541,11 +541,11 @@ bool ggml_metal_add_buffer(
         ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];

         if (ctx->buffers[ctx->n_buffers].metal == nil) {
-            GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
+            GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1e6);
             return false;
         }

-        GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
+        GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1e6);

         ++ctx->n_buffers;
     } else {

@@ -565,11 +565,11 @@ bool ggml_metal_add_buffer(
             ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];

             if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
+                GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1e6);
                 return false;
             }

-            GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
+            GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1e6, i);
             if (i + size_step < size) {
                 GGML_METAL_LOG_INFO("\n");
             }

@@ -580,8 +580,8 @@ bool ggml_metal_add_buffer(

 #if TARGET_OS_OSX
         GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
-            ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
-            ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+            ctx->device.currentAllocatedSize / 1e6,
+            ctx->device.recommendedMaxWorkingSetSize / 1e6);

         if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
             GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);

@@ -589,7 +589,7 @@ bool ggml_metal_add_buffer(
             GGML_METAL_LOG_INFO("\n");
         }
 #else
-        GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
+        GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1e6);
 #endif
     }
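All of these logging changes (and the matching ones in whisper.cpp further down) swap the 1024.0 / 1024.0 divisor for 1e6, so the printed sizes are decimal megabytes rather than mebibytes. A tiny illustration with a made-up byte count:

```cpp
// Illustration only: the same byte count printed with the old and the new divisor.
#include <cstdio>

int main() {
    const double size_bytes = 147951616.0;                    // example value, not from the diff
    std::printf("%8.2f MiB\n", size_bytes / 1024.0 / 1024.0); // ~141.10 MiB (old style)
    std::printf("%8.2f MB\n",  size_bytes / 1e6);             // ~147.95 MB  (new style)
    return 0;
}
```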
|
@ -39,19 +39,19 @@ https://huggingface.co/ggerganov/whisper.cpp/tree/main
|
||||
|
||||
## Available models
|
||||
|
||||
| Model | Disk | Mem | SHA |
|
||||
| --- | --- | --- | --- |
|
||||
| tiny | 75 MB | ~390 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
|
||||
| tiny.en | 75 MB | ~390 MB | `c78c86eb1a8faa21b369bcd33207cc90d64ae9df` |
|
||||
| base | 142 MB | ~500 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
|
||||
| base.en | 142 MB | ~500 MB | `137c40403d78fd54d454da0f9bd998f78703390c` |
|
||||
| small | 466 MB | ~1.0 GB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
|
||||
| small.en | 466 MB | ~1.0 GB | `db8a495a91d927739e50b3fc1cc4c6b8f6c2d022` |
|
||||
| medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
|
||||
| medium.en | 1.5 GB | ~2.6 GB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` |
|
||||
| large-v1 | 2.9 GB | ~4.7 GB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
|
||||
| large-v2 | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
|
||||
| large | 2.9 GB | ~4.7 GB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
|
||||
| Model | Disk | SHA |
|
||||
| --- | --- | --- |
|
||||
| tiny | 75 MiB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
|
||||
| tiny.en | 75 MiB | `c78c86eb1a8faa21b369bcd33207cc90d64ae9df` |
|
||||
| base | 142 MiB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
|
||||
| base.en | 142 MiB | `137c40403d78fd54d454da0f9bd998f78703390c` |
|
||||
| small | 466 MiB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
|
||||
| small.en | 466 MiB | `db8a495a91d927739e50b3fc1cc4c6b8f6c2d022` |
|
||||
| medium | 1.5 GiB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
|
||||
| medium.en | 1.5 GiB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` |
|
||||
| large-v1 | 2.9 GiB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
|
||||
| large-v2 | 2.9 GiB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
|
||||
| large-v3 | 2.9 GiB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
|
||||
|
||||
## Model files for testing purposes
|
||||
|
||||
@ -76,3 +76,27 @@ git clone https://huggingface.co/openai/whisper-medium
|
||||
# convert the model to ggml
|
||||
python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-medium/ ./whisper .
|
||||
```
|
||||
|
||||
## Distilled models
|
||||
|
||||
Initial support for https://huggingface.co/distil-whisper is available.
|
||||
|
||||
Currently, the chunk-based transcription strategy is not implemented, so there can be sub-optimal quality when using the distilled models with `whisper.cpp`.
|
||||
|
||||
```bash
|
||||
# clone OpenAI whisper and whisper.cpp
|
||||
git clone https://github.com/openai/whisper
|
||||
git clone https://github.com/ggerganov/whisper.cpp
|
||||
|
||||
# get the models
|
||||
cd whisper.cpp/models
|
||||
git clone https://huggingface.co/distil-whisper/distil-medium.en
|
||||
git clone https://huggingface.co/distil-whisper/distil-large-v2
|
||||
|
||||
# convert to ggml
|
||||
python3 ./convert-h5-to-ggml.py ./distil-medium.en/ ../../whisper .
|
||||
mv ggml-model.bin ggml-medium.en-distil.bin
|
||||
|
||||
python3 ./convert-h5-to-ggml.py ./distil-large-v2/ ../../whisper .
|
||||
mv ggml-model.bin ggml-large-v2-distil.bin
|
||||
```
|
||||
|
@@ -78,14 +78,14 @@ def convert_hf_whisper(hf_model_name_or_path: str, whisper_state_path: str):
 # Ported from models/convert-whisper-to-coreml.py
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
+    parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2, large-v3)", required=True)
     parser.add_argument("--model-path", type=str, help="path to the model (e.g. if published on HuggingFace: Oblivion208/whisper-tiny-cantonese)", required=True)
     parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
     parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
     parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
     args = parser.parse_args()

-    if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
+    if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large-v1", "large-v2", "large-v3"]:
         raise ValueError("Invalid model name")

     pt_target_path = f"models/hf-{args.model_name}.pt"
@@ -296,13 +296,13 @@ def convert_decoder(hparams, model, quantize=False):

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
+    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2, large-v3)", required=True)
     parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
     parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
     parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
     args = parser.parse_args()

-    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "small.en-tdrz", "medium", "medium.en", "large", "large-v1", "large-v2"]:
+    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "small.en-tdrz", "medium", "medium.en", "large-v1", "large-v2", "large-v3"]:
         raise ValueError("Invalid model name")

     whisper = load_model(args.model).cpu()
@@ -38,10 +38,10 @@ def convert_encoder(hparams, encoder, mname):

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
+    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2, large-v3)", required=True)
     args = parser.parse_args()

-    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
+    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large-v1", "large-v2", "large-v3"]:
         raise ValueError("Invalid model name")

     whisper = load_model(args.model).cpu()
@@ -19,7 +19,7 @@ function get_script_path() {
 models_path="$(get_script_path)"

 # Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )

 # list available models
 function list_models {
@@ -8,7 +8,7 @@ popd
 set argc=0
 for %%x in (%*) do set /A argc+=1

-set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large
+set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3

 if %argc% neq 1 (
   echo.
@@ -22,7 +22,7 @@ function get_script_path() {
 models_path="$(get_script_path)"

 # Whisper models
-models=(
+models=(
 "tiny.en"
 "tiny"
 "tiny-q5_1"

@@ -42,7 +42,7 @@ models=(
 "medium.en-q5_0"
 "large-v1"
 "large-v2"
-"large"
+"large-v3"
 "large-q5_0"
 )

@@ -19,7 +19,7 @@
 cd `dirname $0`

 # Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )

 # list available models
 function list_models {
whisper.cpp (92 changed lines)
@@ -850,8 +850,9 @@ struct whisper_context {
     int64_t t_load_us = 0;
     int64_t t_start_us = 0;

-    ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX)
-    ggml_type itype = ggml_type::GGML_TYPE_F16; // intermediate type (FP32 or FP16)
+    ggml_type wtype_e = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX) Encoder
+    ggml_type wtype_d = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX) Decoder
+    ggml_type itype = ggml_type::GGML_TYPE_F16; // intermediate type (FP32 or FP16)

     whisper_context_params params;

@@ -1168,8 +1169,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con

     // for the big tensors, we have the option to store the data in 16-bit floats or quantized
     // in order to save memory and also to speed up the computation
-    wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-    if (wctx.wtype == GGML_TYPE_COUNT) {
+    wctx.wtype_e = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+    if (wctx.wtype_e == GGML_TYPE_COUNT) {
         WHISPER_LOG_ERROR("%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
         return false;
     }

@@ -1290,8 +1291,9 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         WHISPER_LOG_INFO("%s: n_langs = %d\n", __func__, vocab.num_languages());
     }

-    const ggml_type wtype = wctx.wtype;
-    const ggml_type vtype = wctx.wtype == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv type
+    const ggml_type wtype_e = wctx.wtype_e;
+    const ggml_type wtype_d = wctx.wtype_d;
+    const ggml_type vtype = wctx.wtype_e == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv type

     // create the ggml context
     {

@@ -1367,24 +1369,24 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
             layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);

-            layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state);
+            layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype_e, n_audio_state, 4*n_audio_state);
             layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_audio_state);

-            layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state);
+            layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype_e, 4*n_audio_state, n_audio_state);
             layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);

             layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
             layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);

-            layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
+            layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype_e, n_audio_state, n_audio_state);
             layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);

-            layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
+            layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype_e, n_audio_state, n_audio_state);

-            layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
+            layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype_e, n_audio_state, n_audio_state);
             layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);

-            layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
+            layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype_e, n_audio_state, n_audio_state);
             layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);

             // map by name

@@ -1417,7 +1419,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         {
             model.d_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_text_state, n_text_ctx);

-            model.d_te = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab);
+            model.d_te = ggml_new_tensor_2d(ctx, wtype_d, n_text_state, n_vocab);

             model.d_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
             model.d_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);

@@ -1436,38 +1438,38 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
             layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);

-            layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state);
+            layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype_d, n_text_state, 4*n_text_state);
             layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_text_state);

-            layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state);
+            layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype_d, 4*n_text_state, n_text_state);
             layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);

             layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
             layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);

-            layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+            layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype_d, n_text_state, n_text_state);
             layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);

-            layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+            layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype_d, n_text_state, n_text_state);

-            layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+            layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype_d, n_text_state, n_text_state);
             layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);

-            layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+            layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype_d, n_text_state, n_text_state);
             layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);

             layer.cross_attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
             layer.cross_attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);

-            layer.cross_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+            layer.cross_attn_q_w = ggml_new_tensor_2d(ctx, wtype_d, n_text_state, n_text_state);
             layer.cross_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);

-            layer.cross_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+            layer.cross_attn_k_w = ggml_new_tensor_2d(ctx, wtype_d, n_text_state, n_text_state);

-            layer.cross_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+            layer.cross_attn_v_w = ggml_new_tensor_2d(ctx, wtype_d, n_text_state, n_text_state);
             layer.cross_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);

-            layer.cross_attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+            layer.cross_attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype_d, n_text_state, n_text_state);
             layer.cross_attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);

             // map by name

@@ -1522,7 +1524,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con

         model.buffer = ggml_backend_alloc_buffer(wctx.backend, size_main);

-        WHISPER_LOG_INFO("%s: %8s buffer size = %8.2f MB\n", __func__, ggml_backend_name(wctx.backend), size_main / 1024.0 / 1024.0);
+        WHISPER_LOG_INFO("%s: %8s buffer size = %8.2f MB\n", __func__, ggml_backend_name(wctx.backend), size_main / 1e6);
     }

     ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer);

@@ -1637,12 +1639,12 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor));
         }

-        //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype), ggml_nbytes(tensor)/1024.0/1024.0);
+        //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype), ggml_nbytes(tensor)/1e6);
         total_size += ggml_nbytes(tensor);
         model.n_loaded++;
     }

-    WHISPER_LOG_INFO("%s: model size = %7.2f MB\n", __func__, total_size/1024.0/1024.0);
+    WHISPER_LOG_INFO("%s: model size = %7.2f MB\n", __func__, total_size/1e6);

     if (model.n_loaded == 0) {
         WHISPER_LOG_WARN("%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);

@@ -2027,11 +2029,11 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
     ////////////////////////////////////////////////////////////////////////////

     //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
-    //        ggml_used_mem(ctx0)/1024.0/1024.0,
-    //        wstate.get_buf_max_mem(0)/1024.0/1024.0,
-    //        wstate.get_buf_max_mem(1)/1024.0/1024.0,
-    //        wstate.get_buf_max_mem(2)/1024.0/1024.0,
-    //        wstate.get_buf_max_mem(3)/1024.0/1024.0);
+    //        ggml_used_mem(ctx0)/1e6,
+    //        wstate.get_buf_max_mem(0)/1e6,
+    //        wstate.get_buf_max_mem(1)/1e6,
+    //        wstate.get_buf_max_mem(2)/1e6,
+    //        wstate.get_buf_max_mem(3)/1e6);

     ggml_free(ctx0);

@@ -2613,11 +2615,11 @@ static bool whisper_decode_internal(

     if (batch.n_tokens > 1) {
         //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
-        //        ggml_used_mem(ctx0)/1024.0/1024.0,
-        //        wstate.get_buf_max_mem(0)/1024.0/1024.0,
-        //        wstate.get_buf_max_mem(1)/1024.0/1024.0,
-        //        wstate.get_buf_max_mem(2)/1024.0/1024.0,
-        //        wstate.get_buf_max_mem(3)/1024.0/1024.0);
+        //        ggml_used_mem(ctx0)/1e6,
+        //        wstate.get_buf_max_mem(0)/1e6,
+        //        wstate.get_buf_max_mem(1)/1e6,
+        //        wstate.get_buf_max_mem(2)/1e6,
+        //        wstate.get_buf_max_mem(3)/1e6);
     }

     if (batch.n_tokens == 1) {

@@ -3057,7 +3059,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {

     {
         const size_t memory_size = ggml_nbytes(state->kv_self.k) + ggml_nbytes(state->kv_self.v);
-        WHISPER_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+        WHISPER_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1e6);
     }

     if (!kv_cache_init(ctx->model.hparams, state->kv_cross, ctx->backend, ctx->itype, ctx->model.hparams.n_audio_ctx)) {

@@ -3068,7 +3070,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {

     {
         const size_t memory_size = ggml_nbytes(state->kv_cross.k) + ggml_nbytes(state->kv_cross.v);
-        WHISPER_LOG_INFO("%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+        WHISPER_LOG_INFO("%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1e6);
     }

 #ifdef WHISPER_USE_COREML

@@ -3110,7 +3112,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
             return whisper_build_graph_conv(*ctx, *state, 0);
         });

-        WHISPER_LOG_INFO("%s: compute buffer (conv) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_conv) / 1024.0 / 1024.0);
+        WHISPER_LOG_INFO("%s: compute buffer (conv) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_conv) / 1e6);
     }

     // encoder allocator

@@ -3120,7 +3122,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
             return whisper_build_graph_encoder(*ctx, *state);
         });

-        WHISPER_LOG_INFO("%s: compute buffer (encode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_encode) / 1024.0 / 1024.0);
+        WHISPER_LOG_INFO("%s: compute buffer (encode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_encode) / 1e6);
     }

     // cross allocator

@@ -3130,7 +3132,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
             return whisper_build_graph_cross(*ctx, *state);
         });

-        WHISPER_LOG_INFO("%s: compute buffer (cross) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_cross) / 1024.0 / 1024.0);
+        WHISPER_LOG_INFO("%s: compute buffer (cross) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_cross) / 1e6);
     }

     // decoder allocator

@@ -3148,7 +3150,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
             return whisper_build_graph_decoder(*ctx, *state, state->batch);
         });

-        WHISPER_LOG_INFO("%s: compute buffer (decode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_decode) / 1024.0 / 1024.0);
+        WHISPER_LOG_INFO("%s: compute buffer (decode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_decode) / 1e6);
     }

     whisper_allocr_graph_realloc(state->alloc_conv, ctx->backend);

@@ -6072,8 +6074,8 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
     size_t n = 20;
     size_t arr = n_threads > 0 ? 1024llu : n_threads; // trick to avoid compiler optimizations

-    // 1GB MB array
-    const size_t size = arr*1024llu*1024llu;
+    // 1GB array
+    const size_t size = arr*1e6;

     // single-thread
     {

@@ -6099,7 +6101,7 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
         src[rand() % size] = rand() % 256;
     }

-    snprintf(strbuf, sizeof(strbuf), "memcpy: %.2f GB/s (1 thread)\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
+    snprintf(strbuf, sizeof(strbuf), "memcpy: %.2f GB/s (1 thread)\n", (double) (n*size)/(tsum*1e9));
     s += strbuf;

     // needed to prevent the compiler from optimizing the memcpy away
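The whisper.cpp hunks above split the single wtype into wtype_e (encoder) and wtype_d (decoder), so the two sub-models can carry different weight types, for example when only one of them has been quantized. A hedged sketch of the idea; the struct and helper below are illustrative, not the actual whisper.cpp code:

```cpp
// Sketch only: choose the tensor type per sub-model, as the wtype_e / wtype_d
// hunks above do when creating encoder vs decoder weight matrices.
#include "ggml.h"

struct weight_types {
    enum ggml_type encoder = GGML_TYPE_F16; // wtype_e: encoder weight type (F32 / F16 / quantized)
    enum ggml_type decoder = GGML_TYPE_F16; // wtype_d: decoder weight type (F32 / F16 / quantized)
};

static struct ggml_tensor * new_weight_2d(struct ggml_context * ctx,
                                          const weight_types & wt,
                                          bool is_encoder,
                                          int64_t ne0, int64_t ne1) {
    // encoder matrices get wtype_e, decoder matrices get wtype_d
    return ggml_new_tensor_2d(ctx, is_encoder ? wt.encoder : wt.decoder, ne0, ne1);
}
```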