mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-04-18 00:08:45 +02:00
bench : improve benchmarks
This commit is contained in:
parent
7765770f89
commit
d375d73b2e
@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
# Helper script to run the bench tool on all models and print the results in share-able format
|
# Helper script to run the bench tool on all models and print the results in share-able format
|
||||||
|
|
||||||
printf "Usage: ./bench.sh [n_threads]\n"
|
printf "Usage: ./bench.sh [n_threads] [encoder-only]\n"
|
||||||
|
|
||||||
if [ -z "$1" ]; then
|
if [ -z "$1" ]; then
|
||||||
n_threads=4
|
n_threads=4
|
||||||
@ -10,24 +10,39 @@ else
|
|||||||
n_threads=$1
|
n_threads=$1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
models=( "tiny" "base" "small" "medium" "large" )
|
encoder_only=0
|
||||||
|
if [ -z "$2" ]; then
|
||||||
|
encoder_only=0
|
||||||
|
else
|
||||||
|
encoder_only=$2
|
||||||
|
fi
|
||||||
|
|
||||||
printf "\n"
|
models=( \
|
||||||
printf "Running memcpy benchmark with 1 thread\n"
|
"tiny" "tiny-q5_0" "tiny-q5_1" "tiny-q8_0" \
|
||||||
printf "\n"
|
"base" "base-q5_0" "base-q5_1" "base-q8_0" \
|
||||||
|
"small" "small-q5_0" "small-q5_1" "small-q8_0" \
|
||||||
|
"medium" "medium-q5_0" "medium-q5_1" "medium-q8_0" \
|
||||||
|
"large" "large-q5_0" "large-q5_1" "large-q8_0" \
|
||||||
|
)
|
||||||
|
|
||||||
./bench -w 1 -t 1 2>&1
|
if [ "$encoder_only" -eq 0 ]; then
|
||||||
|
printf "\n"
|
||||||
|
printf "Running memcpy benchmark\n"
|
||||||
|
printf "\n"
|
||||||
|
|
||||||
printf "\n"
|
./bench -w 1 -t $n_threads 2>&1
|
||||||
printf "Running ggml_mul_mat benchmark with $n_threads threads\n"
|
|
||||||
printf "\n"
|
|
||||||
|
|
||||||
./bench -w 2 -t $n_threads 2>&1
|
printf "\n"
|
||||||
|
printf "Running ggml_mul_mat benchmark with $n_threads threads\n"
|
||||||
|
printf "\n"
|
||||||
|
|
||||||
printf "\n"
|
./bench -w 2 -t $n_threads 2>&1
|
||||||
printf "Running benchmark for all models\n"
|
|
||||||
printf "This can take a while!\n"
|
printf "\n"
|
||||||
printf "\n"
|
printf "Running benchmark for all models\n"
|
||||||
|
printf "This can take a while!\n"
|
||||||
|
printf "\n"
|
||||||
|
fi
|
||||||
|
|
||||||
printf "| CPU | OS | Config | Model | Th | Load | Enc. | Commit |\n"
|
printf "| CPU | OS | Config | Model | Th | Load | Enc. | Commit |\n"
|
||||||
printf "| --- | -- | ------ | ----- | -- | ---- | ---- | ------ |\n"
|
printf "| --- | -- | ------ | ----- | -- | ---- | ---- | ------ |\n"
|
||||||
@ -39,6 +54,7 @@ for model in "${models[@]}"; do
|
|||||||
# actual run
|
# actual run
|
||||||
# store stderr output in a variable in order to parse it later
|
# store stderr output in a variable in order to parse it later
|
||||||
output=$(./bench -m ./models/ggml-$model.bin -t $n_threads 2>&1)
|
output=$(./bench -m ./models/ggml-$model.bin -t $n_threads 2>&1)
|
||||||
|
ret=$?
|
||||||
|
|
||||||
# parse the output:
|
# parse the output:
|
||||||
load_time=$(echo "$output" | grep "load time" | awk '{print $5}')
|
load_time=$(echo "$output" | grep "load time" | awk '{print $5}')
|
||||||
@ -70,5 +86,7 @@ for model in "${models[@]}"; do
|
|||||||
|
|
||||||
commit=$(git rev-parse --short HEAD)
|
commit=$(git rev-parse --short HEAD)
|
||||||
|
|
||||||
printf "| <todo> | <todo> | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n"
|
if [ $ret -eq 0 ]; then
|
||||||
|
printf "| <todo> | <todo> | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n"
|
||||||
|
fi
|
||||||
done
|
done
|
||||||
|
106
whisper.cpp
106
whisper.cpp
@ -4827,48 +4827,50 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
|
|||||||
|
|
||||||
ggml_time_init();
|
ggml_time_init();
|
||||||
|
|
||||||
size_t n = 50;
|
size_t n = 20;
|
||||||
size_t arr = n_threads > 0 ? 1024 : n_threads; // trick to avoid compiler optimizations
|
size_t arr = n_threads > 0 ? 1024llu : n_threads; // trick to avoid compiler optimizations
|
||||||
|
|
||||||
// 1 GB array
|
// 1GB MB array
|
||||||
const size_t size = arr*1024llu*1024llu;
|
const size_t size = arr*1024llu*1024llu;
|
||||||
|
|
||||||
char * src = (char *) malloc(size);
|
// single-thread
|
||||||
char * dst = (char *) malloc(size);
|
|
||||||
|
|
||||||
for (size_t i = 0; i < size; i++) src[i] = i;
|
|
||||||
|
|
||||||
memcpy(dst, src, size); // heat-up
|
|
||||||
|
|
||||||
double tsum = 0.0;
|
|
||||||
|
|
||||||
for (size_t i = 0; i < n; i++) {
|
|
||||||
const int64_t t0 = ggml_time_us();
|
|
||||||
|
|
||||||
memcpy(dst, src, size);
|
|
||||||
|
|
||||||
const int64_t t1 = ggml_time_us();
|
|
||||||
|
|
||||||
tsum += (t1 - t0)*1e-6;
|
|
||||||
|
|
||||||
src[0] = rand();
|
|
||||||
}
|
|
||||||
|
|
||||||
snprintf(strbuf, sizeof(strbuf), "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
|
|
||||||
s += strbuf;
|
|
||||||
|
|
||||||
// needed to prevent the compile from optimizing the memcpy away
|
|
||||||
{
|
{
|
||||||
double sum = 0.0;
|
char * src = (char *) malloc(size);
|
||||||
|
char * dst = (char *) malloc(size);
|
||||||
|
|
||||||
for (size_t i = 0; i < size; i++) sum += dst[i];
|
for (size_t i = 0; i < size; i++) src[i] = i;
|
||||||
|
|
||||||
snprintf(strbuf, sizeof(strbuf), "sum: %s %f\n", sum == -536870910.00 ? "ok" : "error", sum);
|
memcpy(dst, src, size); // heat-up
|
||||||
|
|
||||||
|
double tsum = 0.0;
|
||||||
|
double sum = 0.0;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < n; i++) {
|
||||||
|
const int64_t t0 = ggml_time_us();
|
||||||
|
|
||||||
|
memcpy(dst, src, size);
|
||||||
|
|
||||||
|
const int64_t t1 = ggml_time_us();
|
||||||
|
|
||||||
|
tsum += (t1 - t0)*1e-6;
|
||||||
|
|
||||||
|
src[rand() % size] = rand() % 256;
|
||||||
|
}
|
||||||
|
|
||||||
|
snprintf(strbuf, sizeof(strbuf), "memcpy: %.2f GB/s (1 thread)\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
|
||||||
s += strbuf;
|
s += strbuf;
|
||||||
}
|
|
||||||
|
|
||||||
free(src);
|
// needed to prevent the compiler from optimizing the memcpy away
|
||||||
free(dst);
|
{
|
||||||
|
for (size_t i = 0; i < size; i++) sum += dst[i];
|
||||||
|
|
||||||
|
snprintf(strbuf, sizeof(strbuf), "sum: %f\n", sum);
|
||||||
|
s += strbuf;
|
||||||
|
}
|
||||||
|
|
||||||
|
free(src);
|
||||||
|
free(dst);
|
||||||
|
}
|
||||||
|
|
||||||
return s.c_str();
|
return s.c_str();
|
||||||
}
|
}
|
||||||
@ -4905,26 +4907,37 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|||||||
for (int j = 0; j < (int) sizes.size(); j++) {
|
for (int j = 0; j < (int) sizes.size(); j++) {
|
||||||
int n_q4_0 = 0;
|
int n_q4_0 = 0;
|
||||||
int n_q4_1 = 0;
|
int n_q4_1 = 0;
|
||||||
|
int n_q4_2 = 0;
|
||||||
|
int n_q5_0 = 0;
|
||||||
|
int n_q5_1 = 0;
|
||||||
|
int n_q8_0 = 0;
|
||||||
int n_fp16 = 0;
|
int n_fp16 = 0;
|
||||||
int n_fp32 = 0;
|
int n_fp32 = 0;
|
||||||
|
|
||||||
// GFLOPS/s
|
// GFLOPS/s
|
||||||
double s_q4_0 = 0.0;
|
double s_q4_0 = 0.0;
|
||||||
double s_q4_1 = 0.0;
|
double s_q4_1 = 0.0;
|
||||||
|
double s_q4_2 = 0.0;
|
||||||
|
double s_q5_0 = 0.0;
|
||||||
|
double s_q5_1 = 0.0;
|
||||||
|
double s_q8_0 = 0.0;
|
||||||
double s_fp16 = 0.0;
|
double s_fp16 = 0.0;
|
||||||
double s_fp32 = 0.0;
|
double s_fp32 = 0.0;
|
||||||
|
|
||||||
const size_t N = sizes[j];
|
const size_t N = sizes[j];
|
||||||
|
|
||||||
for (int k = 0; k < 4; ++k) {
|
for (int k = 0; k < 8; ++k) {
|
||||||
const ggml_type wtype =
|
const ggml_type wtype =
|
||||||
k == 0 ? GGML_TYPE_Q4_0 :
|
k == 0 ? GGML_TYPE_Q4_0 :
|
||||||
k == 1 ? GGML_TYPE_Q4_1 :
|
k == 1 ? GGML_TYPE_Q4_1 :
|
||||||
k == 2 ? GGML_TYPE_F16 :
|
k == 2 ? GGML_TYPE_Q4_2 :
|
||||||
GGML_TYPE_F32;
|
k == 3 ? GGML_TYPE_Q5_0 :
|
||||||
|
k == 4 ? GGML_TYPE_Q5_1 :
|
||||||
|
k == 5 ? GGML_TYPE_Q8_0 :
|
||||||
|
k == 6 ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
||||||
|
|
||||||
double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_fp16 : s_fp32;
|
double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q4_2 : k == 3 ? s_q5_0 : k == 4 ? s_q5_1 : k == 5 ? s_q8_0 : k == 6 ? s_fp16 : /*k == 7*/ s_fp32;
|
||||||
int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_fp16 : n_fp32;
|
int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q4_2 : k == 3 ? n_q5_0 : k == 4 ? n_q5_1 : k == 5 ? n_q8_0 : k == 6 ? n_fp16 : /*k == 7*/ n_fp32;
|
||||||
|
|
||||||
struct ggml_init_params gparams = {
|
struct ggml_init_params gparams = {
|
||||||
/*.mem_size =*/ buf.size(),
|
/*.mem_size =*/ buf.size(),
|
||||||
@ -4968,8 +4981,19 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
|
|||||||
s = ((2.0*N*N*N*n)/tsum)*1e-9;
|
s = ((2.0*N*N*N*n)/tsum)*1e-9;
|
||||||
}
|
}
|
||||||
|
|
||||||
snprintf(strbuf, sizeof(strbuf), "ggml_mul_mat: %4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) / Q4_1 %7.1f GFLOPS (%3d runs) / F16 %7.1f GFLOPS (%3d runs) / F32 %7.1f GFLOPS (%3d runs)\n",
|
// Q4_0 | Q4_1 | Q4_2
|
||||||
N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_fp16, n_fp16, s_fp32, n_fp32);
|
snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs) | Q4_2 %7.1f GFLOPS (%3d runs)\n",
|
||||||
|
N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_q4_2, n_q4_2);
|
||||||
|
s += strbuf;
|
||||||
|
|
||||||
|
// Q5_0 | Q5_1 | Q8_0
|
||||||
|
snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q5_0 %7.1f GFLOPS (%3d runs) | Q5_1 %7.1f GFLOPS (%3d runs) | Q8_0 %7.1f GFLOPS (%3d runs)\n",
|
||||||
|
N, N, s_q5_0, n_q5_0, s_q5_1, n_q5_1, s_q8_0, n_q8_0);
|
||||||
|
s += strbuf;
|
||||||
|
|
||||||
|
// F16 | F32
|
||||||
|
snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: F16 %7.1f GFLOPS (%3d runs) | F32 %7.1f GFLOPS (%3d runs)\n",
|
||||||
|
N, N, s_fp16, n_fp16, s_fp32, n_fp32);
|
||||||
s += strbuf;
|
s += strbuf;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user