diff --git a/extra/bench-all.sh b/extra/bench-all.sh index 92973786..43f989db 100755 --- a/extra/bench-all.sh +++ b/extra/bench-all.sh @@ -2,7 +2,7 @@ # Helper script to run the bench tool on all models and print the results in share-able format -printf "Usage: ./bench.sh [n_threads]\n" +printf "Usage: ./bench.sh [n_threads] [encoder-only]\n" if [ -z "$1" ]; then n_threads=4 @@ -10,24 +10,39 @@ else n_threads=$1 fi -models=( "tiny" "base" "small" "medium" "large" ) +encoder_only=0 +if [ -z "$2" ]; then + encoder_only=0 +else + encoder_only=$2 +fi -printf "\n" -printf "Running memcpy benchmark with 1 thread\n" -printf "\n" +models=( \ + "tiny" "tiny-q5_0" "tiny-q5_1" "tiny-q8_0" \ + "base" "base-q5_0" "base-q5_1" "base-q8_0" \ + "small" "small-q5_0" "small-q5_1" "small-q8_0" \ + "medium" "medium-q5_0" "medium-q5_1" "medium-q8_0" \ + "large" "large-q5_0" "large-q5_1" "large-q8_0" \ +) -./bench -w 1 -t 1 2>&1 +if [ "$encoder_only" -eq 0 ]; then + printf "\n" + printf "Running memcpy benchmark\n" + printf "\n" -printf "\n" -printf "Running ggml_mul_mat benchmark with $n_threads threads\n" -printf "\n" + ./bench -w 1 -t $n_threads 2>&1 -./bench -w 2 -t $n_threads 2>&1 + printf "\n" + printf "Running ggml_mul_mat benchmark with $n_threads threads\n" + printf "\n" -printf "\n" -printf "Running benchmark for all models\n" -printf "This can take a while!\n" -printf "\n" + ./bench -w 2 -t $n_threads 2>&1 + + printf "\n" + printf "Running benchmark for all models\n" + printf "This can take a while!\n" + printf "\n" +fi printf "| CPU | OS | Config | Model | Th | Load | Enc. | Commit |\n" printf "| --- | -- | ------ | ----- | -- | ---- | ---- | ------ |\n" @@ -39,6 +54,7 @@ for model in "${models[@]}"; do # actual run # store stderr output in a variable in order to parse it later output=$(./bench -m ./models/ggml-$model.bin -t $n_threads 2>&1) + ret=$? 
# parse the output: load_time=$(echo "$output" | grep "load time" | awk '{print $5}') @@ -70,5 +86,7 @@ for model in "${models[@]}"; do commit=$(git rev-parse --short HEAD) - printf "| | | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n" + if [ $ret -eq 0 ]; then + printf "| | | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n" + fi done diff --git a/whisper.cpp b/whisper.cpp index cca949fa..df283ec9 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -4827,48 +4827,50 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) { ggml_time_init(); - size_t n = 50; - size_t arr = n_threads > 0 ? 1024 : n_threads; // trick to avoid compiler optimizations + size_t n = 20; + size_t arr = n_threads > 0 ? 1024llu : n_threads; // trick to avoid compiler optimizations - // 1 GB array + // 1GB array const size_t size = arr*1024llu*1024llu; - char * src = (char *) malloc(size); - char * dst = (char *) malloc(size); - - for (size_t i = 0; i < size; i++) src[i] = i; - - memcpy(dst, src, size); // heat-up - - double tsum = 0.0; - - for (size_t i = 0; i < n; i++) { - const int64_t t0 = ggml_time_us(); - - memcpy(dst, src, size); - - const int64_t t1 = ggml_time_us(); - - tsum += (t1 - t0)*1e-6; - - src[0] = rand(); - } - - snprintf(strbuf, sizeof(strbuf), "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu)); - s += strbuf; - - // needed to prevent the compile from optimizing the memcpy away + // single-thread { - double sum = 0.0; + char * src = (char *) malloc(size); + char * dst = (char *) malloc(size); - for (size_t i = 0; i < size; i++) sum += dst[i]; + for (size_t i = 0; i < size; i++) src[i] = i; - snprintf(strbuf, sizeof(strbuf), "sum: %s %f\n", sum == -536870910.00 ? 
"ok" : "error", sum); + memcpy(dst, src, size); // heat-up + + double tsum = 0.0; + double sum = 0.0; + + for (size_t i = 0; i < n; i++) { + const int64_t t0 = ggml_time_us(); + + memcpy(dst, src, size); + + const int64_t t1 = ggml_time_us(); + + tsum += (t1 - t0)*1e-6; + + src[rand() % size] = rand() % 256; + } + + snprintf(strbuf, sizeof(strbuf), "memcpy: %.2f GB/s (1 thread)\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu)); s += strbuf; - } - free(src); - free(dst); + // needed to prevent the compiler from optimizing the memcpy away + { + for (size_t i = 0; i < size; i++) sum += dst[i]; + + snprintf(strbuf, sizeof(strbuf), "sum: %f\n", sum); + s += strbuf; + } + + free(src); + free(dst); + } return s.c_str(); } @@ -4905,26 +4907,37 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) { for (int j = 0; j < (int) sizes.size(); j++) { int n_q4_0 = 0; int n_q4_1 = 0; + int n_q4_2 = 0; + int n_q5_0 = 0; + int n_q5_1 = 0; + int n_q8_0 = 0; int n_fp16 = 0; int n_fp32 = 0; // GFLOPS/s double s_q4_0 = 0.0; double s_q4_1 = 0.0; + double s_q4_2 = 0.0; + double s_q5_0 = 0.0; + double s_q5_1 = 0.0; + double s_q8_0 = 0.0; double s_fp16 = 0.0; double s_fp32 = 0.0; const size_t N = sizes[j]; - for (int k = 0; k < 4; ++k) { + for (int k = 0; k < 8; ++k) { const ggml_type wtype = k == 0 ? GGML_TYPE_Q4_0 : k == 1 ? GGML_TYPE_Q4_1 : - k == 2 ? GGML_TYPE_F16 : - GGML_TYPE_F32; + k == 2 ? GGML_TYPE_Q4_2 : + k == 3 ? GGML_TYPE_Q5_0 : + k == 4 ? GGML_TYPE_Q5_1 : + k == 5 ? GGML_TYPE_Q8_0 : + k == 6 ? GGML_TYPE_F16 : GGML_TYPE_F32; - double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_fp16 : s_fp32; - int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_fp16 : n_fp32; + double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q4_2 : k == 3 ? s_q5_0 : k == 4 ? s_q5_1 : k == 5 ? s_q8_0 : k == 6 ? s_fp16 : /*k == 7*/ s_fp32; + int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q4_2 : k == 3 ? n_q5_0 : k == 4 ? n_q5_1 : k == 5 ? 
n_q8_0 : k == 6 ? n_fp16 : /*k == 7*/ n_fp32; struct ggml_init_params gparams = { /*.mem_size =*/ buf.size(), @@ -4968,8 +4981,19 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) { s = ((2.0*N*N*N*n)/tsum)*1e-9; } - snprintf(strbuf, sizeof(strbuf), "ggml_mul_mat: %4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) / Q4_1 %7.1f GFLOPS (%3d runs) / F16 %7.1f GFLOPS (%3d runs) / F32 %7.1f GFLOPS (%3d runs)\n", - N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_fp16, n_fp16, s_fp32, n_fp32); + // Q4_0 | Q4_1 | Q4_2 + snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs) | Q4_2 %7.1f GFLOPS (%3d runs)\n", + N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_q4_2, n_q4_2); + s += strbuf; + + // Q5_0 | Q5_1 | Q8_0 + snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q5_0 %7.1f GFLOPS (%3d runs) | Q5_1 %7.1f GFLOPS (%3d runs) | Q8_0 %7.1f GFLOPS (%3d runs)\n", + N, N, s_q5_0, n_q5_0, s_q5_1, n_q5_1, s_q8_0, n_q8_0); + s += strbuf; + + // F16 | F32 + snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: F16 %7.1f GFLOPS (%3d runs) | F32 %7.1f GFLOPS (%3d runs)\n", + N, N, s_fp16, n_fp16, s_fp32, n_fp32); s += strbuf; }