Compare commits

...

23 Commits

Author SHA1 Message Date
ee2971bf6a bench : multi-thread memcpy 2023-11-21 21:57:07 +02:00
eff3570f78 server : add a REST Whisper server example with OAI-like API (#1380)
* Add first draft of server

* Added json support and base funcs for server.cpp

* Add more user input via api-request

also some clean up

* Add request params and load post function

Also some general clean up

* Remove unused function

* Add readme

* Add exception handlers

* Update examples/server/server.cpp

* make : add server target

* Add magic curl syntax

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-11-20 21:40:24 +02:00
fa19bc4195 whisper : update example in whisper.h (#1529)
Update the example in the header; the previous examples were deprecated.
2023-11-20 20:52:27 +02:00
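
For orientation, the API flow that the header example walks through looks roughly like this. This is a sketch rather than the header's exact text; the model path and the sample buffer are placeholders:

```cpp
#include "whisper.h"

#include <cstdio>
#include <vector>

int main() {
    // load a model with the default context parameters (GPU enabled by default as of v1.5.0)
    struct whisper_context * ctx = whisper_init_from_file_with_params(
            "models/ggml-base.en.bin", whisper_context_default_params());
    if (ctx == nullptr) {
        return 1;
    }

    // placeholder: 1 second of silence; real code fills this with 16 kHz mono float PCM
    std::vector<float> pcmf32(WHISPER_SAMPLE_RATE, 0.0f);

    // run the full encoder + decoder pipeline and print the resulting segments
    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
        whisper_free(ctx);
        return 2;
    }

    for (int i = 0; i < whisper_full_n_segments(ctx); ++i) {
        printf("%s", whisper_full_get_segment_text(ctx, i));
    }

    whisper_free(ctx);
    return 0;
}
```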
a01b2e0971 sdl : fix audio callback (#1523) 2023-11-20 13:16:38 +02:00
8159a9ab99 whisper : reuse whisper_decode_with_state (#1521) 2023-11-20 13:16:11 +02:00
7516d9c16d ci : redistribute CUDA DLLs (#1522)
see https://docs.nvidia.com/cuda/eula/index.html#attachment-a
2023-11-19 12:43:22 +02:00
46cc26d1b9 whisper : fix with_state methods to use the correct state (#1519)
Co-authored-by: Sandro Hanea <sandrohanea@microsoft.com>
2023-11-19 11:25:30 +02:00
f784f9fa12 whisper : fix overriding the audio context 2023-11-19 10:32:32 +02:00
ca23f8ee6d cuda : assert ggml_add sources to be contiguous 2023-11-19 10:32:08 +02:00
e2f0eba2d4 ios : sync submodule 2023-11-17 10:42:04 +02:00
d4353e48f7 sync : ggml (ggml-alloc + linker + gguf fixes) (#1501) 2023-11-17 10:00:07 +02:00
bebf0da983 quantize : add support for K-quant types 2023-11-16 16:18:24 +02:00
848e54f3ad bench : fix memcpy bench size 2023-11-16 10:59:32 +02:00
7883d1cae4 talk-llama : improve quote and backtick handling (#1364)
* ISSUE-1329: replace " with ' so it doesn't try to execute code in backticks.

* Typo

* Update to keep possessives in the output

Closes the open ', emits a double-quoted ', then reopens the ', which escapes embedded ' characters while keeping possessives in the output.
2023-11-16 10:34:05 +02:00
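
A minimal sketch of that escaping idea, separate from the talk-llama sources (the helper name is made up for illustration): every embedded ' becomes '"'"' so the whole argument can be wrapped in single quotes before it reaches system(), which also stops backticks from being executed.

```cpp
#include <cstdlib>
#include <string>

// Replace every ' with '"'"' : close the single-quoted string, emit a
// double-quoted ', then reopen the single-quoted string.
static std::string escape_single_quotes(const std::string & s) {
    std::string out;
    for (char c : s) {
        if (c == '\'') { out += "'\"'\"'"; } else { out += c; }
    }
    return out;
}

int main() {
    std::string text_to_speak = "it's `whoami` o'clock";
    // backticks and possessives survive untouched because the argument is single-quoted
    std::string cmd = "echo '" + escape_single_quotes(text_to_speak) + "'";
    return system(cmd.c_str());
}
```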
ccc85b4ff8 talk-llama : enable GPU by default 2023-11-15 21:33:00 +02:00
c7606b47df models : add info about distilled models 2023-11-15 21:10:13 +02:00
d38af151a1 release : v1.5.0 2023-11-15 21:02:52 +02:00
94267df08e bench-all : add distil models 2023-11-15 20:49:12 +02:00
8713c67133 js : latest whisper.js 2023-11-15 20:10:16 +02:00
57a60639bb bench-all : indentations 2023-11-15 20:01:15 +02:00
bfbaa4dce5 whisper : make large version explicit + fix data size units (#1493) 2023-11-15 19:42:25 +02:00
1d79e78402 java : fix test (#1492) 2023-11-15 17:42:53 +02:00
b6c5f49b78 whisper : add batched decoding (#1486)
* whisper : add whisper_batch

* whisper : move kv_self to whisper_state

* whisper : full batched decoding support

* whisper : fix memory leak in whisper_batch

* whisper : fix mem leak again + remove obsolete function

* whisper : clear kv cache when using whisper_decode API

* whisper : speed-up sampling

* whisper : fix decoders initializer

* bench : add batch size 5 bench

* whisper : add comment about the KV cache size

* whisper : add check for max number of decoders

* whisper : avoid starting sampling threads with bs=1

* whisper : enable beam-search by default

* cuda : sync llama.cpp fixes
2023-11-15 16:12:52 +02:00
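
With beam search on by default, the decoders for one audio chunk are evaluated as a single batch instead of one whisper_decode call per decoder. A rough sketch of selecting that strategy explicitly through the public API (the wrapper function here is only for illustration):

```cpp
#include "whisper.h"

// Run transcription with beam-search decoding; the 5 decoders are batched together,
// which is also what the new "Bch5" column in bench-all.sh measures.
static int transcribe_beam_search(struct whisper_context * ctx, const float * samples, int n_samples) {
    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH);
    wparams.beam_search.beam_size = 5; // number of parallel decoders
    wparams.greedy.best_of        = 5; // candidates kept when falling back to sampling
    return whisper_full(ctx, wparams, samples, n_samples);
}
```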
43 changed files with 35821 additions and 986 deletions


@@ -320,6 +320,13 @@ jobs:
        cd ./build
        msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
+     - name: Copy CUDA DLLs
+       run: >
+         Copy-Item -PassThru
+         -Path "${{ steps.cuda-toolkit.outputs.CUDA_PATH }}/bin/*.dll"
+         -Include cudart64_*,cublas64_*,cublasLt64_*
+         -Destination build/bin/${{ matrix.build }}
      - name: Copy SDL2.dll
        if: matrix.sdl2 == 'ON'
        run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}

.gitignore (1 addition)

@@ -31,6 +31,7 @@ build-sanitize-thread/
 /talk-llama
 /bench
 /quantize
+/server
 /lsp
 arm_neon.h


@@ -1,6 +1,6 @@
 cmake_minimum_required (VERSION 3.5)
-project(whisper.cpp VERSION 1.4.3)
+project(whisper.cpp VERSION 1.5.0)
 # Add path to modules
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")


@@ -1,4 +1,4 @@
-default: main bench quantize
+default: main bench quantize server
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
@@ -338,7 +338,7 @@ libwhisper.so: $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so $(WHISPER_OBJ) $(LDFLAGS)
 clean:
-	rm -f *.o main stream command talk talk-llama bench quantize lsp libwhisper.a libwhisper.so
+	rm -f *.o main stream command talk talk-llama bench quantize server lsp libwhisper.a libwhisper.so
 #
 # Examples
@@ -359,6 +359,9 @@ bench: examples/bench/bench.cpp $(WHISPER_OBJ)
 quantize: examples/quantize/quantize.cpp $(WHISPER_OBJ) $(SRC_COMMON)
 	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o quantize $(LDFLAGS)
+server: examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o server $(LDFLAGS)
 stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
@@ -418,9 +421,9 @@ samples:
 .PHONY: medium
 .PHONY: large-v1
 .PHONY: large-v2
-.PHONY: large
+.PHONY: large-v3
-tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large: main
+tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3: main
 	bash ./models/download-ggml-model.sh $@
 	@echo ""
 	@echo "==============================================="


@@ -6,7 +6,7 @@
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)
-Beta: [v1.4.3](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.4.3) / Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+Stable: [v1.5.0](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.5.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
@@ -231,18 +231,18 @@ make medium.en
 make medium
 make large-v1
 make large-v2
-make large
+make large-v3
 ```
 ## Memory usage
-| Model  | Disk   | Mem     | SHA                                        |
-| ---    | ---    | ---     | ---                                        |
-| tiny   |  75 MB | ~125 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
-| base   | 142 MB | ~210 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
-| small  | 466 MB | ~600 MB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
-| medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
-| large  | 2.9 GB | ~3.3 GB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
+| Model  | Disk    | Mem     |
+| ---    | ---     | ---     |
+| tiny   |  75 MiB | ~273 MB |
+| base   | 142 MiB | ~388 MB |
+| small  | 466 MiB | ~852 MB |
+| medium | 1.5 GiB | ~2.1 GB |
+| large  | 2.9 GiB | ~3.9 GB |
 ## Quantization
## Quantization ## Quantization


@@ -24,7 +24,7 @@ const (
 var (
 	// The models which will be downloaded, if no model is specified as an argument
-	modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large"}
+	modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large-v3"}
 )
 var (


@@ -45,7 +45,7 @@ class WhisperCppTest {
         assertEquals(16384, params.n_max_text_ctx);
         assertFalse(params.translate);
         assertEquals(0.01f, params.thold_pt);
-        assertEquals(2, params.beam_search.beam_size);
+        assertEquals(5, params.beam_search.beam_size);
         assertEquals(-1.0f, params.beam_search.patience);
     }
@@ -58,7 +58,7 @@ class WhisperCppTest {
         assertEquals(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY.ordinal(), params.strategy);
         assertNotEquals(0, params.n_threads);
         assertEquals(16384, params.n_max_text_ctx);
-        assertEquals(2, params.greedy.best_of);
+        assertEquals(5, params.greedy.best_of);
     }
     @Test


@@ -1,6 +1,6 @@
 {
   "name": "whisper.cpp",
-  "version": "1.4.3",
+  "version": "1.5.0",
   "description": "Whisper speech recognition",
   "main": "whisper.js",
   "scripts": {

File diff suppressed because one or more lines are too long


@@ -65,6 +65,7 @@ elseif(CMAKE_JS_VERSION)
 else()
     add_subdirectory(main)
     add_subdirectory(stream)
+    add_subdirectory(server)
     add_subdirectory(command)
     add_subdirectory(bench)
     add_subdirectory(quantize)


@@ -81,7 +81,7 @@ int whisper_bench_full(const whisper_params & params) {
     }
     // heat encoder
     if (int ret = whisper_encode(ctx, 0, params.n_threads) != 0) {
-        fprintf(stderr, "error: failed to encode model: %d\n", ret);
+        fprintf(stderr, "error: failed to encode: %d\n", ret);
         return 4;
     }
@@ -90,13 +90,13 @@ int whisper_bench_full(const whisper_params & params) {
     // prompt heat
     if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {
-        fprintf(stderr, "error: failed to encode model: %d\n", ret);
+        fprintf(stderr, "error: failed to decode: %d\n", ret);
         return 4;
     }
     // text-generation heat
     if (int ret = whisper_decode(ctx, tokens, 1, 256, params.n_threads) != 0) {
-        fprintf(stderr, "error: failed to encode model: %d\n", ret);
+        fprintf(stderr, "error: failed to decode: %d\n", ret);
         return 4;
     }
@@ -104,20 +104,30 @@ int whisper_bench_full(const whisper_params & params) {
     // actual run
     if (int ret = whisper_encode(ctx, 0, params.n_threads) != 0) {
-        fprintf(stderr, "error: failed to encode model: %d\n", ret);
+        fprintf(stderr, "error: failed to encode: %d\n", ret);
         return 4;
     }
-    for (int i = 0; i < 16; i++) {
-        if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {
-            fprintf(stderr, "error: failed to encode model: %d\n", ret);
-            return 4;
-        }
-    }
-    for (int i = 0; i < 256; i++) {
-        if (int ret = whisper_decode(ctx, tokens, 1, i, params.n_threads) != 0) {
-            fprintf(stderr, "error: failed to encode model: %d\n", ret);
+    // text-generation
+    for (int i = 0; i < 256; i++) {
+        if (int ret = whisper_decode(ctx, tokens, 1, i, params.n_threads) != 0) {
+            fprintf(stderr, "error: failed to decode: %d\n", ret);
+            return 4;
+        }
+    }
+    // batched decoding
+    for (int i = 0; i < 64; i++) {
+        if (int ret = whisper_decode(ctx, tokens, 5, 0, params.n_threads) != 0) {
+            fprintf(stderr, "error: failed to decode: %d\n", ret);
+            return 4;
+        }
+    }
+    // prompt processing
+    for (int i = 0; i < 16; i++) {
+        if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {
+            fprintf(stderr, "error: failed to decode: %d\n", ret);
             return 4;
         }
     }


@@ -9,6 +9,11 @@ static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
     {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
     {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
     {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
+    {"q2_k", GGML_FTYPE_MOSTLY_Q2_K},
+    {"q3_k", GGML_FTYPE_MOSTLY_Q3_K},
+    {"q4_k", GGML_FTYPE_MOSTLY_Q4_K},
+    {"q5_k", GGML_FTYPE_MOSTLY_Q5_K},
+    {"q6_k", GGML_FTYPE_MOSTLY_Q6_K},
 };
 void ggml_print_ftypes(FILE * fp) {
@@ -48,15 +53,15 @@ bool ggml_common_quantize_0(
         case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
         case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
         case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
+        case GGML_FTYPE_MOSTLY_Q2_K: qtype = GGML_TYPE_Q2_K; break;
+        case GGML_FTYPE_MOSTLY_Q3_K: qtype = GGML_TYPE_Q3_K; break;
+        case GGML_FTYPE_MOSTLY_Q4_K: qtype = GGML_TYPE_Q4_K; break;
+        case GGML_FTYPE_MOSTLY_Q5_K: qtype = GGML_TYPE_Q5_K; break;
+        case GGML_FTYPE_MOSTLY_Q6_K: qtype = GGML_TYPE_Q6_K; break;
         case GGML_FTYPE_UNKNOWN:
         case GGML_FTYPE_ALL_F32:
         case GGML_FTYPE_MOSTLY_F16:
         case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
-        case GGML_FTYPE_MOSTLY_Q2_K:
-        case GGML_FTYPE_MOSTLY_Q3_K:
-        case GGML_FTYPE_MOSTLY_Q4_K:
-        case GGML_FTYPE_MOSTLY_Q5_K:
-        case GGML_FTYPE_MOSTLY_Q6_K:
             {
                 fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
                 return false;
@@ -167,24 +172,17 @@ bool ggml_common_quantize_0(
             switch ((ggml_type) ttype) {
                 case GGML_TYPE_Q4_0:
-                    {
-                        cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
-                    } break;
                 case GGML_TYPE_Q4_1:
-                    {
-                        cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
-                    } break;
                 case GGML_TYPE_Q5_0:
-                    {
-                        cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
-                    } break;
                 case GGML_TYPE_Q5_1:
-                    {
-                        cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
-                    } break;
                 case GGML_TYPE_Q8_0:
+                case GGML_TYPE_Q2_K:
+                case GGML_TYPE_Q3_K:
+                case GGML_TYPE_Q4_K:
+                case GGML_TYPE_Q5_K:
+                case GGML_TYPE_Q6_K:
                     {
-                        cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                        cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements, hist_cur.data());
                     } break;
                 case GGML_TYPE_F32:
                 case GGML_TYPE_F16:
@@ -192,11 +190,6 @@ bool ggml_common_quantize_0(
                 case GGML_TYPE_I16:
                 case GGML_TYPE_I32:
                 case GGML_TYPE_Q8_1:
-                case GGML_TYPE_Q2_K:
-                case GGML_TYPE_Q3_K:
-                case GGML_TYPE_Q4_K:
-                case GGML_TYPE_Q5_K:
-                case GGML_TYPE_Q6_K:
                 case GGML_TYPE_Q8_K:
                 case GGML_TYPE_COUNT:
                     {


@@ -139,10 +139,13 @@ void audio_async::callback(uint8_t * stream, int len) {
         return;
     }
-    const size_t n_samples = len / sizeof(float);
-    m_audio_new.resize(n_samples);
-    memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
+    size_t n_samples = len / sizeof(float);
+    if (n_samples > m_audio.size()) {
+        n_samples = m_audio.size();
+        stream += (len - (n_samples * sizeof(float)));
+    }
     //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
@@ -153,7 +156,7 @@ void audio_async::callback(uint8_t * stream, int len) {
             const size_t n0 = m_audio.size() - m_audio_pos;
             memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
-            memcpy(&m_audio[0], &stream[n0], (n_samples - n0) * sizeof(float));
+            memcpy(&m_audio[0], stream + n0 * sizeof(float), (n_samples - n0) * sizeof(float));
             m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
             m_audio_len = m_audio.size();


@@ -41,7 +41,6 @@ private:
     std::mutex m_mutex;
     std::vector<float> m_audio;
-    std::vector<float> m_audio_new;
     size_t m_audio_pos = 0;
     size_t m_audio_len = 0;
 };


@@ -48,7 +48,7 @@ if [ -n "$3" ]; then
 fi
 # Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )
 # list available models
 function list_models {


@@ -62,8 +62,8 @@ struct whisper_params {
     int32_t progress_step = 5;
     int32_t max_context = -1;
     int32_t max_len = 0;
-    int32_t best_of = 2;
-    int32_t beam_size = -1;
+    int32_t best_of = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of;
+    int32_t beam_size = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size;
     float word_thold = 0.01f;
     float entropy_thold = 2.40f;
@@ -165,8 +165,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
     else if (arg == "-m"    || arg == "--model")       { params.model = argv[++i]; }
     else if (arg == "-f"    || arg == "--file")        { params.fname_inp.emplace_back(argv[++i]); }
     else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
     else if (arg == "-ls"   || arg == "--log-score")   { params.log_score = true; }
     else if (arg == "-ng"   || arg == "--no-gpu")      { params.use_gpu = false; }
     else {
         fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
         whisper_print_usage(argc, argv, params);
@@ -925,9 +925,9 @@ int main(int argc, char ** argv) {
         if (params.detect_language) {
             params.language = "auto";
         }
-        fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, %stimestamps = %d ...\n",
+        fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, %d beams + best of %d, lang = %s, task = %s, %stimestamps = %d ...\n",
                 __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
-                params.n_threads, params.n_processors,
+                params.n_threads, params.n_processors, params.beam_size, params.best_of,
                 params.language.c_str(),
                 params.translate ? "translate" : "transcribe",
                 params.tinydiarize ? "tdrz = 1, " : "",


@@ -0,0 +1,6 @@
set(TARGET server)
add_executable(${TARGET} server.cpp httplib.h json.hpp)
include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})

examples/server/README.md (new file, 59 lines)

@@ -0,0 +1,59 @@
# whisper.cpp http server
Simple http server. WAV Files are passed to the inference model via http requests.
```
./server -h
usage: ./bin/server [options]
options:
-h, --help [default] show this help message and exit
-t N, --threads N [4 ] number of threads to use during computation
-p N, --processors N [1 ] number of processors to use during computation
-ot N, --offset-t N [0 ] time offset in milliseconds
-on N, --offset-n N [0 ] segment index offset
-d N, --duration N [0 ] duration of audio to process in milliseconds
-mc N, --max-context N [-1 ] maximum number of text context tokens to store
-ml N, --max-len N [0 ] maximum segment length in characters
-sow, --split-on-word [false ] split on word rather than on token
-bo N, --best-of N [2 ] number of best candidates to keep
-bs N, --beam-size N [-1 ] beam size for beam search
-wt N, --word-thold N [0.01 ] word timestamp probability threshold
-et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
-lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
-debug, --debug-mode [false ] enable debug mode (eg. dump log_mel)
-tr, --translate [false ] translate from source language to english
-di, --diarize [false ] stereo audio diarization
-tdrz, --tinydiarize [false ] enable tinydiarize (requires a tdrz model)
-nf, --no-fallback [false ] do not use temperature fallback while decoding
-ps, --print-special [false ] print special tokens
-pc, --print-colors [false ] print colors
-pp, --print-progress [false ] print progress
-nt, --no-timestamps [false ] do not print timestamps
-l LANG, --language LANG [en ] spoken language ('auto' for auto-detect)
-dl, --detect-language [false ] exit after automatically detecting language
--prompt PROMPT [ ] initial prompt
-m FNAME, --model FNAME [models/ggml-base.en.bin] model path
-oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference
--host HOST, [127.0.0.1] Hostname/ip-adress for the server
--port PORT, [8080 ] Port number for the server
```
## request examples
**/inference**
```
curl 127.0.0.1:8080/inference \
-H "Content-Type: multipart/form-data" \
-F file="@<file-path>" \
-F temperature="0.2" \
-F response-format="json"
```
**/load**
```
curl 127.0.0.1:8080/load \
-H "Content-Type: multipart/form-data" \
-F model="<path-to-model-file>"
```
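
For reference, with the default `json` response format the /inference endpoint answers with a single JSON object carrying the transcription text (illustrative response below, not part of the committed README):

```
{"text": " And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country."}
```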

examples/server/httplib.h (new file, 9262 lines; diff suppressed because it is too large)

examples/server/json.hpp (new file, 24596 lines; diff suppressed because it is too large)

examples/server/server.cpp (new file, 699 lines)

@@ -0,0 +1,699 @@
#include "common.h"
#include "whisper.h"
#include "httplib.h"
#include "json.hpp"
#include <cmath>
#include <fstream>
#include <cstdio>
#include <string>
#include <thread>
#include <vector>
#include <cstring>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
using namespace httplib;
using json = nlohmann::json;
namespace {
// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
// Lowest is red, middle is yellow, highest is green.
const std::vector<std::string> k_colors = {
"\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
"\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
};
// output formats
const std::string json_format = "json";
const std::string text_format = "text";
const std::string srt_format = "srt";
const std::string vjson_format = "verbose_json";
const std::string vtt_format = "vtt";
struct server_params
{
std::string hostname = "127.0.0.1";
std::string public_path = "examples/server/public";
int32_t port = 8080;
int32_t read_timeout = 600;
int32_t write_timeout = 600;
};
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t n_processors = 1;
int32_t offset_t_ms = 0;
int32_t offset_n = 0;
int32_t duration_ms = 0;
int32_t progress_step = 5;
int32_t max_context = -1;
int32_t max_len = 0;
int32_t best_of = 2;
int32_t beam_size = -1;
float word_thold = 0.01f;
float entropy_thold = 2.40f;
float logprob_thold = -1.00f;
float userdef_temp = 0.20f;
bool speed_up = false;
bool debug_mode = false;
bool translate = false;
bool detect_language = false;
bool diarize = false;
bool tinydiarize = false;
bool split_on_word = false;
bool no_fallback = false;
bool print_special = false;
bool print_colors = false;
bool print_progress = false;
bool no_timestamps = false;
bool use_gpu = true;
std::string language = "en";
std::string prompt = "";
std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
std::string model = "models/ggml-base.en.bin";
std::string response_format = json_format;
// [TDRZ] speaker turn string
std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line
std::string openvino_encode_device = "CPU";
};
// 500 -> 00:05.000
// 6000 -> 01:00.000
std::string to_timestamp(int64_t t, bool comma = false) {
int64_t msec = t * 10;
int64_t hr = msec / (1000 * 60 * 60);
msec = msec - hr * (1000 * 60 * 60);
int64_t min = msec / (1000 * 60);
msec = msec - min * (1000 * 60);
int64_t sec = msec / 1000;
msec = msec - sec * 1000;
char buf[32];
snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
return std::string(buf);
}
int timestamp_to_sample(int64_t t, int n_samples) {
return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
}
bool is_file_exist(const char *fileName)
{
std::ifstream infile(fileName);
return infile.good();
}
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params,
const server_params& sparams) {
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options] \n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors);
fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms);
fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n);
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false");
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
// fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false");
fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "true" : "false");
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
fprintf(stderr, " -dl, --detect-language [%-7s] exit after automatically detecting language\n", params.detect_language ? "true" : "false");
fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
// server params
fprintf(stderr, " --host HOST, [%-7s] Hostname/ip-adress for the server\n", sparams.hostname.c_str());
fprintf(stderr, " --port PORT, [%-7d] Port number for the server\n", sparams.port);
fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str());
fprintf(stderr, "\n");
}
bool whisper_params_parse(int argc, char ** argv, whisper_params & params, server_params & sparams) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params, sparams);
exit(0);
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-p" || arg == "--processors") { params.n_processors = std::stoi(argv[++i]); }
else if (arg == "-ot" || arg == "--offset-t") { params.offset_t_ms = std::stoi(argv[++i]); }
else if (arg == "-on" || arg == "--offset-n") { params.offset_n = std::stoi(argv[++i]); }
else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(argv[++i]); }
else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(argv[++i]); }
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); }
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
// else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; }
else if (arg == "-sow" || arg == "--split-on-word") { params.split_on_word = true; }
else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-dl" || arg == "--detect-language") { params.detect_language = true; }
else if ( arg == "--prompt") { params.prompt = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
// server params
else if ( arg == "--port") { sparams.port = std::stoi(argv[++i]); }
else if ( arg == "--host") { sparams.hostname = argv[++i]; }
else if ( arg == "--public") { sparams.public_path = argv[++i]; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params, sparams);
exit(0);
}
}
return true;
}
struct whisper_print_user_data {
const whisper_params * params;
const std::vector<std::vector<float>> * pcmf32s;
int progress_prev;
};
std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
std::string speaker = "";
const int64_t n_samples = pcmf32s[0].size();
const int64_t is0 = timestamp_to_sample(t0, n_samples);
const int64_t is1 = timestamp_to_sample(t1, n_samples);
double energy0 = 0.0f;
double energy1 = 0.0f;
for (int64_t j = is0; j < is1; j++) {
energy0 += fabs(pcmf32s[0][j]);
energy1 += fabs(pcmf32s[1][j]);
}
if (energy0 > 1.1*energy1) {
speaker = "0";
} else if (energy1 > 1.1*energy0) {
speaker = "1";
} else {
speaker = "?";
}
//printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, speaker = %s\n", is0, is1, energy0, energy1, speaker.c_str());
if (!id_only) {
speaker.insert(0, "(speaker ");
speaker.append(")");
}
return speaker;
}
void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
int * progress_prev = &(((whisper_print_user_data *) user_data)->progress_prev);
if (progress >= *progress_prev + progress_step) {
*progress_prev += progress_step;
fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress);
}
}
void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
const auto & params = *((whisper_print_user_data *) user_data)->params;
const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;
const int n_segments = whisper_full_n_segments(ctx);
std::string speaker = "";
int64_t t0 = 0;
int64_t t1 = 0;
// print the last n_new segments
const int s0 = n_segments - n_new;
if (s0 == 0) {
printf("\n");
}
for (int i = s0; i < n_segments; i++) {
if (!params.no_timestamps || params.diarize) {
t0 = whisper_full_get_segment_t0(ctx, i);
t1 = whisper_full_get_segment_t1(ctx, i);
}
if (!params.no_timestamps) {
printf("[%s --> %s] ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
}
if (params.diarize && pcmf32s.size() == 2) {
speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
}
if (params.print_colors) {
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
if (params.print_special == false) {
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
if (id >= whisper_token_eot(ctx)) {
continue;
}
}
const char * text = whisper_full_get_token_text(ctx, i, j);
const float p = whisper_full_get_token_p (ctx, i, j);
const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) (std::pow(p, 3)*float(k_colors.size()))));
printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
}
} else {
const char * text = whisper_full_get_segment_text(ctx, i);
printf("%s%s", speaker.c_str(), text);
}
if (params.tinydiarize) {
if (whisper_full_get_segment_speaker_turn_next(ctx, i)) {
printf("%s", params.tdrz_speaker_turn.c_str());
}
}
// with timestamps or speakers: each segment on new line
if (!params.no_timestamps || params.diarize) {
printf("\n");
}
fflush(stdout);
}
}
std::string output_str(struct whisper_context * ctx, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
std::stringstream result;
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
std::string speaker = "";
if (params.diarize && pcmf32s.size() == 2)
{
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
}
result << speaker << text << "\n";
}
return result.str();
}
void get_req_parameters(const Request & req, whisper_params & params)
{
// user model configuration from the request fields
if (req.has_file("offset-t"))
{
params.offset_t_ms = std::stoi(req.get_file_value("offset-t").content);
}
if (req.has_file("offset-n"))
{
params.offset_n = std::stoi(req.get_file_value("offset-n").content);
}
if (req.has_file("duration"))
{
params.duration_ms = std::stoi(req.get_file_value("duration").content);
}
if (req.has_file("max-context"))
{
params.max_context = std::stoi(req.get_file_value("max-context").content);
}
if (req.has_file("prompt"))
{
params.prompt = req.get_file_value("prompt").content;
}
if (req.has_file("response-format"))
{
params.response_format = req.get_file_value("response-format").content;
}
if (req.has_file("temerature"))
{
params.userdef_temp = std::stof(req.get_file_value("temperature").content);
}
}
} // namespace
int main(int argc, char ** argv) {
whisper_params params;
server_params sparams;
std::mutex whisper_mutex;
if (whisper_params_parse(argc, argv, params, sparams) == false) {
whisper_print_usage(argc, argv, params, sparams);
return 1;
}
if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params, sparams);
exit(0);
}
if (params.diarize && params.tinydiarize) {
fprintf(stderr, "error: cannot use both --diarize and --tinydiarize\n");
whisper_print_usage(argc, argv, params, sparams);
exit(0);
}
// whisper init
struct whisper_context_params cparams;
cparams.use_gpu = params.use_gpu;
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
if (ctx == nullptr) {
fprintf(stderr, "error: failed to initialize whisper context\n");
return 3;
}
// initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);
Server svr;
std::string const default_content = "<html>hello</html>";
// this is only called if no index.html is found in the public --path
svr.Get("/", [&default_content](const Request &, Response &res){
res.set_content(default_content, "text/html");
return false;
});
svr.Post("/inference", [&](const Request &req, Response &res){
// aquire whisper model mutex lock
whisper_mutex.lock();
// first check user requested fields of the request
if (!req.has_file("file"))
{
fprintf(stderr, "error: no 'file' field in the request\n");
const std::string error_resp = "{\"error\":\"no 'file' field in the request\"}";
res.set_content(error_resp, "application/json");
whisper_mutex.unlock();
return;
}
auto audio_file = req.get_file_value("file");
// check non-required fields
get_req_parameters(req, params);
std::string filename{audio_file.filename};
printf("Received request: %s\n", filename.c_str());
// audio arrays
std::vector<float> pcmf32; // mono-channel F32 PCM
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
// write file to temporary file
std::ofstream temp_file{filename, std::ios::binary};
temp_file << audio_file.content;
// read wav content into pcmf32
if (!::read_wav(filename, pcmf32, pcmf32s, params.diarize)) {
fprintf(stderr, "error: failed to read WAV file '%s'\n", filename.c_str());
const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
res.set_content(error_resp, "application/json");
whisper_mutex.unlock();
return;
}
// remove temp file
std::remove(filename.c_str());
printf("Successfully loaded %s\n", filename.c_str());
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
}
// print some info about the processing
{
fprintf(stderr, "\n");
if (!whisper_is_multilingual(ctx)) {
if (params.language != "en" || params.translate) {
params.language = "en";
params.translate = false;
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
}
}
if (params.detect_language) {
params.language = "auto";
}
fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, %stimestamps = %d ...\n",
__func__, filename.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
params.n_threads, params.n_processors,
params.language.c_str(),
params.translate ? "translate" : "transcribe",
params.tinydiarize ? "tdrz = 1, " : "",
params.no_timestamps ? 0 : 1);
fprintf(stderr, "\n");
}
// run the inference
{
printf("Running whisper.cpp inference on %s\n", filename.c_str());
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
wparams.print_realtime = false;
wparams.print_progress = params.print_progress;
wparams.print_timestamps = !params.no_timestamps;
wparams.print_special = params.print_special;
wparams.translate = params.translate;
wparams.language = params.language.c_str();
wparams.detect_language = params.detect_language;
wparams.n_threads = params.n_threads;
wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
wparams.offset_ms = params.offset_t_ms;
wparams.duration_ms = params.duration_ms;
wparams.thold_pt = params.word_thold;
wparams.split_on_word = params.split_on_word;
wparams.speed_up = params.speed_up;
wparams.debug_mode = params.debug_mode;
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
wparams.initial_prompt = params.prompt.c_str();
wparams.greedy.best_of = params.best_of;
wparams.beam_search.beam_size = params.beam_size;
wparams.temperature_inc = params.userdef_temp;
wparams.entropy_thold = params.entropy_thold;
wparams.logprob_thold = params.logprob_thold;
whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
// this callback is called on each new segment
if (!wparams.print_realtime) {
wparams.new_segment_callback = whisper_print_segment_callback;
wparams.new_segment_callback_user_data = &user_data;
}
if (wparams.print_progress) {
wparams.progress_callback = whisper_print_progress_callback;
wparams.progress_callback_user_data = &user_data;
}
// examples for abort mechanism
// in examples below, we do not abort the processing, but we could if the flag is set to true
// the callback is called before every encoder run - if it returns false, the processing is aborted
{
static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
bool is_aborted = *(bool*)user_data;
return !is_aborted;
};
wparams.encoder_begin_callback_user_data = &is_aborted;
}
// the callback is called before every computation - if it returns true, the computation is aborted
{
static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
wparams.abort_callback = [](void * user_data) {
bool is_aborted = *(bool*)user_data;
return is_aborted;
};
wparams.abort_callback_user_data = &is_aborted;
}
if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
const std::string error_resp = "{\"error\":\"failed to process audio\"}";
res.set_content(error_resp, "application/json");
whisper_mutex.unlock();
return;
}
}
// return results to user
if (params.response_format == text_format)
{
std::string results = output_str(ctx, params, pcmf32s);
res.set_content(results.c_str(), "text/html");
}
// TODO add more output formats
else
{
std::string results = output_str(ctx, params, pcmf32s);
json jres = json{
{"text", results}
};
res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
"application/json");
}
// return whisper model mutex lock
whisper_mutex.unlock();
});
svr.Post("/load", [&](const Request &req, Response &res){
whisper_mutex.lock();
if (!req.has_file("model"))
{
fprintf(stderr, "error: no 'model' field in the request\n");
const std::string error_resp = "{\"error\":\"no 'model' field in the request\"}";
res.set_content(error_resp, "application/json");
whisper_mutex.unlock();
return;
}
std::string model = req.get_file_value("model").content;
if (!is_file_exist(model.c_str()))
{
fprintf(stderr, "error: 'model': %s not found!\n", model.c_str());
const std::string error_resp = "{\"error\":\"model not found!\"}";
res.set_content(error_resp, "application/json");
whisper_mutex.unlock();
return;
}
// clean up
whisper_free(ctx);
// whisper init
ctx = whisper_init_from_file_with_params(model.c_str(), cparams);
// TODO perhaps load prior model here instead of exit
if (ctx == nullptr) {
fprintf(stderr, "error: model init failed, no model loaded must exit\n");
exit(1);
}
// initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);
const std::string success = "Load was successful!";
res.set_content(success, "application/text");
// check if the model is in the file system
whisper_mutex.unlock();
});
svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
const char fmt[] = "500 Internal Server Error\n%s";
char buf[BUFSIZ];
try {
std::rethrow_exception(std::move(ep));
} catch (std::exception &e) {
snprintf(buf, sizeof(buf), fmt, e.what());
} catch (...) {
snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
}
res.set_content(buf, "text/plain");
res.status = 500;
});
svr.set_error_handler([](const Request &, Response &res) {
if (res.status == 400) {
res.set_content("Invalid request", "text/plain");
} else if (res.status != 500) {
res.set_content("File Not Found", "text/plain");
res.status = 404;
}
});
// set timeouts and change hostname and port
svr.set_read_timeout(sparams.read_timeout);
svr.set_write_timeout(sparams.write_timeout);
if (!svr.bind_to_port(sparams.hostname, sparams.port))
{
fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n",
sparams.hostname.c_str(), sparams.port);
return 1;
}
// Set the base directory for serving static files
svr.set_base_dir(sparams.public_path);
// to make it ctrl+clickable:
printf("\nwhisper server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
if (!svr.listen_after_bind())
{
return 1;
}
whisper_print_timings(ctx);
whisper_free(ctx);
return 0;
}


@@ -53,7 +53,7 @@ struct whisper_params {
     int32_t capture_id = -1;
     int32_t max_tokens = 32;
     int32_t audio_ctx = 0;
-    int32_t n_gpu_layers = 0;
+    int32_t n_gpu_layers = 999;
     float vad_thold = 0.6f;
     float freq_thold = 100.0f;
@@ -136,7 +136,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
     fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
     fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
     fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
-    fprintf(stderr, " -ngl N, --n-gpu-layers N [%-7s] number of layers to store in VRAM\n", params.n_gpu_layers);
+    fprintf(stderr, " -ngl N, --n-gpu-layers N [%-7d] number of layers to store in VRAM\n", params.n_gpu_layers);
     fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
     fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
     fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
@@ -686,8 +686,8 @@ int main(int argc, char ** argv) {
             }
         }
-        text_to_speak = ::replace(text_to_speak, "\"", "");
-        int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+        text_to_speak = ::replace(text_to_speak, "'", "'\"'\"'");
+        int ret = system((params.speak + " " + std::to_string(voice_id) + " '" + text_to_speak + "'").c_str());
         if (ret != 0) {
             fprintf(stderr, "%s: failed to speak\n", __func__);
         }


@@ -21,7 +21,7 @@ help()
     echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]"
     echo "options:"
     echo "-s Step in seconds (default is $step)."
-    echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large' (default is '$model')."
+    echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large-v3' (default is '$model')."
     echo "-t Number of threads to use."
     echo "-h Print this help page."
     echo


@@ -17,12 +17,12 @@ else
     encoder_only=$2
 fi
 models=( \
     "tiny"   "tiny-q4_0"   "tiny-q4_1"   "tiny-q5_0"   "tiny-q5_1"   "tiny-q8_0" \
     "base"   "base-q4_0"   "base-q4_1"   "base-q5_0"   "base-q5_1"   "base-q8_0" \
     "small"  "small-q4_0"  "small-q4_1"  "small-q5_0"  "small-q5_1"  "small-q8_0" \
-    "medium" "medium-q4_0" "medium-q4_1" "medium-q5_0" "medium-q5_1" "medium-q8_0" \
-    "large"  "large-q4_0"  "large-q4_1"  "large-q5_0"  "large-q5_1"  "large-q8_0" \
+    "medium" "medium-q4_0" "medium-q4_1" "medium-q5_0" "medium-q5_1" "medium-q8_0" "medium-dis" \
+    "large-v2" "large-v2-q4_0" "large-v2-q4_1" "large-v2-q5_0" "large-v2-q5_1" "large-v2-q8_0" "large-v2-dis" \
 )
 if [ "$encoder_only" -eq 0 ]; then
@@ -44,8 +44,8 @@ if [ "$encoder_only" -eq 0 ]; then
     printf "\n"
 fi
-printf "| %6s | %6s | %16s | %11s | %3s | %7s | %7s | %7s | %7s |\n" "CPU" "OS" "Config" "Model" "Th" "Enc." "Dec." "PP" "Commit"
-printf "| %6s | %6s | %16s | %11s | %3s | %7s | %7s | %7s | %7s |\n" "---" "---" "---" "---" "---" "---" "---" "---" "---"
+printf "| %6s | %6s | %16s | %13s | %3s | %7s | %7s | %7s | %7s | %7s |\n" "CPU" "OS" "Config" "Model" "Th" "Enc." "Dec." "Bch5" "PP" "Commit"
+printf "| %6s | %6s | %16s | %13s | %3s | %7s | %7s | %7s | %7s | %7s |\n" "---" "---" "---" "---" "---" "---" "---" "---" "---" "---"
 for model in "${models[@]}"; do
     # actual run
@@ -56,6 +56,7 @@ for model in "${models[@]}"; do
     # parse the output:
     encode_time=$(echo "$output" | grep "encode time" | awk '{print $11}')
     decode_time=$(echo "$output" | grep "decode time" | awk '{print $11}')
+    batchd_time=$(echo "$output" | grep "batchd time" | awk '{print $11}')
     prompt_time=$(echo "$output" | grep "prompt time" | awk '{print $11}')
     system_info=$(echo "$output" | grep "system_info")
     n_threads=$(echo "$output" | grep "system_info" | awk '{print $4}')
@@ -94,6 +95,6 @@ for model in "${models[@]}"; do
     commit=$(git rev-parse --short HEAD)
     if [ $ret -eq 0 ]; then
-        printf "| <todo> | <todo> | %16s | %11s | %3s | %7s | %7s | %7s | %7s |\n" "$config" "$model" "$n_threads" "$encode_time" "$decode_time" "$prompt_time" "$commit"
+        printf "| <todo> | <todo> | %16s | %13s | %3s | %7s | %7s | %7s | %7s | %7s |\n" "$config" "$model" "$n_threads" "$encode_time" "$decode_time" "$batchd_time" "$prompt_time" "$commit"
     fi
 done


@@ -1,6 +1,6 @@
 #!/bin/bash
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )
 for model in "${models[@]}"; do
     python3 models/convert-pt-to-ggml.py ~/.cache/whisper/$model.pt ../whisper models/


@ -446,12 +446,14 @@ static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * n
return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)]; return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)];
} }
static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view) { static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
ggml_tallocr_t alloc = node_tallocr(galloc, view); ggml_tallocr_t alloc = node_tallocr(galloc, view);
//printf("init_view: %s from src %s\n", view->name, view->view_src->name); //printf("init_view: %s from src %s\n", view->name, view->view_src->name);
GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL); GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
view->backend = view->view_src->backend; if (update_backend) {
view->backend = view->view_src->backend;
}
view->buffer = view->view_src->buffer; view->buffer = view->view_src->buffer;
view->data = (char *)view->view_src->data + view->view_offs; view->data = (char *)view->view_src->data + view->view_offs;
@ -469,7 +471,7 @@ static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
if (node->data == NULL) { if (node->data == NULL) {
if (ggml_is_view(node)) { if (ggml_is_view(node)) {
init_view(galloc, node); init_view(galloc, node, true);
} else { } else {
// see if we can reuse a parent's buffer (inplace) // see if we can reuse a parent's buffer (inplace)
if (ggml_op_can_inplace(node->op)) { if (ggml_op_can_inplace(node->op)) {
@ -499,15 +501,14 @@ static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
node->view_src = view_src; node->view_src = view_src;
view_src_hn->n_views += 1; view_src_hn->n_views += 1;
init_view(galloc, node); init_view(galloc, node, false);
return; return;
} }
} } else {
else {
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
node->view_src = parent; node->view_src = parent;
p_hn->n_views += 1; p_hn->n_views += 1;
init_view(galloc, node); init_view(galloc, node, false);
return; return;
} }
} }
@ -537,7 +538,7 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
hash_get(galloc, view_src)->n_views += 1; hash_get(galloc, view_src)->n_views += 1;
if (node->buffer == NULL && node->data != NULL) { if (node->buffer == NULL && node->data != NULL) {
// view of a pre-allocated tensor, didn't call init_view() yet // view of a pre-allocated tensor, didn't call init_view() yet
init_view(galloc, node); init_view(galloc, node, true);
} }
} }
@ -548,7 +549,7 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
} }
hash_get(galloc, parent)->n_children += 1; hash_get(galloc, parent)->n_children += 1;
if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) { if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
init_view(galloc, parent); init_view(galloc, parent, true);
} }
} }
} }
@ -663,7 +664,7 @@ size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, st
return max_size; return max_size;
} }
void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_alloct) { void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) {
const size_t hash_size = hash_set.size; const size_t hash_size = hash_set.size;
GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs)); GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
@ -686,7 +687,7 @@ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * grap
// reset hash values // reset hash values
memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size); memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
galloc->hash_allocs = hash_node_alloct; galloc->hash_allocs = hash_node_talloc;
ggml_tallocr_alloc_graph_impl(galloc, graph); ggml_tallocr_alloc_graph_impl(galloc, graph);
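A hedged reading of the new init_view() flag (an interpretation based on the call sites above, not text from the patch):

    // init_view(galloc, t, /*update_backend=*/true)  : t is a view of a pre-allocated
    //   tensor, so it inherits view_src->backend before taking its buffer and data.
    // init_view(galloc, t, /*update_backend=*/false) : t was just turned into an in-place
    //   view of a parent (or of the parent's view_src) during graph allocation; its
    //   backend is already correct and must not be overwritten by the parent's.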


@ -39,7 +39,6 @@
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess #define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess #define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
#define cudaDeviceGetMemPool hipDeviceGetMemPool
#define cudaDeviceProp hipDeviceProp_t #define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize #define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t #define cudaError_t hipError_t
@ -49,7 +48,6 @@
#define cudaEvent_t hipEvent_t #define cudaEvent_t hipEvent_t
#define cudaEventDestroy hipEventDestroy #define cudaEventDestroy hipEventDestroy
#define cudaFree hipFree #define cudaFree hipFree
#define cudaFreeAsync hipFreeAsync
#define cudaFreeHost hipHostFree #define cudaFreeHost hipHostFree
#define cudaGetDevice hipGetDevice #define cudaGetDevice hipGetDevice
#define cudaGetDeviceCount hipGetDeviceCount #define cudaGetDeviceCount hipGetDeviceCount
@ -57,7 +55,6 @@
#define cudaGetErrorString hipGetErrorString #define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError #define cudaGetLastError hipGetLastError
#define cudaMalloc hipMalloc #define cudaMalloc hipMalloc
#define cudaMallocFromPoolAsync hipMallocFromPoolAsync
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) #define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
#define cudaMemcpy hipMemcpy #define cudaMemcpy hipMemcpy
#define cudaMemcpy2DAsync hipMemcpy2DAsync #define cudaMemcpy2DAsync hipMemcpy2DAsync
@ -66,9 +63,6 @@
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost #define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice #define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyKind hipMemcpyKind #define cudaMemcpyKind hipMemcpyKind
#define cudaMemPool_t hipMemPool_t
#define cudaMemPoolAttrReleaseThreshold hipMemPoolAttrReleaseThreshold
#define cudaMemPoolSetAttribute hipMemPoolSetAttribute
#define cudaMemset hipMemset #define cudaMemset hipMemset
#define cudaMemsetAsync hipMemsetAsync #define cudaMemsetAsync hipMemsetAsync
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
@ -94,6 +88,8 @@
#define CC_OFFSET_AMD 1000000 #define CC_OFFSET_AMD 1000000
#define CC_RDNA2 (CC_OFFSET_AMD + 1030) #define CC_RDNA2 (CC_OFFSET_AMD + 1030)
#define GGML_CUDA_MAX_NODES 8192
// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication // define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
// for large computational tasks. the drawback is that this requires some extra amount of VRAM: // for large computational tasks. the drawback is that this requires some extra amount of VRAM:
@ -188,11 +184,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
do { \ do { \
cudaError_t err_ = (err); \ cudaError_t err_ = (err); \
if (err_ != cudaSuccess) { \ if (err_ != cudaSuccess) { \
int dev_id; \ int id; \
cudaGetDevice(&dev_id); \ cudaGetDevice(&id); \
fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \ fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
cudaGetErrorString(err_)); \ cudaGetErrorString(err_)); \
fprintf(stderr, "current device: %d\n", dev_id); \ fprintf(stderr, "current device: %d\n", id); \
exit(1); \ exit(1); \
} \ } \
} while (0) } while (0)
@ -202,11 +198,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
do { \ do { \
cublasStatus_t err_ = (err); \ cublasStatus_t err_ = (err); \
if (err_ != CUBLAS_STATUS_SUCCESS) { \ if (err_ != CUBLAS_STATUS_SUCCESS) { \
int dev_id; \ int id; \
cudaGetDevice(&dev_id); \ cudaGetDevice(&id); \
fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \ fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \ err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
fprintf(stderr, "current device: %d\n", dev_id); \ fprintf(stderr, "current device: %d\n", id); \
exit(1); \ exit(1); \
} \ } \
} while (0) } while (0)
@ -440,6 +436,8 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
#define CUDA_MUL_BLOCK_SIZE 256 #define CUDA_MUL_BLOCK_SIZE 256
#define CUDA_GELU_BLOCK_SIZE 256 #define CUDA_GELU_BLOCK_SIZE 256
#define CUDA_SILU_BLOCK_SIZE 256 #define CUDA_SILU_BLOCK_SIZE 256
#define CUDA_RELU_BLOCK_SIZE 256
#define CUDA_SQR_BLOCK_SIZE 256
#define CUDA_CPY_BLOCK_SIZE 32 #define CUDA_CPY_BLOCK_SIZE 32
#define CUDA_SCALE_BLOCK_SIZE 256 #define CUDA_SCALE_BLOCK_SIZE 256
#define CUDA_CLAMP_BLOCK_SIZE 256 #define CUDA_CLAMP_BLOCK_SIZE 256
@ -472,7 +470,6 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
#define MAX_STREAMS 8 #define MAX_STREAMS 8
static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr }; static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
static cudaMemPool_t g_cudaMemPools[GGML_CUDA_MAX_DEVICES] = { nullptr };
struct ggml_tensor_extra_gpu { struct ggml_tensor_extra_gpu {
void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
@ -561,6 +558,24 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
dst[i] = x[i] / (1.0f + expf(-x[i])); dst[i] = x[i] / (1.0f + expf(-x[i]));
} }
static __global__ void relu_f32(const float * x, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = fmaxf(x[i], 0);
}
static __global__ void sqr_f32(const float * x, float * dst, const int k) {
const int i = blockDim.x*blockIdx.x + threadIdx.x;
if (i >= k) {
return;
}
dst[i] = x[i] * x[i];
}
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) { static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll #pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) { for (int mask = 16; mask > 0; mask >>= 1) {
@ -990,7 +1005,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
const int row = blockIdx.y*blockDim.y + threadIdx.y; const int row = blockIdx.x*blockDim.y + threadIdx.y;
if (row > nrows) return; if (row > nrows) return;
const int num_blocks_per_row = ncols / QK_K; const int num_blocks_per_row = ncols / QK_K;
@ -1094,7 +1109,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
const int row = blockIdx.y*blockDim.y + threadIdx.y; const int row = blockIdx.x*blockDim.y + threadIdx.y;
if (row > nrows) return; if (row > nrows) return;
const int num_blocks_per_row = ncols / QK_K; const int num_blocks_per_row = ncols / QK_K;
@ -1198,7 +1213,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
const int row = blockIdx.y*blockDim.y + threadIdx.y; const int row = blockIdx.x*blockDim.y + threadIdx.y;
if (row > nrows) return; if (row > nrows) return;
const int num_blocks_per_row = ncols / QK_K; const int num_blocks_per_row = ncols / QK_K;
const int ib0 = row*num_blocks_per_row; const int ib0 = row*num_blocks_per_row;
@ -1452,7 +1467,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
const int row = blockIdx.y*blockDim.y + threadIdx.y; const int row = blockIdx.x*blockDim.y + threadIdx.y;
if (row > nrows) return; if (row > nrows) return;
const int num_blocks_per_row = ncols / QK_K; const int num_blocks_per_row = ncols / QK_K;
@ -4262,7 +4277,7 @@ template <bool need_check> static __global__ void
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda> template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) { static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
const int row = blockIdx.y*blockDim.y + threadIdx.y; const int row = blockIdx.x*blockDim.y + threadIdx.y;
if (row >= nrows) { if (row >= nrows) {
return; return;
@ -4302,7 +4317,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) { static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
// qk = quantized weights per x block // qk = quantized weights per x block
// qr = number of quantized weights per data value in x block // qr = number of quantized weights per data value in x block
const int row = blockIdx.y*blockDim.y + threadIdx.y; const int row = blockIdx.x*blockDim.y + threadIdx.y;
if (row >= nrows) { if (row >= nrows) {
return; return;
@ -4741,7 +4756,7 @@ static __global__ void im2col_f32_f16(
int ofs0, int ofs1, int IW, int IH, int CHW, int ofs0, int ofs1, int IW, int IH, int CHW,
int s0, int s1, int p0, int p1, int d0, int d1) { int s0, int s1, int p0, int p1, int d0, int d1) {
const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0; const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0;
const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1; const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1;
const int offset_dst = const int offset_dst =
(threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW + (threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW +
@ -4793,6 +4808,16 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k); silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
} }
static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}
static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
}
static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % WARP_SIZE == 0); GGML_ASSERT(ncols % WARP_SIZE == 0);
if (ncols < 1024) { if (ncols < 1024) {
@ -4901,7 +4926,8 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1); // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0> dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
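A hedged sketch of why the row dimension moved from the grid's y axis to its x axis (illustrative values; warp_size and mmv_y stand in for the file's WARP_SIZE and GGML_CUDA_MMV_Y): CUDA caps gridDim.y and gridDim.z at 65535 blocks, while gridDim.x allows up to 2^31-1, so a matrix with more than roughly 65535 rows could not be launched with the old dim3(1, block_num_y, 1) layout.

    const int  warp_size   = 32;       // WARP_SIZE
    const int  mmv_y       = 1;        // GGML_CUDA_MMV_Y (assumed default)
    const int  nrows       = 200000;   // hypothetical row count, larger than 65535
    const int  block_num_y = (nrows + mmv_y - 1) / mmv_y;
    const dim3 block_nums(block_num_y, 1, 1);   // rows on x: well within the 2^31-1 limit
    const dim3 block_dims(warp_size, mmv_y, 1);
    // inside the kernels the row index is then recovered as
    //   const int row = blockIdx.x*blockDim.y + threadIdx.y;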
@ -4910,7 +4936,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1> dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@ -4919,7 +4945,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0> dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@ -4928,7 +4954,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1> dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@ -4937,7 +4963,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0> dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@ -4947,7 +4973,7 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f
GGML_ASSERT(ncols % QK_K == 0); GGML_ASSERT(ncols % QK_K == 0);
const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2 const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
const int block_num_y = (nrows + ny - 1) / ny; const int block_num_y = (nrows + ny - 1) / ny;
const dim3 block_nums(1, block_num_y, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(32, ny, 1); const dim3 block_dims(32, ny, 1);
dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows); dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
} }
@ -4956,7 +4982,7 @@ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, f
GGML_ASSERT(ncols % QK_K == 0); GGML_ASSERT(ncols % QK_K == 0);
const int ny = 2 / K_QUANTS_PER_ITERATION; const int ny = 2 / K_QUANTS_PER_ITERATION;
const int block_num_y = (nrows + ny - 1) / ny; const int block_num_y = (nrows + ny - 1) / ny;
const dim3 block_nums(1, block_num_y, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(32, ny, 1); const dim3 block_dims(32, ny, 1);
dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows); dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
} }
@ -4965,7 +4991,7 @@ static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, f
GGML_ASSERT(ncols % QK_K == 0); GGML_ASSERT(ncols % QK_K == 0);
const int ny = 2 / K_QUANTS_PER_ITERATION; const int ny = 2 / K_QUANTS_PER_ITERATION;
const int block_num_y = (nrows + ny - 1) / ny; const int block_num_y = (nrows + ny - 1) / ny;
const dim3 block_nums(1, block_num_y, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(32, ny, 1); const dim3 block_dims(32, ny, 1);
dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows); dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
} }
@ -4980,7 +5006,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
GGML_ASSERT(ncols % QK_K == 0); GGML_ASSERT(ncols % QK_K == 0);
const int ny = 2 / K_QUANTS_PER_ITERATION; const int ny = 2 / K_QUANTS_PER_ITERATION;
const int block_num_y = (nrows + ny - 1) / ny; const int block_num_y = (nrows + ny - 1) / ny;
const dim3 block_nums(1, block_num_y, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(32, ny, 1); const dim3 block_dims(32, ny, 1);
dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows); dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
} }
@ -4988,7 +5014,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK4_0 == 0); GGML_ASSERT(ncols % QK4_0 == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1> mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@ -4997,7 +5023,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK4_1 == 0); GGML_ASSERT(ncols % QK4_1 == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1> mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@ -5006,7 +5032,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK5_0 == 0); GGML_ASSERT(ncols % QK5_0 == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1> mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@ -5015,7 +5041,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK5_1 == 0); GGML_ASSERT(ncols % QK5_1 == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1> mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@ -5024,7 +5050,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK8_0 == 0); GGML_ASSERT(ncols % QK8_0 == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1> mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@ -5033,7 +5059,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0); GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1> mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@ -5042,7 +5068,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0); GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1> mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@ -5051,7 +5077,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0); GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1> mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@ -5060,7 +5086,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0); GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1> mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@ -5069,7 +5095,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % QK_K == 0); GGML_ASSERT(ncols % QK_K == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1> mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
@ -5088,7 +5114,7 @@ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cu
static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
const dim3 block_nums(1, block_num_y, 1); const dim3 block_nums(block_num_y, 1, 1);
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
dequantize_mul_mat_vec<1, 1, convert_f16> dequantize_mul_mat_vec<1, 1, convert_f16>
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows); <<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
@ -5825,16 +5851,6 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
return ptr; return ptr;
} }
static void * ggml_cuda_pool_malloc_async(size_t size, size_t * actual_size, int id, cudaStream_t stream) {
if (g_cudaMemPools[id] == nullptr) {
return ggml_cuda_pool_malloc(size, actual_size);
}
void *ptr;
CUDA_CHECK(cudaMallocFromPoolAsync(&ptr, size, g_cudaMemPools[id], stream));
*actual_size = size;
return ptr;
}
static void ggml_cuda_pool_free(void * ptr, size_t size) { static void ggml_cuda_pool_free(void * ptr, size_t size) {
scoped_spin_lock lock(g_cuda_pool_lock); scoped_spin_lock lock(g_cuda_pool_lock);
int id; int id;
@ -5852,12 +5868,10 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
CUDA_CHECK(cudaFree(ptr)); CUDA_CHECK(cudaFree(ptr));
} }
static bool g_cublas_loaded = false;
static void ggml_cuda_pool_free_async(void * ptr, size_t actual_size, int id, cudaStream_t stream) { bool ggml_cublas_loaded(void) {
if (g_cudaMemPools[id] == nullptr) { return g_cublas_loaded;
return ggml_cuda_pool_free(ptr, actual_size);
}
CUDA_CHECK(cudaFreeAsync(ptr, stream));
} }
void ggml_init_cublas() { void ggml_init_cublas() {
@ -5872,7 +5886,12 @@ void ggml_init_cublas() {
CUDA_CHECK(cudaDeviceSynchronize()); CUDA_CHECK(cudaDeviceSynchronize());
#endif #endif
CUDA_CHECK(cudaGetDeviceCount(&g_device_count)); if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
initialized = true;
g_cublas_loaded = false;
return;
}
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES); GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
int64_t total_vram = 0; int64_t total_vram = 0;
#if defined(GGML_CUDA_FORCE_MMQ) #if defined(GGML_CUDA_FORCE_MMQ)
@ -5914,19 +5933,13 @@ void ggml_init_cublas() {
// create cublas handle // create cublas handle
CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id])); CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH)); CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
// configure memory pool
cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id);
if (err == cudaSuccess) {
size_t treshold = UINT64_MAX;
CUDA_CHECK(cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &treshold));
}
} }
// configure logging to stdout // configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr)); // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
initialized = true; initialized = true;
g_cublas_loaded = true;
} }
} }
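A caller-side sketch of the new graceful initialization, assuming only the declarations visible in this diff (the usage below is an illustration, not code from the patch):

    // CUDA-enabled builds no longer abort on machines without a usable GPU:
    // a failing cudaGetDeviceCount() just leaves the backend marked as not loaded.
    ggml_init_cublas();
    if (ggml_cublas_loaded()) {
        // GPU offload is available; ggml_cuda_can_mul_mat() and
        // ggml_cuda_compute_forward() behave as before.
    } else {
        // fall back to CPU-only execution; the CUDA entry points below guard on the
        // same flag and simply return false.
    }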
@ -6129,6 +6142,9 @@ inline void ggml_cuda_op_add(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
GGML_ASSERT(ggml_is_contiguous(src0));
GGML_ASSERT(ggml_is_contiguous(src1));
GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32);
const int64_t ne10 = src1->ne[0]; const int64_t ne10 = src1->ne[0];
@ -6193,6 +6209,34 @@ inline void ggml_cuda_op_silu(
(void) src1_dd; (void) src1_dd;
} }
inline void ggml_cuda_op_relu(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
(void) src1;
(void) dst;
(void) src1_dd;
}
inline void ggml_cuda_op_sqr(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
(void) src1;
(void) dst;
(void) src1_dd;
}
inline void ggml_cuda_op_norm( inline void ggml_cuda_op_norm(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) { const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
@ -6514,7 +6558,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type); const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
GGML_ASSERT(to_fp16_cuda != nullptr); GGML_ASSERT(to_fp16_cuda != nullptr);
size_t ne = row_diff*ne00; size_t ne = row_diff*ne00;
src0_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &src0_as, id, stream); src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream); to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
} }
const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16; const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
@ -6525,12 +6569,12 @@ inline void ggml_cuda_op_mul_mat_cublas(
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
GGML_ASSERT(to_fp16_cuda != nullptr); GGML_ASSERT(to_fp16_cuda != nullptr);
size_t ne = src1_ncols*ne10; size_t ne = src1_ncols*ne10;
src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &src1_as, id, stream); src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream); to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
} }
const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16; const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
size_t dst_f16_as = 0; size_t dst_as = 0;
half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(row_diff*src1_ncols * sizeof(half), &dst_f16_as, id, stream); half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
const half alpha_f16 = 1.0f; const half alpha_f16 = 1.0f;
const half beta_f16 = 0.0f; const half beta_f16 = 0.0f;
@ -6548,15 +6592,14 @@ inline void ggml_cuda_op_mul_mat_cublas(
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream); to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
if (dst_f16_as != 0) { ggml_cuda_pool_free(dst_f16, dst_as);
ggml_cuda_pool_free_async(dst_f16, dst_f16_as, id, stream);
}
if (src0_as != 0) { if (src0_as != 0) {
ggml_cuda_pool_free_async(src0_as_f16, src0_as, id, stream); ggml_cuda_pool_free(src0_as_f16, src0_as);
} }
if (src1_as != 0) { if (src1_as != 0) {
ggml_cuda_pool_free_async(src1_as_f16, src1_as, id, stream); ggml_cuda_pool_free(src1_as_f16, src1_as);
} }
} }
else { else {
@ -6566,7 +6609,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
if (src0->type != GGML_TYPE_F32) { if (src0->type != GGML_TYPE_F32) {
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
GGML_ASSERT(to_fp32_cuda != nullptr); GGML_ASSERT(to_fp32_cuda != nullptr);
src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc_async(row_diff*ne00 * sizeof(float), &src0_as, id, stream); // NOLINT src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream); to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
} }
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32; const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
@ -6583,7 +6626,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
&beta, dst_dd_i, ldc)); &beta, dst_dd_i, ldc));
if (src0_as != 0) { if (src0_as != 0) {
ggml_cuda_pool_free_async(src0_ddq_as_f32, src0_as, id, stream); ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
} }
} }
@ -7008,6 +7051,8 @@ static void ggml_cuda_op_mul_mat(
int64_t row_low[GGML_CUDA_MAX_DEVICES]; int64_t row_low[GGML_CUDA_MAX_DEVICES];
int64_t row_high[GGML_CUDA_MAX_DEVICES]; int64_t row_high[GGML_CUDA_MAX_DEVICES];
int used_devices = 0;
for (int64_t id = 0; id < g_device_count; ++id) { for (int64_t id = 0; id < g_device_count; ++id) {
// by default, use all rows // by default, use all rows
row_low[id] = 0; row_low[id] = 0;
@ -7035,6 +7080,8 @@ static void ggml_cuda_op_mul_mat(
continue; continue;
} }
used_devices++;
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device; const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device; const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
@ -7045,22 +7092,21 @@ static void ggml_cuda_op_mul_mat(
src0_dd[id] = (char *) src0_extra->data_device[id]; src0_dd[id] = (char *) src0_extra->data_device[id];
} else { } else {
const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0); const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
src0_dd[id] = (char *) ggml_cuda_pool_malloc_async(ggml_nbytes(src0), &src0_as[id], id, stream); src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
} }
if (src1_on_device && src1_is_contiguous) { if (src1_on_device && src1_is_contiguous) {
src1_ddf[id] = (float *) src1_extra->data_device[id]; src1_ddf[id] = (float *) src1_extra->data_device[id];
} else { } else {
src1_ddf[id] = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(src1), &src1_asf[id], id, stream); src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
} }
if (convert_src1_to_q8_1) { if (convert_src1_to_q8_1) {
const size_t size_dst_ddq = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs; src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
src1_ddq[id] = (char *) ggml_cuda_pool_malloc_async(size_dst_ddq, &src1_asq[id], id, stream);
if (src1_on_device && src1_is_contiguous) { if (src1_on_device && src1_is_contiguous) {
quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream); quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
// CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaGetLastError());
} }
} }
@ -7068,18 +7114,18 @@ static void ggml_cuda_op_mul_mat(
dst_dd[id] = (float *) dst_extra->data_device[id]; dst_dd[id] = (float *) dst_extra->data_device[id];
} else { } else {
const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst); const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
dst_dd[id] = (float *) ggml_cuda_pool_malloc_async(size_dst_ddf, &dst_as[id], id, stream); dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
} }
} }
// if multiple devices are used they need to wait for the main device // if multiple devices are used they need to wait for the main device
// here an event is recorded that signals that the main device has finished calculating the input data // here an event is recorded that signals that the main device has finished calculating the input data
if (split && g_device_count > 1) { if (split && used_devices > 1) {
CUDA_CHECK(ggml_cuda_set_device(g_main_device)); CUDA_CHECK(ggml_cuda_set_device(g_main_device));
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0])); CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
} }
const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11; const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) { for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0; const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride; const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
@ -7194,6 +7240,27 @@ static void ggml_cuda_op_mul_mat(
} }
} }
for (int64_t id = 0; id < g_device_count; ++id) {
if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
continue;
}
CUDA_CHECK(ggml_cuda_set_device(id));
// free buffers again when done
if (src0_as[id] > 0) {
ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
}
if (src1_asf[id] > 0) {
ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
}
if (src1_asq[id] > 0) {
ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
}
if (dst_as[id] > 0) {
ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
}
}
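A hedged note on the relocated frees (an interpretation, not patch text): the synchronous pool is per device and ggml_cuda_pool_free() resolves the pool through the current device, so each scratch buffer has to be returned while its owning device is selected, e.g.

    CUDA_CHECK(ggml_cuda_set_device(id));            // device that allocated the buffer
    ggml_cuda_pool_free(src0_dd[id], src0_as[id]);   // returns it to that device's pool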
// main device waits for all other devices to be finished // main device waits for all other devices to be finished
if (split && g_device_count > 1) { if (split && g_device_count > 1) {
int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE; int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
@ -7201,6 +7268,9 @@ static void ggml_cuda_op_mul_mat(
CUDA_CHECK(ggml_cuda_set_device(g_main_device)); CUDA_CHECK(ggml_cuda_set_device(g_main_device));
for (int64_t id = 0; id < g_device_count; ++id) { for (int64_t id = 0; id < g_device_count; ++id) {
if (row_low[id] == row_high[id]) {
continue;
}
for (int64_t is = 0; is < is_max; ++is) { for (int64_t is = 0; is < is_max; ++is) {
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0)); CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
} }
@ -7211,21 +7281,6 @@ static void ggml_cuda_op_mul_mat(
CUDA_CHECK(ggml_cuda_set_device(g_main_device)); CUDA_CHECK(ggml_cuda_set_device(g_main_device));
CUDA_CHECK(cudaDeviceSynchronize()); CUDA_CHECK(cudaDeviceSynchronize());
} }
for (int64_t id = 0; id < g_device_count; ++id) {
if (src0_as[id] > 0) {
ggml_cuda_pool_free_async(src0_dd[id], src0_as[id], id, g_cudaStreams[id][0]);
}
if (src1_asf[id] > 0) {
ggml_cuda_pool_free_async(src1_ddf[id], src1_asf[id], id, g_cudaStreams[id][0]);
}
if (src1_asq[id] > 0) {
ggml_cuda_pool_free_async(src1_ddq[id], src1_asq[id], id, g_cudaStreams[id][0]);
}
if (dst_as[id] > 0) {
ggml_cuda_pool_free_async(dst_dd[id], dst_as[id], id, g_cudaStreams[id][0]);
}
}
} }
static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@ -7252,6 +7307,14 @@ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, g
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu); ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
} }
static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
}
static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
}
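A hypothetical wiring sketch for the new wrappers; the dispatch cases are assumed to mirror the existing SILU case at the end of this diff and are not themselves visible in this hunk:

    // under case GGML_OP_UNARY in ggml_cuda_compute_forward:
    case GGML_UNARY_OP_RELU:
        func = ggml_cuda_relu;
        break;
    // and as a regular op:
    case GGML_OP_SQR:
        func = ggml_cuda_sqr;
        break;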
static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm); ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
} }
@ -7261,6 +7324,8 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
} }
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
if (!g_cublas_loaded) return false;
const int64_t ne10 = src1->ne[0]; const int64_t ne10 = src1->ne[0];
const int64_t ne0 = dst->ne[0]; const int64_t ne0 = dst->ne[0];
@ -7412,11 +7477,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
GGML_ASSERT(to_fp16_cuda != nullptr); GGML_ASSERT(to_fp16_cuda != nullptr);
size_t src1_as = 0; size_t src1_as = 0;
half * src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne1 * sizeof(half), &src1_as, id, main_stream); half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream); to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
size_t dst_as = 0; size_t dst_as = 0;
half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &dst_as, id, main_stream); half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
GGML_ASSERT(ne12 % ne02 == 0); GGML_ASSERT(ne12 % ne02 == 0);
GGML_ASSERT(ne13 % ne03 == 0); GGML_ASSERT(ne13 % ne03 == 0);
@ -7470,8 +7535,8 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
size_t ptrs_src_s = 0; size_t ptrs_src_s = 0;
size_t ptrs_dst_s = 0; size_t ptrs_dst_s = 0;
ptrs_src = (const void **) ggml_cuda_pool_malloc_async(2*ne23*sizeof(void *), &ptrs_src_s, id, main_stream); ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
ptrs_dst = ( void **) ggml_cuda_pool_malloc_async(1*ne23*sizeof(void *), &ptrs_dst_s, id, main_stream); ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
dim3 block_dims(ne13, ne12); dim3 block_dims(ne13, ne12);
k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>( k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
@ -7484,6 +7549,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
dst->nb[2], dst->nb[3], dst->nb[2], dst->nb[3],
r2, r3); r2, r3);
CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaGetLastError());
CUBLAS_CHECK( CUBLAS_CHECK(
cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10, ne01, ne11, ne10,
@ -7495,30 +7561,29 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
CUBLAS_GEMM_DEFAULT_TENSOR_OP)); CUBLAS_GEMM_DEFAULT_TENSOR_OP));
if (ptrs_src_s != 0) { if (ptrs_src_s != 0) {
ggml_cuda_pool_free_async(ptrs_src, ptrs_src_s, id, main_stream); ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
} }
if (ptrs_dst_s != 0) { if (ptrs_dst_s != 0) {
ggml_cuda_pool_free_async(ptrs_dst, ptrs_dst_s, id, main_stream); ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
} }
} }
#endif #endif
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream); to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
if (src1_as != 0) {
ggml_cuda_pool_free_async(src1_as_f16, src1_as, id, main_stream); ggml_cuda_pool_free(src1_as_f16, src1_as);
} ggml_cuda_pool_free(dst_f16, dst_as);
if (dst_as != 0) {
ggml_cuda_pool_free_async(dst_f16, dst_as, id, main_stream);
}
} }
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool all_on_device = const bool all_on_device =
(src0->backend == GGML_BACKEND_GPU) && (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
(src1->backend == GGML_BACKEND_GPU) && (src1->backend == GGML_BACKEND_GPU) &&
( dst->backend == GGML_BACKEND_GPU); ( dst->backend == GGML_BACKEND_GPU);
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
int64_t min_compute_capability = INT_MAX; int64_t min_compute_capability = INT_MAX;
for (int64_t id = 0; id < g_device_count; ++id) { for (int64_t id = 0; id < g_device_count; ++id) {
if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
@ -7540,13 +7605,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
// KQ single-batch // KQ single-batch
ggml_cuda_mul_mat_vec_p021(src0, src1, dst); ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
} else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
// KQV single-batch // KQV single-batch
ggml_cuda_mul_mat_vec_nc(src0, src1, dst); ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
} else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) { } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
// KQ + KQV multi-batch // KQ + KQV multi-batch
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst); ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
} else if (src0->type == GGML_TYPE_F32) { } else if (src0->type == GGML_TYPE_F32) {
@ -7667,7 +7732,7 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi); ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
} }
void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col); ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
} }
@ -7782,11 +7847,11 @@ static size_t g_temp_tensor_extra_index = 0;
static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
if (g_temp_tensor_extras == nullptr) { if (g_temp_tensor_extras == nullptr) {
g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_DEFAULT_GRAPH_SIZE]; g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
} }
size_t alloc_index = g_temp_tensor_extra_index; size_t alloc_index = g_temp_tensor_extra_index;
g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_DEFAULT_GRAPH_SIZE; g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index]; ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
memset(extra, 0, sizeof(*extra)); memset(extra, 0, sizeof(*extra));
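Both the array size and the wrap-around modulus in this allocator change together from `GGML_DEFAULT_GRAPH_SIZE` to `GGML_CUDA_MAX_NODES`. A minimal sketch of the round-robin pool they implement, with an assumed constant value and a simplified, placeholder extra struct (not the real definitions):

```c
#include <stddef.h>
#include <string.h>

#define GGML_CUDA_MAX_NODES 8192            // assumed value, for illustration only

typedef struct { void * data_device[16]; } temp_extra;   // simplified stand-in

static temp_extra g_extras[GGML_CUDA_MAX_NODES];
static size_t     g_extra_index = 0;

static temp_extra * alloc_temp_extra(void) {
    temp_extra * extra = &g_extras[g_extra_index];
    // the array size and the modulus must use the same constant, which is why
    // both occurrences change in the hunk above
    g_extra_index = (g_extra_index + 1) % GGML_CUDA_MAX_NODES;
    memset(extra, 0, sizeof(*extra));
    return extra;
}
```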
@ -7953,6 +8018,8 @@ void ggml_cuda_free_scratch() {
} }
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
if (!g_cublas_loaded) return false;

ggml_cuda_func_t func; ggml_cuda_func_t func;
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
@ -7995,6 +8062,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_SILU:
func = ggml_cuda_silu; func = ggml_cuda_silu;
break; break;
case GGML_UNARY_OP_RELU:
func = ggml_cuda_relu;
break;
default: default:
return false; return false;
} break; } break;
@ -8013,6 +8083,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
case GGML_OP_SCALE: case GGML_OP_SCALE:
func = ggml_cuda_scale; func = ggml_cuda_scale;
break; break;
case GGML_OP_SQR:
func = ggml_cuda_sqr;
break;
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
if (!any_on_device) { if (!any_on_device) {
return false; return false;
@ -8105,11 +8178,11 @@ struct ggml_backend_buffer_context_cuda {
ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
if (temp_tensor_extras == nullptr) { if (temp_tensor_extras == nullptr) {
temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_DEFAULT_GRAPH_SIZE]; temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
} }
size_t alloc_index = temp_tensor_extra_index; size_t alloc_index = temp_tensor_extra_index;
temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_DEFAULT_GRAPH_SIZE; temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index]; ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
memset(extra, 0, sizeof(*extra)); memset(extra, 0, sizeof(*extra));


@ -17,7 +17,12 @@ extern "C" {
#define GGML_CUDA_MAX_DEVICES 16 #define GGML_CUDA_MAX_DEVICES 16
// Always succeeds. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
GGML_API void ggml_init_cublas(void); GGML_API void ggml_init_cublas(void);
// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
GGML_API bool ggml_cublas_loaded(void);
GGML_API void * ggml_cuda_host_malloc(size_t size); GGML_API void * ggml_cuda_host_malloc(size_t size);
GGML_API void ggml_cuda_host_free(void * ptr); GGML_API void ggml_cuda_host_free(void * ptr);
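The two comments above define the new bootstrap contract: `ggml_init_cublas()` always returns, and callers query `ggml_cublas_loaded()` to learn whether a CUDA device and cuBLAS handle are actually usable (the same flag guards the early `return false` added to `ggml_cuda_compute_forward` earlier in this diff). A minimal sketch of a caller, assuming only the two declarations shown here; `try_enable_gpu` is a hypothetical helper name:

```c
#include <stdbool.h>
#include <stdio.h>

// Declarations as added in ggml-cuda.h above.
void ggml_init_cublas(void);
bool ggml_cublas_loaded(void);

// Hypothetical helper: decide at runtime whether GPU offload is possible.
static bool try_enable_gpu(void) {
    ggml_init_cublas();             // never fails, even with no CUDA device
    if (!ggml_cublas_loaded()) {    // the real availability check happens here
        fprintf(stderr, "CUDA/cuBLAS not available, staying on CPU\n");
        return false;
    }
    return true;
}
```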


@ -346,9 +346,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
} }
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false"); GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
if (ctx->device.maxTransferRate != 0) { if (ctx->device.maxTransferRate != 0) {
GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0); GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
} else { } else {
GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__); GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__);
} }
@ -541,11 +541,11 @@ bool ggml_metal_add_buffer(
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil]; ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
if (ctx->buffers[ctx->n_buffers].metal == nil) { if (ctx->buffers[ctx->n_buffers].metal == nil) {
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0); GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1e6);
return false; return false;
} }
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0); GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1e6);
++ctx->n_buffers; ++ctx->n_buffers;
} else { } else {
@ -565,11 +565,11 @@ bool ggml_metal_add_buffer(
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil]; ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
if (ctx->buffers[ctx->n_buffers].metal == nil) { if (ctx->buffers[ctx->n_buffers].metal == nil) {
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0); GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1e6);
return false; return false;
} }
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i); GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1e6, i);
if (i + size_step < size) { if (i + size_step < size) {
GGML_METAL_LOG_INFO("\n"); GGML_METAL_LOG_INFO("\n");
} }
@ -580,8 +580,8 @@ bool ggml_metal_add_buffer(
#if TARGET_OS_OSX #if TARGET_OS_OSX
GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)", GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
ctx->device.currentAllocatedSize / 1024.0 / 1024.0, ctx->device.currentAllocatedSize / 1e6,
ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); ctx->device.recommendedMaxWorkingSetSize / 1e6);
if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) { if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__); GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
@ -589,7 +589,7 @@ bool ggml_metal_add_buffer(
GGML_METAL_LOG_INFO("\n"); GGML_METAL_LOG_INFO("\n");
} }
#else #else
GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0); GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1e6);
#endif #endif
} }
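The Metal log lines above switch from a binary divisor (`1024.0 / 1024.0`, i.e. MiB) to a decimal one (`1e6`, i.e. MB), in line with the data-size-unit fix in this release. A small sketch of what that means for the printed numbers, given the same byte count:

```c
#include <stdio.h>

int main(void) {
    const double bytes = 536870912.0;               // 512 * 1024 * 1024 bytes
    printf("%8.2f MiB\n", bytes / 1024.0 / 1024.0); // 512.00 (old logging)
    printf("%8.2f MB\n",  bytes / 1e6);             // 536.87 (new logging)
    return 0;
}
```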


@ -1368,7 +1368,12 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
float max = x[0]; float max = x[0];
float sum_w = weights[0]; float sum_w = weights[0];
float sum_x = sum_w * x[0]; float sum_x = sum_w * x[0];
#ifdef HAVE_BUGGY_APPLE_LINKER
// use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
for (volatile int i = 1; i < n; ++i) {
#else
for (int i = 1; i < n; ++i) { for (int i = 1; i < n; ++i) {
#endif
if (x[i] < min) min = x[i]; if (x[i] < min) min = x[i];
if (x[i] > max) max = x[i]; if (x[i] > max) max = x[i];
float w = weights[i]; float w = weights[i];

ggml.c

@ -5024,8 +5024,13 @@ struct ggml_tensor * ggml_rope_back(
int n_dims, int n_dims,
int mode, int mode,
int n_ctx, int n_ctx,
int n_orig_ctx,
float freq_base, float freq_base,
float freq_scale, float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow,
float xpos_base, float xpos_base,
bool xpos_down) { bool xpos_down) {
GGML_ASSERT(ggml_is_vector(b)); GGML_ASSERT(ggml_is_vector(b));
@ -5042,11 +5047,15 @@ struct ggml_tensor * ggml_rope_back(
struct ggml_tensor * result = ggml_dup_tensor(ctx, a); struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx }; int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
memcpy(params + 4, &freq_base, sizeof(float)); memcpy(params + 5, &freq_base, sizeof(float));
memcpy(params + 5, &freq_scale, sizeof(float)); memcpy(params + 6, &freq_scale, sizeof(float));
memcpy(params + 6, &xpos_base, sizeof(float)); memcpy(params + 7, &ext_factor, sizeof(float));
memcpy(params + 7, &xpos_down, sizeof(bool)); memcpy(params + 8, &attn_factor, sizeof(float));
memcpy(params + 9, &beta_fast, sizeof(float));
memcpy(params + 10, &beta_slow, sizeof(float));
memcpy(params + 11, &xpos_base, sizeof(float));
memcpy(params + 12, &xpos_down, sizeof(bool));
ggml_set_op_params(result, params, sizeof(params)); ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_ROPE_BACK; result->op = GGML_OP_ROPE_BACK;
@ -9376,7 +9385,6 @@ static bool ggml_compute_forward_mul_mat_use_blas(
} }
#endif #endif
static void ggml_compute_forward_mul_mat( static void ggml_compute_forward_mul_mat(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
@ -10946,7 +10954,8 @@ static void ggml_compute_forward_rope_f32(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
const struct ggml_tensor * src1, const struct ggml_tensor * src1,
struct ggml_tensor * dst) { struct ggml_tensor * dst,
const bool forward) {
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return; return;
} }
@ -11005,6 +11014,11 @@ static void ggml_compute_forward_rope_f32(
const bool is_neox = mode & 2; const bool is_neox = mode & 2;
const bool is_glm = mode & 4; const bool is_glm = mode & 4;
// backward process uses inverse rotation by cos and sin.
// cos and sin build a rotation matrix, where the inverse is the transpose.
// this essentially just switches the sign of sin.
const float sin_sign = forward ? 1.0f : -1.0f;
const int32_t * pos = (const int32_t *) src1->data; const int32_t * pos = (const int32_t *) src1->data;
for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i3 = 0; i3 < ne3; i3++) {
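The comment above is the observation that lets the backward pass reuse the forward RoPE kernels. Restated as a short derivation (not part of the source, just the math behind `sin_sign`):

```latex
R(\theta) =
\begin{pmatrix} \cos\theta & -\sin\theta \\ \sin\theta & \cos\theta \end{pmatrix},
\qquad
R(\theta)^{-1} = R(\theta)^{\top} =
\begin{pmatrix} \cos\theta & \sin\theta \\ -\sin\theta & \cos\theta \end{pmatrix}
= R(-\theta)
```

Inverting each 2-D rotation therefore only flips the sign of the sine term, which is exactly what `sin_sign = forward ? 1.0f : -1.0f` applies in both the f32 and f16 kernels below.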
@ -11021,9 +11035,9 @@ static void ggml_compute_forward_rope_f32(
float block_theta = MAX(p - (n_ctx - 2), 0); float block_theta = MAX(p - (n_ctx - 2), 0);
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
const float cos_theta = cosf(theta_base); const float cos_theta = cosf(theta_base);
const float sin_theta = sinf(theta_base); const float sin_theta = sinf(theta_base) * sin_sign;
const float cos_block_theta = cosf(block_theta); const float cos_block_theta = cosf(block_theta);
const float sin_block_theta = sinf(block_theta); const float sin_block_theta = sinf(block_theta) * sin_sign;
theta_base *= theta_scale; theta_base *= theta_scale;
block_theta *= theta_scale; block_theta *= theta_scale;
@ -11047,6 +11061,7 @@ static void ggml_compute_forward_rope_f32(
rope_yarn( rope_yarn(
theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
); );
sin_theta *= sin_sign;
// zeta scaling for xPos only: // zeta scaling for xPos only:
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f; float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
@ -11077,6 +11092,7 @@ static void ggml_compute_forward_rope_f32(
theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
&cos_theta, &sin_theta &cos_theta, &sin_theta
); );
sin_theta *= sin_sign;
theta_base *= theta_scale; theta_base *= theta_scale;
@ -11102,7 +11118,8 @@ static void ggml_compute_forward_rope_f16(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
const struct ggml_tensor * src1, const struct ggml_tensor * src1,
struct ggml_tensor * dst) { struct ggml_tensor * dst,
const bool forward) {
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return; return;
} }
@ -11154,6 +11171,11 @@ static void ggml_compute_forward_rope_f16(
const bool is_neox = mode & 2; const bool is_neox = mode & 2;
const bool is_glm = mode & 4; const bool is_glm = mode & 4;
// backward process uses inverse rotation by cos and sin.
// cos and sin build a rotation matrix, where the inverse is the transpose.
// this essentially just switches the sign of sin.
const float sin_sign = forward ? 1.0f : -1.0f;
const int32_t * pos = (const int32_t *) src1->data; const int32_t * pos = (const int32_t *) src1->data;
for (int64_t i3 = 0; i3 < ne3; i3++) { for (int64_t i3 = 0; i3 < ne3; i3++) {
@ -11170,9 +11192,9 @@ static void ggml_compute_forward_rope_f16(
float block_theta = MAX(p - (n_ctx - 2), 0); float block_theta = MAX(p - (n_ctx - 2), 0);
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) { for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
const float cos_theta = cosf(theta_base); const float cos_theta = cosf(theta_base);
const float sin_theta = sinf(theta_base); const float sin_theta = sinf(theta_base) * sin_sign;
const float cos_block_theta = cosf(block_theta); const float cos_block_theta = cosf(block_theta);
const float sin_block_theta = sinf(block_theta); const float sin_block_theta = sinf(block_theta) * sin_sign;
theta_base *= theta_scale; theta_base *= theta_scale;
block_theta *= theta_scale; block_theta *= theta_scale;
@ -11196,6 +11218,7 @@ static void ggml_compute_forward_rope_f16(
rope_yarn( rope_yarn(
theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
); );
sin_theta *= sin_sign;
theta_base *= theta_scale; theta_base *= theta_scale;
@ -11222,6 +11245,7 @@ static void ggml_compute_forward_rope_f16(
theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
&cos_theta, &sin_theta &cos_theta, &sin_theta
); );
sin_theta *= sin_sign;
theta_base *= theta_scale; theta_base *= theta_scale;
@ -11251,11 +11275,11 @@ static void ggml_compute_forward_rope(
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_F16: case GGML_TYPE_F16:
{ {
ggml_compute_forward_rope_f16(params, src0, src1, dst); ggml_compute_forward_rope_f16(params, src0, src1, dst, true);
} break; } break;
case GGML_TYPE_F32: case GGML_TYPE_F32:
{ {
ggml_compute_forward_rope_f32(params, src0, src1, dst); ggml_compute_forward_rope_f32(params, src0, src1, dst, true);
} break; } break;
default: default:
{ {
@ -11266,216 +11290,6 @@ static void ggml_compute_forward_rope(
// ggml_compute_forward_rope_back // ggml_compute_forward_rope_back
static void ggml_compute_forward_rope_back_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
// y = rope(x, src1)
// dx = rope_back(dy, src1)
// src0 is dy, src1 contains options
float freq_base;
float freq_scale;
// these two only relevant for xPos RoPE:
float xpos_base;
bool xpos_down;
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2];
const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
GGML_TENSOR_UNARY_OP_LOCALS
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
//printf("n_past = %d, ne2 = %d\n", n_past, ne2);
assert(nb0 == sizeof(float));
const int ith = params->ith;
const int nth = params->nth;
const int nr = ggml_nrows(dst);
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
// row index used to determine which thread to use
int ir = 0;
const float theta_scale = powf(freq_base, -2.0f/n_dims);
const bool is_neox = mode & 2;
const int32_t * pos = (const int32_t *) src1->data;
for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = 0; i2 < ne2; i2++) {
const int64_t p = pos[i2];
for (int64_t i1 = 0; i1 < ne1; i1++) {
if (ir++ < ir0) continue;
if (ir > ir1) break;
float theta_base = freq_scale * (float)p;
if (!is_neox) {
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float cos_theta = cosf(theta_base);
const float sin_theta = sinf(theta_base);
// zeta scaling for xPos only:
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
if (xpos_down) zeta = 1.0f / zeta;
theta_base *= theta_scale;
const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float dy0 = dy[0];
const float dy1 = dy[1];
dx[0] = dy0*cos_theta*zeta + dy1*sin_theta*zeta;
dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
}
} else {
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
for (int64_t ic = 0; ic < n_dims; ic += 2) {
const float cos_theta = cosf(theta_base);
const float sin_theta = sinf(theta_base);
theta_base *= theta_scale;
const int64_t i0 = ib*n_dims + ic/2;
const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float dy0 = dy[0];
const float dy1 = dy[n_dims/2];
dx[0] = dy0*cos_theta + dy1*sin_theta;
dx[n_dims/2] = - dy0*sin_theta + dy1*cos_theta;
}
}
}
}
}
}
}
static void ggml_compute_forward_rope_back_f16(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
return;
}
// y = rope(x, src1)
// dx = rope_back(dy, src1)
// src0 is dy, src1 contains options
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_dims = ((int32_t *) dst->op_params)[1];
const int mode = ((int32_t *) dst->op_params)[2];
GGML_TENSOR_UNARY_OP_LOCALS
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
//printf("n_past = %d, ne2 = %d\n", n_past, ne2);
assert(nb0 == sizeof(ggml_fp16_t));
const int ith = params->ith;
const int nth = params->nth;
const int nr = ggml_nrows(dst);
// rows per thread
const int dr = (nr + nth - 1)/nth;
// row range for this thread
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
// row index used to determine which thread to use
int ir = 0;
const float theta_scale = powf(10000.0, -2.0f/n_dims);
const bool is_neox = mode & 2;
const int32_t * pos = (const int32_t *) src1->data;
for (int64_t i3 = 0; i3 < ne3; i3++) {
for (int64_t i2 = 0; i2 < ne2; i2++) {
const int64_t p = pos[i2];
for (int64_t i1 = 0; i1 < ne1; i1++) {
if (ir++ < ir0) continue;
if (ir > ir1) break;
float theta_base = (float)p;
if (!is_neox) {
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
const float cos_theta = cosf(theta_base);
const float sin_theta = sinf(theta_base);
theta_base *= theta_scale;
const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float dy0 = GGML_FP16_TO_FP32(dy[0]);
const float dy1 = GGML_FP16_TO_FP32(dy[1]);
dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
dx[1] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
}
} else {
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
for (int64_t ic = 0; ic < n_dims; ic += 2) {
const float cos_theta = cosf(theta_base);
const float sin_theta = sinf(theta_base);
theta_base *= theta_scale;
const int64_t i0 = ib*n_dims + ic/2;
const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
const float dy0 = GGML_FP16_TO_FP32(dy[0]);
const float dy1 = GGML_FP16_TO_FP32(dy[n_dims/2]);
dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
dx[n_dims/2] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
}
}
}
}
}
}
}
static void ggml_compute_forward_rope_back( static void ggml_compute_forward_rope_back(
const struct ggml_compute_params * params, const struct ggml_compute_params * params,
const struct ggml_tensor * src0, const struct ggml_tensor * src0,
@ -11484,11 +11298,11 @@ static void ggml_compute_forward_rope_back(
switch (src0->type) { switch (src0->type) {
case GGML_TYPE_F16: case GGML_TYPE_F16:
{ {
ggml_compute_forward_rope_back_f16(params, src0, src1, dst); ggml_compute_forward_rope_f16(params, src0, src1, dst, false);
} break; } break;
case GGML_TYPE_F32: case GGML_TYPE_F32:
{ {
ggml_compute_forward_rope_back_f32(params, src0, src1, dst); ggml_compute_forward_rope_f32(params, src0, src1, dst, false);
} break; } break;
default: default:
{ {
@ -14923,17 +14737,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
// necessary for llama // necessary for llama
if (src0->grad) { if (src0->grad) {
//const int n_past = ((int32_t *) tensor->op_params)[0]; //const int n_past = ((int32_t *) tensor->op_params)[0];
const int n_dims = ((int32_t *) tensor->op_params)[1]; const int n_dims = ((int32_t *) tensor->op_params)[1];
const int mode = ((int32_t *) tensor->op_params)[2]; const int mode = ((int32_t *) tensor->op_params)[2];
const int n_ctx = ((int32_t *) tensor->op_params)[3]; const int n_ctx = ((int32_t *) tensor->op_params)[3];
float freq_base; const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
float freq_scale; float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
float xpos_base;
bool xpos_down; memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float)); memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float)); memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float));
memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float)); memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool)); memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
src0->grad = ggml_add_or_set(ctx, src0->grad = ggml_add_or_set(ctx,
src0->grad, src0->grad,
@ -14943,8 +14760,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
n_dims, n_dims,
mode, mode,
n_ctx, n_ctx,
n_orig_ctx,
freq_base, freq_base,
freq_scale, freq_scale,
ext_factor,
attn_factor,
beta_fast,
beta_slow,
xpos_base, xpos_base,
xpos_down), xpos_down),
zero_table); zero_table);
@ -14954,17 +14776,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
{ {
if (src0->grad) { if (src0->grad) {
//const int n_past = ((int32_t *) tensor->op_params)[0]; //const int n_past = ((int32_t *) tensor->op_params)[0];
const int n_dims = ((int32_t *) tensor->op_params)[1]; const int n_dims = ((int32_t *) tensor->op_params)[1];
const int mode = ((int32_t *) tensor->op_params)[2]; const int mode = ((int32_t *) tensor->op_params)[2];
const int n_ctx = ((int32_t *) tensor->op_params)[3]; const int n_ctx = ((int32_t *) tensor->op_params)[3];
float freq_base; const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
float freq_scale; float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
float xpos_base;
bool xpos_down; memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float)); memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float)); memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float));
memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float)); memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool)); memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
src0->grad = ggml_add_or_set(ctx, src0->grad = ggml_add_or_set(ctx,
src0->grad, src0->grad,
@ -14973,14 +14798,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
src1, src1,
n_dims, n_dims,
mode, mode,
0,
n_ctx, n_ctx,
n_orig_ctx,
freq_base, freq_base,
freq_scale, freq_scale,
0.0f, ext_factor,
1.0f, attn_factor,
0.0f, beta_fast,
0.0f, beta_slow,
xpos_base, xpos_base,
xpos_down, xpos_down,
false), false),
@ -18248,7 +18073,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
{ {
ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv)); ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) { for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
struct gguf_kv * kv = &ctx->kv[i]; struct gguf_kv * kv = &ctx->kv[i];
//fprintf(stderr, "%s: reading kv %d\n", __func__, i); //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@ -18295,7 +18120,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
case GGUF_TYPE_STRING: case GGUF_TYPE_STRING:
{ {
kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str)); kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
for (uint32_t j = 0; j < kv->value.arr.n; ++j) { for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset); ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
} }
} break; } break;
@ -18323,7 +18148,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
{ {
ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i]; struct gguf_tensor_info * info = &ctx->infos[i];
for (int j = 0; j < GGML_MAX_DIMS; ++j) { for (int j = 0; j < GGML_MAX_DIMS; ++j) {
@ -18370,7 +18195,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
// compute the total size of the data section, taking into account the alignment // compute the total size of the data section, taking into account the alignment
{ {
ctx->size = 0; ctx->size = 0;
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
struct gguf_tensor_info * info = &ctx->infos[i]; struct gguf_tensor_info * info = &ctx->infos[i];
const int64_t ne = const int64_t ne =
@ -18439,7 +18264,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
ggml_set_no_alloc(ctx_data, true); ggml_set_no_alloc(ctx_data, true);
// create the tensors // create the tensors
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) { for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
const int64_t ne[GGML_MAX_DIMS] = { const int64_t ne[GGML_MAX_DIMS] = {
ctx->infos[i].ne[0], ctx->infos[i].ne[0],
ctx->infos[i].ne[1], ctx->infos[i].ne[1],

ggml.h

@ -1371,8 +1371,13 @@ extern "C" {
int n_dims, int n_dims,
int mode, int mode,
int n_ctx, int n_ctx,
int n_orig_ctx,
float freq_base, float freq_base,
float freq_scale, float freq_scale,
float ext_factor,
float attn_factor,
float beta_fast,
float beta_slow,
float xpos_base, float xpos_base,
bool xpos_down); bool xpos_down);
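Assuming the declaration above is the extended `ggml_rope_back()` (its parameter list matches the ggml.c change earlier in this diff), a call site now passes the full YaRN parameter set. A hedged sketch with placeholder values; `dy` is the incoming gradient and `pos` the position tensor:

```c
#include "ggml.h"   // for ggml_rope_back and struct ggml_tensor

// Hedged sketch: dx is the gradient w.r.t. the RoPE input, given dy, the
// gradient w.r.t. its output. All numeric values are placeholders and would
// normally mirror the matching forward rope call.
static struct ggml_tensor * rope_grad(struct ggml_context * ctx,
                                      struct ggml_tensor * dy,
                                      struct ggml_tensor * pos) {
    return ggml_rope_back(
        ctx, dy, pos,
        /*n_dims*/      64,
        /*mode*/        0,
        /*n_ctx*/       0,
        /*n_orig_ctx*/  0,
        /*freq_base*/   10000.0f,
        /*freq_scale*/  1.0f,
        /*ext_factor*/  0.0f,
        /*attn_factor*/ 1.0f,
        /*beta_fast*/   32.0f,
        /*beta_slow*/   1.0f,
        /*xpos_base*/   0.0f,
        /*xpos_down*/   false);
}
```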


@ -39,19 +39,19 @@ https://huggingface.co/ggerganov/whisper.cpp/tree/main
## Available models ## Available models
| Model | Disk | Mem | SHA | | Model | Disk | SHA |
| --- | --- | --- | --- | | --- | --- | --- |
| tiny | 75 MB | ~390 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` | | tiny | 75 MiB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
| tiny.en | 75 MB | ~390 MB | `c78c86eb1a8faa21b369bcd33207cc90d64ae9df` | | tiny.en | 75 MiB | `c78c86eb1a8faa21b369bcd33207cc90d64ae9df` |
| base | 142 MB | ~500 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` | | base | 142 MiB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
| base.en | 142 MB | ~500 MB | `137c40403d78fd54d454da0f9bd998f78703390c` | | base.en | 142 MiB | `137c40403d78fd54d454da0f9bd998f78703390c` |
| small | 466 MB | ~1.0 GB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` | | small | 466 MiB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
| small.en | 466 MB | ~1.0 GB | `db8a495a91d927739e50b3fc1cc4c6b8f6c2d022` | | small.en | 466 MiB | `db8a495a91d927739e50b3fc1cc4c6b8f6c2d022` |
| medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` | | medium | 1.5 GiB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
| medium.en | 1.5 GB | ~2.6 GB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` | | medium.en | 1.5 GiB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` |
| large-v1 | 2.9 GB | ~4.7 GB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` | | large-v1 | 2.9 GiB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
| large-v2 | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` | | large-v2 | 2.9 GiB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
| large | 2.9 GB | ~4.7 GB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` | | large-v3 | 2.9 GiB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
## Model files for testing purposes ## Model files for testing purposes
@ -76,3 +76,27 @@ git clone https://huggingface.co/openai/whisper-medium
# convert the model to ggml # convert the model to ggml
python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-medium/ ./whisper . python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-medium/ ./whisper .
``` ```
## Distilled models
Initial support for https://huggingface.co/distil-whisper is available.
Currently, the chunk-based transcription strategy is not implemented, so transcription quality can be sub-optimal when using the distilled models with `whisper.cpp`.
```bash
# clone OpenAI whisper and whisper.cpp
git clone https://github.com/openai/whisper
git clone https://github.com/ggerganov/whisper.cpp
# get the models
cd whisper.cpp/models
git clone https://huggingface.co/distil-whisper/distil-medium.en
git clone https://huggingface.co/distil-whisper/distil-large-v2
# convert to ggml
python3 ./convert-h5-to-ggml.py ./distil-medium.en/ ../../whisper .
mv ggml-model.bin ggml-medium.en-distil.bin
python3 ./convert-h5-to-ggml.py ./distil-large-v2/ ../../whisper .
mv ggml-model.bin ggml-large-v2-distil.bin
```


@ -78,14 +78,14 @@ def convert_hf_whisper(hf_model_name_or_path: str, whisper_state_path: str):
# Ported from models/convert-whisper-to-coreml.py # Ported from models/convert-whisper-to-coreml.py
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True) parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2, large-v3)", required=True)
parser.add_argument("--model-path", type=str, help="path to the model (e.g. if published on HuggingFace: Oblivion208/whisper-tiny-cantonese)", required=True) parser.add_argument("--model-path", type=str, help="path to the model (e.g. if published on HuggingFace: Oblivion208/whisper-tiny-cantonese)", required=True)
parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False) parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False) parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False) parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
args = parser.parse_args() args = parser.parse_args()
if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]: if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large-v1", "large-v2", "large-v3"]:
raise ValueError("Invalid model name") raise ValueError("Invalid model name")
pt_target_path = f"models/hf-{args.model_name}.pt" pt_target_path = f"models/hf-{args.model_name}.pt"


@ -296,13 +296,13 @@ def convert_decoder(hparams, model, quantize=False):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True) parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2, large-v3)", required=True)
parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False) parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False) parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False) parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
args = parser.parse_args() args = parser.parse_args()
if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "small.en-tdrz", "medium", "medium.en", "large", "large-v1", "large-v2"]: if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "small.en-tdrz", "medium", "medium.en", "large-v1", "large-v2", "large-v3"]:
raise ValueError("Invalid model name") raise ValueError("Invalid model name")
whisper = load_model(args.model).cpu() whisper = load_model(args.model).cpu()


@ -38,10 +38,10 @@ def convert_encoder(hparams, encoder, mname):
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True) parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2, large-v3)", required=True)
args = parser.parse_args() args = parser.parse_args()
if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]: if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large-v1", "large-v2", "large-v3"]:
raise ValueError("Invalid model name") raise ValueError("Invalid model name")
whisper = load_model(args.model).cpu() whisper = load_model(args.model).cpu()


@ -19,7 +19,7 @@ function get_script_path() {
models_path="$(get_script_path)" models_path="$(get_script_path)"
# Whisper models # Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" ) models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )
# list available models # list available models
function list_models { function list_models {


@ -8,7 +8,7 @@ popd
set argc=0 set argc=0
for %%x in (%*) do set /A argc+=1 for %%x in (%*) do set /A argc+=1
set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3
if %argc% neq 1 ( if %argc% neq 1 (
echo. echo.


@ -42,7 +42,7 @@ models=(
"medium.en-q5_0" "medium.en-q5_0"
"large-v1" "large-v1"
"large-v2" "large-v2"
"large" "large-v3"
"large-q5_0" "large-q5_0"
) )


@ -19,7 +19,7 @@
cd `dirname $0` cd `dirname $0`
# Whisper models # Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" ) models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )
# list available models # list available models
function list_models { function list_models {

File diff suppressed because it is too large


@ -50,7 +50,9 @@ extern "C" {
// //
// ... // ...
// //
// struct whisper_context * ctx = whisper_init_from_file("/path/to/ggml-base.en.bin"); // whisper_context_params cparams = whisper_context_default_params();
//
// struct whisper_context * ctx = whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
// //
// if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) { // if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
// fprintf(stderr, "failed to process audio\n"); // fprintf(stderr, "failed to process audio\n");
@ -78,7 +80,9 @@ extern "C" {
struct whisper_state; struct whisper_state;
struct whisper_full_params; struct whisper_full_params;
typedef int whisper_token; typedef int32_t whisper_pos;
typedef int32_t whisper_token;
typedef int32_t whisper_seq_id;
struct whisper_context_params { struct whisper_context_params {
bool use_gpu; bool use_gpu;
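The header comment now demonstrates the `_with_params` initializer. A slightly fuller sketch of the same flow, assuming the public v1.5.0 API names (`whisper_full_default_params`, `whisper_full_n_segments`, etc.) and a `pcmf32` buffer of 16 kHz mono float samples:

```c
#include <stdio.h>
#include "whisper.h"

// Hedged sketch of the init -> transcribe -> print -> free flow from the
// header comment above; the model path and buffer are caller-provided.
int transcribe(const float * pcmf32, int n_samples) {
    struct whisper_context_params cparams = whisper_context_default_params();
    cparams.use_gpu = true;   // new field shown above

    struct whisper_context * ctx =
        whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
    if (ctx == NULL) {
        return 1;
    }

    struct whisper_full_params wparams =
        whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    if (whisper_full(ctx, wparams, pcmf32, n_samples) != 0) {
        fprintf(stderr, "failed to process audio\n");
        whisper_free(ctx);
        return 1;
    }

    const int n_segments = whisper_full_n_segments(ctx);
    for (int i = 0; i < n_segments; ++i) {
        printf("%s", whisper_full_get_segment_text(ctx, i));
    }

    whisper_free(ctx);
    return 0;
}
```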