mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-07-04 08:20:57 +02:00
Compare commits
23 Commits
batched
...
bench-memc
Author | SHA1 | Date | |
---|---|---|---|
ee2971bf6a | |||
eff3570f78 | |||
fa19bc4195 | |||
a01b2e0971 | |||
8159a9ab99 | |||
7516d9c16d | |||
46cc26d1b9 | |||
f784f9fa12 | |||
ca23f8ee6d | |||
e2f0eba2d4 | |||
d4353e48f7 | |||
bebf0da983 | |||
848e54f3ad | |||
7883d1cae4 | |||
ccc85b4ff8 | |||
c7606b47df | |||
d38af151a1 | |||
94267df08e | |||
8713c67133 | |||
57a60639bb | |||
bfbaa4dce5 | |||
1d79e78402 | |||
b6c5f49b78 |
7
.github/workflows/build.yml
vendored
7
.github/workflows/build.yml
vendored
@ -320,6 +320,13 @@ jobs:
|
|||||||
cd ./build
|
cd ./build
|
||||||
msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
|
msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
|
||||||
|
|
||||||
|
- name: Copy CUDA DLLs
|
||||||
|
run: >
|
||||||
|
Copy-Item -PassThru
|
||||||
|
-Path "${{ steps.cuda-toolkit.outputs.CUDA_PATH }}/bin/*.dll"
|
||||||
|
-Include cudart64_*,cublas64_*,cublasLt64_*
|
||||||
|
-Destination build/bin/${{ matrix.build }}
|
||||||
|
|
||||||
- name: Copy SDL2.dll
|
- name: Copy SDL2.dll
|
||||||
if: matrix.sdl2 == 'ON'
|
if: matrix.sdl2 == 'ON'
|
||||||
run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
|
run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
|
||||||
|
1
.gitignore
vendored
1
.gitignore
vendored
@ -31,6 +31,7 @@ build-sanitize-thread/
|
|||||||
/talk-llama
|
/talk-llama
|
||||||
/bench
|
/bench
|
||||||
/quantize
|
/quantize
|
||||||
|
/server
|
||||||
/lsp
|
/lsp
|
||||||
|
|
||||||
arm_neon.h
|
arm_neon.h
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
cmake_minimum_required (VERSION 3.5)
|
cmake_minimum_required (VERSION 3.5)
|
||||||
|
|
||||||
project(whisper.cpp VERSION 1.4.3)
|
project(whisper.cpp VERSION 1.5.0)
|
||||||
|
|
||||||
# Add path to modules
|
# Add path to modules
|
||||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
|
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
|
||||||
|
11
Makefile
11
Makefile
@ -1,4 +1,4 @@
|
|||||||
default: main bench quantize
|
default: main bench quantize server
|
||||||
|
|
||||||
ifndef UNAME_S
|
ifndef UNAME_S
|
||||||
UNAME_S := $(shell uname -s)
|
UNAME_S := $(shell uname -s)
|
||||||
@ -338,7 +338,7 @@ libwhisper.so: $(WHISPER_OBJ)
|
|||||||
$(CXX) $(CXXFLAGS) -shared -o libwhisper.so $(WHISPER_OBJ) $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) -shared -o libwhisper.so $(WHISPER_OBJ) $(LDFLAGS)
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -f *.o main stream command talk talk-llama bench quantize lsp libwhisper.a libwhisper.so
|
rm -f *.o main stream command talk talk-llama bench quantize server lsp libwhisper.a libwhisper.so
|
||||||
|
|
||||||
#
|
#
|
||||||
# Examples
|
# Examples
|
||||||
@ -359,6 +359,9 @@ bench: examples/bench/bench.cpp $(WHISPER_OBJ)
|
|||||||
quantize: examples/quantize/quantize.cpp $(WHISPER_OBJ) $(SRC_COMMON)
|
quantize: examples/quantize/quantize.cpp $(WHISPER_OBJ) $(SRC_COMMON)
|
||||||
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o quantize $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o quantize $(LDFLAGS)
|
||||||
|
|
||||||
|
server: examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ)
|
||||||
|
$(CXX) $(CXXFLAGS) examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o server $(LDFLAGS)
|
||||||
|
|
||||||
stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
|
stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
|
||||||
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
|
||||||
|
|
||||||
@ -418,9 +421,9 @@ samples:
|
|||||||
.PHONY: medium
|
.PHONY: medium
|
||||||
.PHONY: large-v1
|
.PHONY: large-v1
|
||||||
.PHONY: large-v2
|
.PHONY: large-v2
|
||||||
.PHONY: large
|
.PHONY: large-v3
|
||||||
|
|
||||||
tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large: main
|
tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3: main
|
||||||
bash ./models/download-ggml-model.sh $@
|
bash ./models/download-ggml-model.sh $@
|
||||||
@echo ""
|
@echo ""
|
||||||
@echo "==============================================="
|
@echo "==============================================="
|
||||||
|
18
README.md
18
README.md
@ -6,7 +6,7 @@
|
|||||||
[](https://opensource.org/licenses/MIT)
|
[](https://opensource.org/licenses/MIT)
|
||||||
[](https://www.npmjs.com/package/whisper.cpp/)
|
[](https://www.npmjs.com/package/whisper.cpp/)
|
||||||
|
|
||||||
Beta: [v1.4.3](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.4.3) / Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
|
Stable: [v1.5.0](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.5.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
|
||||||
|
|
||||||
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
|
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
|
||||||
|
|
||||||
@ -231,18 +231,18 @@ make medium.en
|
|||||||
make medium
|
make medium
|
||||||
make large-v1
|
make large-v1
|
||||||
make large-v2
|
make large-v2
|
||||||
make large
|
make large-v3
|
||||||
```
|
```
|
||||||
|
|
||||||
## Memory usage
|
## Memory usage
|
||||||
|
|
||||||
| Model | Disk | Mem | SHA |
|
| Model | Disk | Mem |
|
||||||
| --- | --- | --- | --- |
|
| --- | --- | --- |
|
||||||
| tiny | 75 MB | ~125 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
|
| tiny | 75 MiB | ~273 MB |
|
||||||
| base | 142 MB | ~210 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
|
| base | 142 MiB | ~388 MB |
|
||||||
| small | 466 MB | ~600 MB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
|
| small | 466 MiB | ~852 MB |
|
||||||
| medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
|
| medium | 1.5 GiB | ~2.1 GB |
|
||||||
| large | 2.9 GB | ~3.3 GB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
|
| large | 2.9 GiB | ~3.9 GB |
|
||||||
|
|
||||||
## Quantization
|
## Quantization
|
||||||
|
|
||||||
|
@ -24,7 +24,7 @@ const (
|
|||||||
|
|
||||||
var (
|
var (
|
||||||
// The models which will be downloaded, if no model is specified as an argument
|
// The models which will be downloaded, if no model is specified as an argument
|
||||||
modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large"}
|
modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large-v3"}
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
|
Submodule bindings/ios updated: 9752de4100...f5e5cf24ca
@ -45,7 +45,7 @@ class WhisperCppTest {
|
|||||||
assertEquals(16384, params.n_max_text_ctx);
|
assertEquals(16384, params.n_max_text_ctx);
|
||||||
assertFalse(params.translate);
|
assertFalse(params.translate);
|
||||||
assertEquals(0.01f, params.thold_pt);
|
assertEquals(0.01f, params.thold_pt);
|
||||||
assertEquals(2, params.beam_search.beam_size);
|
assertEquals(5, params.beam_search.beam_size);
|
||||||
assertEquals(-1.0f, params.beam_search.patience);
|
assertEquals(-1.0f, params.beam_search.patience);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -58,7 +58,7 @@ class WhisperCppTest {
|
|||||||
assertEquals(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY.ordinal(), params.strategy);
|
assertEquals(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY.ordinal(), params.strategy);
|
||||||
assertNotEquals(0, params.n_threads);
|
assertNotEquals(0, params.n_threads);
|
||||||
assertEquals(16384, params.n_max_text_ctx);
|
assertEquals(16384, params.n_max_text_ctx);
|
||||||
assertEquals(2, params.greedy.best_of);
|
assertEquals(5, params.greedy.best_of);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "whisper.cpp",
|
"name": "whisper.cpp",
|
||||||
"version": "1.4.3",
|
"version": "1.5.0",
|
||||||
"description": "Whisper speech recognition",
|
"description": "Whisper speech recognition",
|
||||||
"main": "whisper.js",
|
"main": "whisper.js",
|
||||||
"scripts": {
|
"scripts": {
|
||||||
|
File diff suppressed because one or more lines are too long
@ -65,6 +65,7 @@ elseif(CMAKE_JS_VERSION)
|
|||||||
else()
|
else()
|
||||||
add_subdirectory(main)
|
add_subdirectory(main)
|
||||||
add_subdirectory(stream)
|
add_subdirectory(stream)
|
||||||
|
add_subdirectory(server)
|
||||||
add_subdirectory(command)
|
add_subdirectory(command)
|
||||||
add_subdirectory(bench)
|
add_subdirectory(bench)
|
||||||
add_subdirectory(quantize)
|
add_subdirectory(quantize)
|
||||||
|
@ -81,7 +81,7 @@ int whisper_bench_full(const whisper_params & params) {
|
|||||||
}
|
}
|
||||||
// heat encoder
|
// heat encoder
|
||||||
if (int ret = whisper_encode(ctx, 0, params.n_threads) != 0) {
|
if (int ret = whisper_encode(ctx, 0, params.n_threads) != 0) {
|
||||||
fprintf(stderr, "error: failed to encode model: %d\n", ret);
|
fprintf(stderr, "error: failed to encode: %d\n", ret);
|
||||||
return 4;
|
return 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -90,13 +90,13 @@ int whisper_bench_full(const whisper_params & params) {
|
|||||||
|
|
||||||
// prompt heat
|
// prompt heat
|
||||||
if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {
|
if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {
|
||||||
fprintf(stderr, "error: failed to encode model: %d\n", ret);
|
fprintf(stderr, "error: failed to decode: %d\n", ret);
|
||||||
return 4;
|
return 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
// text-generation heat
|
// text-generation heat
|
||||||
if (int ret = whisper_decode(ctx, tokens, 1, 256, params.n_threads) != 0) {
|
if (int ret = whisper_decode(ctx, tokens, 1, 256, params.n_threads) != 0) {
|
||||||
fprintf(stderr, "error: failed to encode model: %d\n", ret);
|
fprintf(stderr, "error: failed to decode: %d\n", ret);
|
||||||
return 4;
|
return 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -104,20 +104,30 @@ int whisper_bench_full(const whisper_params & params) {
|
|||||||
|
|
||||||
// actual run
|
// actual run
|
||||||
if (int ret = whisper_encode(ctx, 0, params.n_threads) != 0) {
|
if (int ret = whisper_encode(ctx, 0, params.n_threads) != 0) {
|
||||||
fprintf(stderr, "error: failed to encode model: %d\n", ret);
|
fprintf(stderr, "error: failed to encode: %d\n", ret);
|
||||||
return 4;
|
return 4;
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < 16; i++) {
|
// text-generation
|
||||||
if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {
|
for (int i = 0; i < 256; i++) {
|
||||||
fprintf(stderr, "error: failed to encode model: %d\n", ret);
|
if (int ret = whisper_decode(ctx, tokens, 1, i, params.n_threads) != 0) {
|
||||||
|
fprintf(stderr, "error: failed to decode: %d\n", ret);
|
||||||
return 4;
|
return 4;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < 256; i++) {
|
// batched decoding
|
||||||
if (int ret = whisper_decode(ctx, tokens, 1, i, params.n_threads) != 0) {
|
for (int i = 0; i < 64; i++) {
|
||||||
fprintf(stderr, "error: failed to encode model: %d\n", ret);
|
if (int ret = whisper_decode(ctx, tokens, 5, 0, params.n_threads) != 0) {
|
||||||
|
fprintf(stderr, "error: failed to decode: %d\n", ret);
|
||||||
|
return 4;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// prompt processing
|
||||||
|
for (int i = 0; i < 16; i++) {
|
||||||
|
if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {
|
||||||
|
fprintf(stderr, "error: failed to decode: %d\n", ret);
|
||||||
return 4;
|
return 4;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -9,6 +9,11 @@ static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
|
|||||||
{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
|
{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
|
||||||
{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
|
{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
|
||||||
{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
|
{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
|
||||||
|
{"q2_k", GGML_FTYPE_MOSTLY_Q2_K},
|
||||||
|
{"q3_k", GGML_FTYPE_MOSTLY_Q3_K},
|
||||||
|
{"q4_k", GGML_FTYPE_MOSTLY_Q4_K},
|
||||||
|
{"q5_k", GGML_FTYPE_MOSTLY_Q5_K},
|
||||||
|
{"q6_k", GGML_FTYPE_MOSTLY_Q6_K},
|
||||||
};
|
};
|
||||||
|
|
||||||
void ggml_print_ftypes(FILE * fp) {
|
void ggml_print_ftypes(FILE * fp) {
|
||||||
@ -48,15 +53,15 @@ bool ggml_common_quantize_0(
|
|||||||
case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
|
case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
|
||||||
case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
|
case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
|
||||||
case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
|
case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
|
||||||
|
case GGML_FTYPE_MOSTLY_Q2_K: qtype = GGML_TYPE_Q2_K; break;
|
||||||
|
case GGML_FTYPE_MOSTLY_Q3_K: qtype = GGML_TYPE_Q3_K; break;
|
||||||
|
case GGML_FTYPE_MOSTLY_Q4_K: qtype = GGML_TYPE_Q4_K; break;
|
||||||
|
case GGML_FTYPE_MOSTLY_Q5_K: qtype = GGML_TYPE_Q5_K; break;
|
||||||
|
case GGML_FTYPE_MOSTLY_Q6_K: qtype = GGML_TYPE_Q6_K; break;
|
||||||
case GGML_FTYPE_UNKNOWN:
|
case GGML_FTYPE_UNKNOWN:
|
||||||
case GGML_FTYPE_ALL_F32:
|
case GGML_FTYPE_ALL_F32:
|
||||||
case GGML_FTYPE_MOSTLY_F16:
|
case GGML_FTYPE_MOSTLY_F16:
|
||||||
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
||||||
case GGML_FTYPE_MOSTLY_Q2_K:
|
|
||||||
case GGML_FTYPE_MOSTLY_Q3_K:
|
|
||||||
case GGML_FTYPE_MOSTLY_Q4_K:
|
|
||||||
case GGML_FTYPE_MOSTLY_Q5_K:
|
|
||||||
case GGML_FTYPE_MOSTLY_Q6_K:
|
|
||||||
{
|
{
|
||||||
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
|
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
|
||||||
return false;
|
return false;
|
||||||
@ -167,24 +172,17 @@ bool ggml_common_quantize_0(
|
|||||||
|
|
||||||
switch ((ggml_type) ttype) {
|
switch ((ggml_type) ttype) {
|
||||||
case GGML_TYPE_Q4_0:
|
case GGML_TYPE_Q4_0:
|
||||||
{
|
|
||||||
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
|
||||||
} break;
|
|
||||||
case GGML_TYPE_Q4_1:
|
case GGML_TYPE_Q4_1:
|
||||||
{
|
|
||||||
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
|
||||||
} break;
|
|
||||||
case GGML_TYPE_Q5_0:
|
case GGML_TYPE_Q5_0:
|
||||||
{
|
|
||||||
cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
|
||||||
} break;
|
|
||||||
case GGML_TYPE_Q5_1:
|
case GGML_TYPE_Q5_1:
|
||||||
{
|
|
||||||
cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
|
||||||
} break;
|
|
||||||
case GGML_TYPE_Q8_0:
|
case GGML_TYPE_Q8_0:
|
||||||
|
case GGML_TYPE_Q2_K:
|
||||||
|
case GGML_TYPE_Q3_K:
|
||||||
|
case GGML_TYPE_Q4_K:
|
||||||
|
case GGML_TYPE_Q5_K:
|
||||||
|
case GGML_TYPE_Q6_K:
|
||||||
{
|
{
|
||||||
cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements, hist_cur.data());
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
@ -192,11 +190,6 @@ bool ggml_common_quantize_0(
|
|||||||
case GGML_TYPE_I16:
|
case GGML_TYPE_I16:
|
||||||
case GGML_TYPE_I32:
|
case GGML_TYPE_I32:
|
||||||
case GGML_TYPE_Q8_1:
|
case GGML_TYPE_Q8_1:
|
||||||
case GGML_TYPE_Q2_K:
|
|
||||||
case GGML_TYPE_Q3_K:
|
|
||||||
case GGML_TYPE_Q4_K:
|
|
||||||
case GGML_TYPE_Q5_K:
|
|
||||||
case GGML_TYPE_Q6_K:
|
|
||||||
case GGML_TYPE_Q8_K:
|
case GGML_TYPE_Q8_K:
|
||||||
case GGML_TYPE_COUNT:
|
case GGML_TYPE_COUNT:
|
||||||
{
|
{
|
||||||
|
@ -139,10 +139,13 @@ void audio_async::callback(uint8_t * stream, int len) {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const size_t n_samples = len / sizeof(float);
|
size_t n_samples = len / sizeof(float);
|
||||||
|
|
||||||
m_audio_new.resize(n_samples);
|
if (n_samples > m_audio.size()) {
|
||||||
memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
|
n_samples = m_audio.size();
|
||||||
|
|
||||||
|
stream += (len - (n_samples * sizeof(float)));
|
||||||
|
}
|
||||||
|
|
||||||
//fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
|
//fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
|
||||||
|
|
||||||
@ -153,7 +156,7 @@ void audio_async::callback(uint8_t * stream, int len) {
|
|||||||
const size_t n0 = m_audio.size() - m_audio_pos;
|
const size_t n0 = m_audio.size() - m_audio_pos;
|
||||||
|
|
||||||
memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
|
memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
|
||||||
memcpy(&m_audio[0], &stream[n0], (n_samples - n0) * sizeof(float));
|
memcpy(&m_audio[0], stream + n0 * sizeof(float), (n_samples - n0) * sizeof(float));
|
||||||
|
|
||||||
m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
|
m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
|
||||||
m_audio_len = m_audio.size();
|
m_audio_len = m_audio.size();
|
||||||
|
@ -41,7 +41,6 @@ private:
|
|||||||
std::mutex m_mutex;
|
std::mutex m_mutex;
|
||||||
|
|
||||||
std::vector<float> m_audio;
|
std::vector<float> m_audio;
|
||||||
std::vector<float> m_audio_new;
|
|
||||||
size_t m_audio_pos = 0;
|
size_t m_audio_pos = 0;
|
||||||
size_t m_audio_len = 0;
|
size_t m_audio_len = 0;
|
||||||
};
|
};
|
||||||
|
@ -48,7 +48,7 @@ if [ -n "$3" ]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# Whisper models
|
# Whisper models
|
||||||
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
|
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )
|
||||||
|
|
||||||
# list available models
|
# list available models
|
||||||
function list_models {
|
function list_models {
|
||||||
|
@ -62,8 +62,8 @@ struct whisper_params {
|
|||||||
int32_t progress_step = 5;
|
int32_t progress_step = 5;
|
||||||
int32_t max_context = -1;
|
int32_t max_context = -1;
|
||||||
int32_t max_len = 0;
|
int32_t max_len = 0;
|
||||||
int32_t best_of = 2;
|
int32_t best_of = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of;
|
||||||
int32_t beam_size = -1;
|
int32_t beam_size = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size;
|
||||||
|
|
||||||
float word_thold = 0.01f;
|
float word_thold = 0.01f;
|
||||||
float entropy_thold = 2.40f;
|
float entropy_thold = 2.40f;
|
||||||
@ -165,8 +165,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
|||||||
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
||||||
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
|
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
|
||||||
else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
|
else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
|
||||||
else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; }
|
else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; }
|
||||||
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
|
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
|
||||||
else {
|
else {
|
||||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||||
whisper_print_usage(argc, argv, params);
|
whisper_print_usage(argc, argv, params);
|
||||||
@ -925,9 +925,9 @@ int main(int argc, char ** argv) {
|
|||||||
if (params.detect_language) {
|
if (params.detect_language) {
|
||||||
params.language = "auto";
|
params.language = "auto";
|
||||||
}
|
}
|
||||||
fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, %stimestamps = %d ...\n",
|
fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, %d beams + best of %d, lang = %s, task = %s, %stimestamps = %d ...\n",
|
||||||
__func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
|
__func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
|
||||||
params.n_threads, params.n_processors,
|
params.n_threads, params.n_processors, params.beam_size, params.best_of,
|
||||||
params.language.c_str(),
|
params.language.c_str(),
|
||||||
params.translate ? "translate" : "transcribe",
|
params.translate ? "translate" : "transcribe",
|
||||||
params.tinydiarize ? "tdrz = 1, " : "",
|
params.tinydiarize ? "tdrz = 1, " : "",
|
||||||
|
6
examples/server/CMakeLists.txt
Normal file
6
examples/server/CMakeLists.txt
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
set(TARGET server)
|
||||||
|
add_executable(${TARGET} server.cpp httplib.h json.hpp)
|
||||||
|
|
||||||
|
include(DefaultTargetOptions)
|
||||||
|
|
||||||
|
target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})
|
59
examples/server/README.md
Normal file
59
examples/server/README.md
Normal file
@ -0,0 +1,59 @@
|
|||||||
|
# whisper.cpp http server
|
||||||
|
|
||||||
|
Simple http server. WAV Files are passed to the inference model via http requests.
|
||||||
|
|
||||||
|
```
|
||||||
|
./server -h
|
||||||
|
|
||||||
|
usage: ./bin/server [options]
|
||||||
|
|
||||||
|
options:
|
||||||
|
-h, --help [default] show this help message and exit
|
||||||
|
-t N, --threads N [4 ] number of threads to use during computation
|
||||||
|
-p N, --processors N [1 ] number of processors to use during computation
|
||||||
|
-ot N, --offset-t N [0 ] time offset in milliseconds
|
||||||
|
-on N, --offset-n N [0 ] segment index offset
|
||||||
|
-d N, --duration N [0 ] duration of audio to process in milliseconds
|
||||||
|
-mc N, --max-context N [-1 ] maximum number of text context tokens to store
|
||||||
|
-ml N, --max-len N [0 ] maximum segment length in characters
|
||||||
|
-sow, --split-on-word [false ] split on word rather than on token
|
||||||
|
-bo N, --best-of N [2 ] number of best candidates to keep
|
||||||
|
-bs N, --beam-size N [-1 ] beam size for beam search
|
||||||
|
-wt N, --word-thold N [0.01 ] word timestamp probability threshold
|
||||||
|
-et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
|
||||||
|
-lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
|
||||||
|
-debug, --debug-mode [false ] enable debug mode (eg. dump log_mel)
|
||||||
|
-tr, --translate [false ] translate from source language to english
|
||||||
|
-di, --diarize [false ] stereo audio diarization
|
||||||
|
-tdrz, --tinydiarize [false ] enable tinydiarize (requires a tdrz model)
|
||||||
|
-nf, --no-fallback [false ] do not use temperature fallback while decoding
|
||||||
|
-ps, --print-special [false ] print special tokens
|
||||||
|
-pc, --print-colors [false ] print colors
|
||||||
|
-pp, --print-progress [false ] print progress
|
||||||
|
-nt, --no-timestamps [false ] do not print timestamps
|
||||||
|
-l LANG, --language LANG [en ] spoken language ('auto' for auto-detect)
|
||||||
|
-dl, --detect-language [false ] exit after automatically detecting language
|
||||||
|
--prompt PROMPT [ ] initial prompt
|
||||||
|
-m FNAME, --model FNAME [models/ggml-base.en.bin] model path
|
||||||
|
-oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference
|
||||||
|
--host HOST, [127.0.0.1] Hostname/ip-adress for the server
|
||||||
|
--port PORT, [8080 ] Port number for the server
|
||||||
|
```
|
||||||
|
|
||||||
|
## request examples
|
||||||
|
|
||||||
|
**/inference**
|
||||||
|
```
|
||||||
|
curl 127.0.0.1:8080/inference \
|
||||||
|
-H "Content-Type: multipart/form-data" \
|
||||||
|
-F file="@<file-path>" \
|
||||||
|
-F temperature="0.2" \
|
||||||
|
-F response-format="json"
|
||||||
|
```
|
||||||
|
|
||||||
|
**/load**
|
||||||
|
```
|
||||||
|
curl 127.0.0.1:8080/load \
|
||||||
|
-H "Content-Type: multipart/form-data" \
|
||||||
|
-F model="<path-to-model-file>"
|
||||||
|
```
|
9262
examples/server/httplib.h
Normal file
9262
examples/server/httplib.h
Normal file
File diff suppressed because it is too large
Load Diff
24596
examples/server/json.hpp
Normal file
24596
examples/server/json.hpp
Normal file
File diff suppressed because it is too large
Load Diff
699
examples/server/server.cpp
Normal file
699
examples/server/server.cpp
Normal file
@ -0,0 +1,699 @@
|
|||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#include "whisper.h"
|
||||||
|
#include "httplib.h"
|
||||||
|
#include "json.hpp"
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
|
#include <fstream>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <string>
|
||||||
|
#include <thread>
|
||||||
|
#include <vector>
|
||||||
|
#include <cstring>
|
||||||
|
|
||||||
|
#if defined(_MSC_VER)
|
||||||
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
|
#endif
|
||||||
|
|
||||||
|
using namespace httplib;
|
||||||
|
using json = nlohmann::json;
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
|
||||||
|
// Lowest is red, middle is yellow, highest is green.
|
||||||
|
const std::vector<std::string> k_colors = {
|
||||||
|
"\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
|
||||||
|
"\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
|
||||||
|
};
|
||||||
|
|
||||||
|
// output formats
|
||||||
|
const std::string json_format = "json";
|
||||||
|
const std::string text_format = "text";
|
||||||
|
const std::string srt_format = "srt";
|
||||||
|
const std::string vjson_format = "verbose_json";
|
||||||
|
const std::string vtt_format = "vtt";
|
||||||
|
|
||||||
|
struct server_params
|
||||||
|
{
|
||||||
|
std::string hostname = "127.0.0.1";
|
||||||
|
std::string public_path = "examples/server/public";
|
||||||
|
|
||||||
|
int32_t port = 8080;
|
||||||
|
int32_t read_timeout = 600;
|
||||||
|
int32_t write_timeout = 600;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct whisper_params {
|
||||||
|
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
||||||
|
int32_t n_processors = 1;
|
||||||
|
int32_t offset_t_ms = 0;
|
||||||
|
int32_t offset_n = 0;
|
||||||
|
int32_t duration_ms = 0;
|
||||||
|
int32_t progress_step = 5;
|
||||||
|
int32_t max_context = -1;
|
||||||
|
int32_t max_len = 0;
|
||||||
|
int32_t best_of = 2;
|
||||||
|
int32_t beam_size = -1;
|
||||||
|
|
||||||
|
float word_thold = 0.01f;
|
||||||
|
float entropy_thold = 2.40f;
|
||||||
|
float logprob_thold = -1.00f;
|
||||||
|
float userdef_temp = 0.20f;
|
||||||
|
|
||||||
|
bool speed_up = false;
|
||||||
|
bool debug_mode = false;
|
||||||
|
bool translate = false;
|
||||||
|
bool detect_language = false;
|
||||||
|
bool diarize = false;
|
||||||
|
bool tinydiarize = false;
|
||||||
|
bool split_on_word = false;
|
||||||
|
bool no_fallback = false;
|
||||||
|
bool print_special = false;
|
||||||
|
bool print_colors = false;
|
||||||
|
bool print_progress = false;
|
||||||
|
bool no_timestamps = false;
|
||||||
|
bool use_gpu = true;
|
||||||
|
|
||||||
|
std::string language = "en";
|
||||||
|
std::string prompt = "";
|
||||||
|
std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
|
||||||
|
std::string model = "models/ggml-base.en.bin";
|
||||||
|
|
||||||
|
std::string response_format = json_format;
|
||||||
|
|
||||||
|
// [TDRZ] speaker turn string
|
||||||
|
std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line
|
||||||
|
|
||||||
|
std::string openvino_encode_device = "CPU";
|
||||||
|
};
|
||||||
|
|
||||||
|
// 500 -> 00:05.000
|
||||||
|
// 6000 -> 01:00.000
|
||||||
|
std::string to_timestamp(int64_t t, bool comma = false) {
|
||||||
|
int64_t msec = t * 10;
|
||||||
|
int64_t hr = msec / (1000 * 60 * 60);
|
||||||
|
msec = msec - hr * (1000 * 60 * 60);
|
||||||
|
int64_t min = msec / (1000 * 60);
|
||||||
|
msec = msec - min * (1000 * 60);
|
||||||
|
int64_t sec = msec / 1000;
|
||||||
|
msec = msec - sec * 1000;
|
||||||
|
|
||||||
|
char buf[32];
|
||||||
|
snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
|
||||||
|
|
||||||
|
return std::string(buf);
|
||||||
|
}
|
||||||
|
|
||||||
|
int timestamp_to_sample(int64_t t, int n_samples) {
|
||||||
|
return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
|
||||||
|
}
|
||||||
|
|
||||||
|
bool is_file_exist(const char *fileName)
|
||||||
|
{
|
||||||
|
std::ifstream infile(fileName);
|
||||||
|
return infile.good();
|
||||||
|
}
|
||||||
|
|
||||||
|
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params,
|
||||||
|
const server_params& sparams) {
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
fprintf(stderr, "usage: %s [options] \n", argv[0]);
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
fprintf(stderr, "options:\n");
|
||||||
|
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
||||||
|
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
||||||
|
fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors);
|
||||||
|
fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms);
|
||||||
|
fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n);
|
||||||
|
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
|
||||||
|
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
|
||||||
|
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
|
||||||
|
fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false");
|
||||||
|
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
|
||||||
|
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
|
||||||
|
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
|
||||||
|
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
|
||||||
|
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
|
||||||
|
// fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
||||||
|
fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
|
||||||
|
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
||||||
|
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
|
||||||
|
fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false");
|
||||||
|
fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
|
||||||
|
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
||||||
|
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
|
||||||
|
fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
|
||||||
|
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "true" : "false");
|
||||||
|
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
|
||||||
|
fprintf(stderr, " -dl, --detect-language [%-7s] exit after automatically detecting language\n", params.detect_language ? "true" : "false");
|
||||||
|
fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str());
|
||||||
|
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
||||||
|
fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
|
||||||
|
// server params
|
||||||
|
fprintf(stderr, " --host HOST, [%-7s] Hostname/ip-adress for the server\n", sparams.hostname.c_str());
|
||||||
|
fprintf(stderr, " --port PORT, [%-7d] Port number for the server\n", sparams.port);
|
||||||
|
fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str());
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
bool whisper_params_parse(int argc, char ** argv, whisper_params & params, server_params & sparams) {
|
||||||
|
for (int i = 1; i < argc; i++) {
|
||||||
|
std::string arg = argv[i];
|
||||||
|
|
||||||
|
if (arg == "-h" || arg == "--help") {
|
||||||
|
whisper_print_usage(argc, argv, params, sparams);
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
|
||||||
|
else if (arg == "-p" || arg == "--processors") { params.n_processors = std::stoi(argv[++i]); }
|
||||||
|
else if (arg == "-ot" || arg == "--offset-t") { params.offset_t_ms = std::stoi(argv[++i]); }
|
||||||
|
else if (arg == "-on" || arg == "--offset-n") { params.offset_n = std::stoi(argv[++i]); }
|
||||||
|
else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(argv[++i]); }
|
||||||
|
else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(argv[++i]); }
|
||||||
|
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
|
||||||
|
else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); }
|
||||||
|
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
|
||||||
|
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
|
||||||
|
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
|
||||||
|
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
|
||||||
|
// else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
||||||
|
else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
|
||||||
|
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
||||||
|
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
|
||||||
|
else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; }
|
||||||
|
else if (arg == "-sow" || arg == "--split-on-word") { params.split_on_word = true; }
|
||||||
|
else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
|
||||||
|
else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; }
|
||||||
|
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
||||||
|
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
|
||||||
|
else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
|
||||||
|
else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; }
|
||||||
|
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
|
||||||
|
else if (arg == "-dl" || arg == "--detect-language") { params.detect_language = true; }
|
||||||
|
else if ( arg == "--prompt") { params.prompt = argv[++i]; }
|
||||||
|
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
||||||
|
else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
|
||||||
|
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
|
||||||
|
// server params
|
||||||
|
else if ( arg == "--port") { sparams.port = std::stoi(argv[++i]); }
|
||||||
|
else if ( arg == "--host") { sparams.hostname = argv[++i]; }
|
||||||
|
else if ( arg == "--public") { sparams.public_path = argv[++i]; }
|
||||||
|
else {
|
||||||
|
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||||
|
whisper_print_usage(argc, argv, params, sparams);
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct whisper_print_user_data {
|
||||||
|
const whisper_params * params;
|
||||||
|
|
||||||
|
const std::vector<std::vector<float>> * pcmf32s;
|
||||||
|
int progress_prev;
|
||||||
|
};
|
||||||
|
|
||||||
|
std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
|
||||||
|
std::string speaker = "";
|
||||||
|
const int64_t n_samples = pcmf32s[0].size();
|
||||||
|
|
||||||
|
const int64_t is0 = timestamp_to_sample(t0, n_samples);
|
||||||
|
const int64_t is1 = timestamp_to_sample(t1, n_samples);
|
||||||
|
|
||||||
|
double energy0 = 0.0f;
|
||||||
|
double energy1 = 0.0f;
|
||||||
|
|
||||||
|
for (int64_t j = is0; j < is1; j++) {
|
||||||
|
energy0 += fabs(pcmf32s[0][j]);
|
||||||
|
energy1 += fabs(pcmf32s[1][j]);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (energy0 > 1.1*energy1) {
|
||||||
|
speaker = "0";
|
||||||
|
} else if (energy1 > 1.1*energy0) {
|
||||||
|
speaker = "1";
|
||||||
|
} else {
|
||||||
|
speaker = "?";
|
||||||
|
}
|
||||||
|
|
||||||
|
//printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, speaker = %s\n", is0, is1, energy0, energy1, speaker.c_str());
|
||||||
|
|
||||||
|
if (!id_only) {
|
||||||
|
speaker.insert(0, "(speaker ");
|
||||||
|
speaker.append(")");
|
||||||
|
}
|
||||||
|
|
||||||
|
return speaker;
|
||||||
|
}
|
||||||
|
|
||||||
|
void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
|
||||||
|
int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
|
||||||
|
int * progress_prev = &(((whisper_print_user_data *) user_data)->progress_prev);
|
||||||
|
if (progress >= *progress_prev + progress_step) {
|
||||||
|
*progress_prev += progress_step;
|
||||||
|
fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
|
||||||
|
const auto & params = *((whisper_print_user_data *) user_data)->params;
|
||||||
|
const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;
|
||||||
|
|
||||||
|
const int n_segments = whisper_full_n_segments(ctx);
|
||||||
|
|
||||||
|
std::string speaker = "";
|
||||||
|
|
||||||
|
int64_t t0 = 0;
|
||||||
|
int64_t t1 = 0;
|
||||||
|
|
||||||
|
// print the last n_new segments
|
||||||
|
const int s0 = n_segments - n_new;
|
||||||
|
|
||||||
|
if (s0 == 0) {
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = s0; i < n_segments; i++) {
|
||||||
|
if (!params.no_timestamps || params.diarize) {
|
||||||
|
t0 = whisper_full_get_segment_t0(ctx, i);
|
||||||
|
t1 = whisper_full_get_segment_t1(ctx, i);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!params.no_timestamps) {
|
||||||
|
printf("[%s --> %s] ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.diarize && pcmf32s.size() == 2) {
|
||||||
|
speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.print_colors) {
|
||||||
|
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
|
||||||
|
if (params.print_special == false) {
|
||||||
|
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
|
||||||
|
if (id >= whisper_token_eot(ctx)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const char * text = whisper_full_get_token_text(ctx, i, j);
|
||||||
|
const float p = whisper_full_get_token_p (ctx, i, j);
|
||||||
|
|
||||||
|
const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) (std::pow(p, 3)*float(k_colors.size()))));
|
||||||
|
|
||||||
|
printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||||
|
|
||||||
|
printf("%s%s", speaker.c_str(), text);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.tinydiarize) {
|
||||||
|
if (whisper_full_get_segment_speaker_turn_next(ctx, i)) {
|
||||||
|
printf("%s", params.tdrz_speaker_turn.c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// with timestamps or speakers: each segment on new line
|
||||||
|
if (!params.no_timestamps || params.diarize) {
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string output_str(struct whisper_context * ctx, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
|
||||||
|
std::stringstream result;
|
||||||
|
const int n_segments = whisper_full_n_segments(ctx);
|
||||||
|
for (int i = 0; i < n_segments; ++i) {
|
||||||
|
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||||
|
std::string speaker = "";
|
||||||
|
|
||||||
|
if (params.diarize && pcmf32s.size() == 2)
|
||||||
|
{
|
||||||
|
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
||||||
|
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
||||||
|
speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
|
||||||
|
}
|
||||||
|
|
||||||
|
result << speaker << text << "\n";
|
||||||
|
}
|
||||||
|
return result.str();
|
||||||
|
}
|
||||||
|
|
||||||
|
void get_req_parameters(const Request & req, whisper_params & params)
|
||||||
|
{
|
||||||
|
// user model configu.has_fileion
|
||||||
|
if (req.has_file("offset-t"))
|
||||||
|
{
|
||||||
|
params.offset_t_ms = std::stoi(req.get_file_value("offset-t").content);
|
||||||
|
}
|
||||||
|
if (req.has_file("offset-n"))
|
||||||
|
{
|
||||||
|
params.offset_n = std::stoi(req.get_file_value("offset-n").content);
|
||||||
|
}
|
||||||
|
if (req.has_file("duration"))
|
||||||
|
{
|
||||||
|
params.duration_ms = std::stoi(req.get_file_value("duration").content);
|
||||||
|
}
|
||||||
|
if (req.has_file("max-context"))
|
||||||
|
{
|
||||||
|
params.max_context = std::stoi(req.get_file_value("max-context").content);
|
||||||
|
}
|
||||||
|
if (req.has_file("prompt"))
|
||||||
|
{
|
||||||
|
params.prompt = req.get_file_value("prompt").content;
|
||||||
|
}
|
||||||
|
if (req.has_file("response-format"))
|
||||||
|
{
|
||||||
|
params.response_format = req.get_file_value("response-format").content;
|
||||||
|
}
|
||||||
|
if (req.has_file("temerature"))
|
||||||
|
{
|
||||||
|
params.userdef_temp = std::stof(req.get_file_value("temperature").content);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
int main(int argc, char ** argv) {
|
||||||
|
whisper_params params;
|
||||||
|
server_params sparams;
|
||||||
|
|
||||||
|
std::mutex whisper_mutex;
|
||||||
|
|
||||||
|
if (whisper_params_parse(argc, argv, params, sparams) == false) {
|
||||||
|
whisper_print_usage(argc, argv, params, sparams);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
|
||||||
|
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
|
||||||
|
whisper_print_usage(argc, argv, params, sparams);
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.diarize && params.tinydiarize) {
|
||||||
|
fprintf(stderr, "error: cannot use both --diarize and --tinydiarize\n");
|
||||||
|
whisper_print_usage(argc, argv, params, sparams);
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// whisper init
|
||||||
|
struct whisper_context_params cparams;
|
||||||
|
cparams.use_gpu = params.use_gpu;
|
||||||
|
|
||||||
|
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
|
||||||
|
|
||||||
|
if (ctx == nullptr) {
|
||||||
|
fprintf(stderr, "error: failed to initialize whisper context\n");
|
||||||
|
return 3;
|
||||||
|
}
|
||||||
|
|
||||||
|
// initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
|
||||||
|
whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);
|
||||||
|
|
||||||
|
Server svr;
|
||||||
|
|
||||||
|
std::string const default_content = "<html>hello</html>";
|
||||||
|
|
||||||
|
// this is only called if no index.html is found in the public --path
|
||||||
|
svr.Get("/", [&default_content](const Request &, Response &res){
|
||||||
|
res.set_content(default_content, "text/html");
|
||||||
|
return false;
|
||||||
|
});
|
||||||
|
|
||||||
|
svr.Post("/inference", [&](const Request &req, Response &res){
|
||||||
|
// aquire whisper model mutex lock
|
||||||
|
whisper_mutex.lock();
|
||||||
|
|
||||||
|
// first check user requested fields of the request
|
||||||
|
if (!req.has_file("file"))
|
||||||
|
{
|
||||||
|
fprintf(stderr, "error: no 'file' field in the request\n");
|
||||||
|
const std::string error_resp = "{\"error\":\"no 'file' field in the request\"}";
|
||||||
|
res.set_content(error_resp, "application/json");
|
||||||
|
whisper_mutex.unlock();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
auto audio_file = req.get_file_value("file");
|
||||||
|
|
||||||
|
// check non-required fields
|
||||||
|
get_req_parameters(req, params);
|
||||||
|
|
||||||
|
std::string filename{audio_file.filename};
|
||||||
|
printf("Received request: %s\n", filename.c_str());
|
||||||
|
|
||||||
|
// audio arrays
|
||||||
|
std::vector<float> pcmf32; // mono-channel F32 PCM
|
||||||
|
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
|
||||||
|
|
||||||
|
// write file to temporary file
|
||||||
|
std::ofstream temp_file{filename, std::ios::binary};
|
||||||
|
temp_file << audio_file.content;
|
||||||
|
|
||||||
|
// read wav content into pcmf32
|
||||||
|
if (!::read_wav(filename, pcmf32, pcmf32s, params.diarize)) {
|
||||||
|
fprintf(stderr, "error: failed to read WAV file '%s'\n", filename.c_str());
|
||||||
|
const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
|
||||||
|
res.set_content(error_resp, "application/json");
|
||||||
|
whisper_mutex.unlock();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// remove temp file
|
||||||
|
std::remove(filename.c_str());
|
||||||
|
|
||||||
|
printf("Successfully loaded %s\n", filename.c_str());
|
||||||
|
|
||||||
|
// print system information
|
||||||
|
{
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
|
||||||
|
params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
|
||||||
|
}
|
||||||
|
|
||||||
|
// print some info about the processing
|
||||||
|
{
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
if (!whisper_is_multilingual(ctx)) {
|
||||||
|
if (params.language != "en" || params.translate) {
|
||||||
|
params.language = "en";
|
||||||
|
params.translate = false;
|
||||||
|
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (params.detect_language) {
|
||||||
|
params.language = "auto";
|
||||||
|
}
|
||||||
|
fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, %stimestamps = %d ...\n",
|
||||||
|
__func__, filename.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
|
||||||
|
params.n_threads, params.n_processors,
|
||||||
|
params.language.c_str(),
|
||||||
|
params.translate ? "translate" : "transcribe",
|
||||||
|
params.tinydiarize ? "tdrz = 1, " : "",
|
||||||
|
params.no_timestamps ? 0 : 1);
|
||||||
|
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
// run the inference
|
||||||
|
{
|
||||||
|
|
||||||
|
printf("Running whisper.cpp inference on %s\n", filename.c_str());
|
||||||
|
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||||
|
|
||||||
|
wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
|
||||||
|
|
||||||
|
wparams.print_realtime = false;
|
||||||
|
wparams.print_progress = params.print_progress;
|
||||||
|
wparams.print_timestamps = !params.no_timestamps;
|
||||||
|
wparams.print_special = params.print_special;
|
||||||
|
wparams.translate = params.translate;
|
||||||
|
wparams.language = params.language.c_str();
|
||||||
|
wparams.detect_language = params.detect_language;
|
||||||
|
wparams.n_threads = params.n_threads;
|
||||||
|
wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
|
||||||
|
wparams.offset_ms = params.offset_t_ms;
|
||||||
|
wparams.duration_ms = params.duration_ms;
|
||||||
|
|
||||||
|
wparams.thold_pt = params.word_thold;
|
||||||
|
wparams.split_on_word = params.split_on_word;
|
||||||
|
|
||||||
|
wparams.speed_up = params.speed_up;
|
||||||
|
wparams.debug_mode = params.debug_mode;
|
||||||
|
|
||||||
|
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
|
||||||
|
|
||||||
|
wparams.initial_prompt = params.prompt.c_str();
|
||||||
|
|
||||||
|
wparams.greedy.best_of = params.best_of;
|
||||||
|
wparams.beam_search.beam_size = params.beam_size;
|
||||||
|
|
||||||
|
wparams.temperature_inc = params.userdef_temp;
|
||||||
|
wparams.entropy_thold = params.entropy_thold;
|
||||||
|
wparams.logprob_thold = params.logprob_thold;
|
||||||
|
|
||||||
|
whisper_print_user_data user_data = { ¶ms, &pcmf32s, 0 };
|
||||||
|
|
||||||
|
// this callback is called on each new segment
|
||||||
|
if (!wparams.print_realtime) {
|
||||||
|
wparams.new_segment_callback = whisper_print_segment_callback;
|
||||||
|
wparams.new_segment_callback_user_data = &user_data;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (wparams.print_progress) {
|
||||||
|
wparams.progress_callback = whisper_print_progress_callback;
|
||||||
|
wparams.progress_callback_user_data = &user_data;
|
||||||
|
}
|
||||||
|
|
||||||
|
// examples for abort mechanism
|
||||||
|
// in examples below, we do not abort the processing, but we could if the flag is set to true
|
||||||
|
|
||||||
|
// the callback is called before every encoder run - if it returns false, the processing is aborted
|
||||||
|
{
|
||||||
|
static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
|
||||||
|
|
||||||
|
wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
|
||||||
|
bool is_aborted = *(bool*)user_data;
|
||||||
|
return !is_aborted;
|
||||||
|
};
|
||||||
|
wparams.encoder_begin_callback_user_data = &is_aborted;
|
||||||
|
}
|
||||||
|
|
||||||
|
// the callback is called before every computation - if it returns true, the computation is aborted
|
||||||
|
{
|
||||||
|
static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
|
||||||
|
|
||||||
|
wparams.abort_callback = [](void * user_data) {
|
||||||
|
bool is_aborted = *(bool*)user_data;
|
||||||
|
return is_aborted;
|
||||||
|
};
|
||||||
|
wparams.abort_callback_user_data = &is_aborted;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
|
||||||
|
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
|
||||||
|
const std::string error_resp = "{\"error\":\"failed to process audio\"}";
|
||||||
|
res.set_content(error_resp, "application/json");
|
||||||
|
whisper_mutex.unlock();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// return results to user
|
||||||
|
if (params.response_format == text_format)
|
||||||
|
{
|
||||||
|
std::string results = output_str(ctx, params, pcmf32s);
|
||||||
|
res.set_content(results.c_str(), "text/html");
|
||||||
|
}
|
||||||
|
// TODO add more output formats
|
||||||
|
else
|
||||||
|
{
|
||||||
|
std::string results = output_str(ctx, params, pcmf32s);
|
||||||
|
json jres = json{
|
||||||
|
{"text", results}
|
||||||
|
};
|
||||||
|
res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
|
||||||
|
"application/json");
|
||||||
|
}
|
||||||
|
|
||||||
|
// return whisper model mutex lock
|
||||||
|
whisper_mutex.unlock();
|
||||||
|
});
|
||||||
|
svr.Post("/load", [&](const Request &req, Response &res){
|
||||||
|
whisper_mutex.lock();
|
||||||
|
if (!req.has_file("model"))
|
||||||
|
{
|
||||||
|
fprintf(stderr, "error: no 'model' field in the request\n");
|
||||||
|
const std::string error_resp = "{\"error\":\"no 'model' field in the request\"}";
|
||||||
|
res.set_content(error_resp, "application/json");
|
||||||
|
whisper_mutex.unlock();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
std::string model = req.get_file_value("model").content;
|
||||||
|
if (!is_file_exist(model.c_str()))
|
||||||
|
{
|
||||||
|
fprintf(stderr, "error: 'model': %s not found!\n", model.c_str());
|
||||||
|
const std::string error_resp = "{\"error\":\"model not found!\"}";
|
||||||
|
res.set_content(error_resp, "application/json");
|
||||||
|
whisper_mutex.unlock();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// clean up
|
||||||
|
whisper_free(ctx);
|
||||||
|
|
||||||
|
// whisper init
|
||||||
|
ctx = whisper_init_from_file_with_params(model.c_str(), cparams);
|
||||||
|
|
||||||
|
// TODO perhaps load prior model here instead of exit
|
||||||
|
if (ctx == nullptr) {
|
||||||
|
fprintf(stderr, "error: model init failed, no model loaded must exit\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
|
||||||
|
whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);
|
||||||
|
|
||||||
|
const std::string success = "Load was successful!";
|
||||||
|
res.set_content(success, "application/text");
|
||||||
|
|
||||||
|
// check if the model is in the file system
|
||||||
|
whisper_mutex.unlock();
|
||||||
|
});
|
||||||
|
|
||||||
|
svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
|
||||||
|
const char fmt[] = "500 Internal Server Error\n%s";
|
||||||
|
char buf[BUFSIZ];
|
||||||
|
try {
|
||||||
|
std::rethrow_exception(std::move(ep));
|
||||||
|
} catch (std::exception &e) {
|
||||||
|
snprintf(buf, sizeof(buf), fmt, e.what());
|
||||||
|
} catch (...) {
|
||||||
|
snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
|
||||||
|
}
|
||||||
|
res.set_content(buf, "text/plain");
|
||||||
|
res.status = 500;
|
||||||
|
});
|
||||||
|
|
||||||
|
svr.set_error_handler([](const Request &, Response &res) {
|
||||||
|
if (res.status == 400) {
|
||||||
|
res.set_content("Invalid request", "text/plain");
|
||||||
|
} else if (res.status != 500) {
|
||||||
|
res.set_content("File Not Found", "text/plain");
|
||||||
|
res.status = 404;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// set timeouts and change hostname and port
|
||||||
|
svr.set_read_timeout(sparams.read_timeout);
|
||||||
|
svr.set_write_timeout(sparams.write_timeout);
|
||||||
|
|
||||||
|
if (!svr.bind_to_port(sparams.hostname, sparams.port))
|
||||||
|
{
|
||||||
|
fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n",
|
||||||
|
sparams.hostname.c_str(), sparams.port);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Set the base directory for serving static files
|
||||||
|
svr.set_base_dir(sparams.public_path);
|
||||||
|
|
||||||
|
// to make it ctrl+clickable:
|
||||||
|
printf("\nwhisper server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
|
||||||
|
|
||||||
|
if (!svr.listen_after_bind())
|
||||||
|
{
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
whisper_print_timings(ctx);
|
||||||
|
whisper_free(ctx);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
@ -53,7 +53,7 @@ struct whisper_params {
|
|||||||
int32_t capture_id = -1;
|
int32_t capture_id = -1;
|
||||||
int32_t max_tokens = 32;
|
int32_t max_tokens = 32;
|
||||||
int32_t audio_ctx = 0;
|
int32_t audio_ctx = 0;
|
||||||
int32_t n_gpu_layers = 0;
|
int32_t n_gpu_layers = 999;
|
||||||
|
|
||||||
float vad_thold = 0.6f;
|
float vad_thold = 0.6f;
|
||||||
float freq_thold = 100.0f;
|
float freq_thold = 100.0f;
|
||||||
@ -136,7 +136,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
|||||||
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
|
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
|
||||||
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
|
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
|
||||||
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
|
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
|
||||||
fprintf(stderr, " -ngl N, --n-gpu-layers N [%-7s] number of layers to store in VRAM\n", params.n_gpu_layers);
|
fprintf(stderr, " -ngl N, --n-gpu-layers N [%-7d] number of layers to store in VRAM\n", params.n_gpu_layers);
|
||||||
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
|
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
|
||||||
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
|
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
|
||||||
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
||||||
@ -686,8 +686,8 @@ int main(int argc, char ** argv) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
text_to_speak = ::replace(text_to_speak, "\"", "");
|
text_to_speak = ::replace(text_to_speak, "'", "'\"'\"'");
|
||||||
int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
|
int ret = system((params.speak + " " + std::to_string(voice_id) + " '" + text_to_speak + "'").c_str());
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
fprintf(stderr, "%s: failed to speak\n", __func__);
|
fprintf(stderr, "%s: failed to speak\n", __func__);
|
||||||
}
|
}
|
||||||
|
@ -21,7 +21,7 @@ help()
|
|||||||
echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]"
|
echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]"
|
||||||
echo "options:"
|
echo "options:"
|
||||||
echo "-s Step in seconds (default is $step)."
|
echo "-s Step in seconds (default is $step)."
|
||||||
echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large' (default is '$model')."
|
echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large-v3' (default is '$model')."
|
||||||
echo "-t Number of threads to use."
|
echo "-t Number of threads to use."
|
||||||
echo "-h Print this help page."
|
echo "-h Print this help page."
|
||||||
echo
|
echo
|
||||||
|
@ -17,12 +17,12 @@ else
|
|||||||
encoder_only=$2
|
encoder_only=$2
|
||||||
fi
|
fi
|
||||||
|
|
||||||
models=( \
|
models=( \
|
||||||
"tiny" "tiny-q4_0" "tiny-q4_1" "tiny-q5_0" "tiny-q5_1" "tiny-q8_0" \
|
"tiny" "tiny-q4_0" "tiny-q4_1" "tiny-q5_0" "tiny-q5_1" "tiny-q8_0" \
|
||||||
"base" "base-q4_0" "base-q4_1" "base-q5_0" "base-q5_1" "base-q8_0" \
|
"base" "base-q4_0" "base-q4_1" "base-q5_0" "base-q5_1" "base-q8_0" \
|
||||||
"small" "small-q4_0" "small-q4_1" "small-q5_0" "small-q5_1" "small-q8_0" \
|
"small" "small-q4_0" "small-q4_1" "small-q5_0" "small-q5_1" "small-q8_0" \
|
||||||
"medium" "medium-q4_0" "medium-q4_1" "medium-q5_0" "medium-q5_1" "medium-q8_0" \
|
"medium" "medium-q4_0" "medium-q4_1" "medium-q5_0" "medium-q5_1" "medium-q8_0" "medium-dis" \
|
||||||
"large" "large-q4_0" "large-q4_1" "large-q5_0" "large-q5_1" "large-q8_0" \
|
"large-v2" "large-v2-q4_0" "large-v2-q4_1" "large-v2-q5_0" "large-v2-q5_1" "large-v2-q8_0" "large-v2-dis" \
|
||||||
)
|
)
|
||||||
|
|
||||||
if [ "$encoder_only" -eq 0 ]; then
|
if [ "$encoder_only" -eq 0 ]; then
|
||||||
@ -44,8 +44,8 @@ if [ "$encoder_only" -eq 0 ]; then
|
|||||||
printf "\n"
|
printf "\n"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
printf "| %6s | %6s | %16s | %11s | %3s | %7s | %7s | %7s | %7s |\n" "CPU" "OS" "Config" "Model" "Th" "Enc." "Dec." "PP" "Commit"
|
printf "| %6s | %6s | %16s | %13s | %3s | %7s | %7s | %7s | %7s | %7s |\n" "CPU" "OS" "Config" "Model" "Th" "Enc." "Dec." "Bch5" "PP" "Commit"
|
||||||
printf "| %6s | %6s | %16s | %11s | %3s | %7s | %7s | %7s | %7s |\n" "---" "---" "---" "---" "---" "---" "---" "---" "---"
|
printf "| %6s | %6s | %16s | %13s | %3s | %7s | %7s | %7s | %7s | %7s |\n" "---" "---" "---" "---" "---" "---" "---" "---" "---" "---"
|
||||||
|
|
||||||
for model in "${models[@]}"; do
|
for model in "${models[@]}"; do
|
||||||
# actual run
|
# actual run
|
||||||
@ -56,6 +56,7 @@ for model in "${models[@]}"; do
|
|||||||
# parse the output:
|
# parse the output:
|
||||||
encode_time=$(echo "$output" | grep "encode time" | awk '{print $11}')
|
encode_time=$(echo "$output" | grep "encode time" | awk '{print $11}')
|
||||||
decode_time=$(echo "$output" | grep "decode time" | awk '{print $11}')
|
decode_time=$(echo "$output" | grep "decode time" | awk '{print $11}')
|
||||||
|
batchd_time=$(echo "$output" | grep "batchd time" | awk '{print $11}')
|
||||||
prompt_time=$(echo "$output" | grep "prompt time" | awk '{print $11}')
|
prompt_time=$(echo "$output" | grep "prompt time" | awk '{print $11}')
|
||||||
system_info=$(echo "$output" | grep "system_info")
|
system_info=$(echo "$output" | grep "system_info")
|
||||||
n_threads=$(echo "$output" | grep "system_info" | awk '{print $4}')
|
n_threads=$(echo "$output" | grep "system_info" | awk '{print $4}')
|
||||||
@ -94,6 +95,6 @@ for model in "${models[@]}"; do
|
|||||||
commit=$(git rev-parse --short HEAD)
|
commit=$(git rev-parse --short HEAD)
|
||||||
|
|
||||||
if [ $ret -eq 0 ]; then
|
if [ $ret -eq 0 ]; then
|
||||||
printf "| <todo> | <todo> | %16s | %11s | %3s | %7s | %7s | %7s | %7s |\n" "$config" "$model" "$n_threads" "$encode_time" "$decode_time" "$prompt_time" "$commit"
|
printf "| <todo> | <todo> | %16s | %13s | %3s | %7s | %7s | %7s | %7s | %7s |\n" "$config" "$model" "$n_threads" "$encode_time" "$decode_time" "$batchd_time" "$prompt_time" "$commit"
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
|
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )
|
||||||
|
|
||||||
for model in "${models[@]}"; do
|
for model in "${models[@]}"; do
|
||||||
python3 models/convert-pt-to-ggml.py ~/.cache/whisper/$model.pt ../whisper models/
|
python3 models/convert-pt-to-ggml.py ~/.cache/whisper/$model.pt ../whisper models/
|
||||||
|
23
ggml-alloc.c
23
ggml-alloc.c
@ -446,12 +446,14 @@ static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * n
|
|||||||
return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)];
|
return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)];
|
||||||
}
|
}
|
||||||
|
|
||||||
static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view) {
|
static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
|
||||||
ggml_tallocr_t alloc = node_tallocr(galloc, view);
|
ggml_tallocr_t alloc = node_tallocr(galloc, view);
|
||||||
|
|
||||||
//printf("init_view: %s from src %s\n", view->name, view->view_src->name);
|
//printf("init_view: %s from src %s\n", view->name, view->view_src->name);
|
||||||
GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
|
GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
|
||||||
view->backend = view->view_src->backend;
|
if (update_backend) {
|
||||||
|
view->backend = view->view_src->backend;
|
||||||
|
}
|
||||||
view->buffer = view->view_src->buffer;
|
view->buffer = view->view_src->buffer;
|
||||||
view->data = (char *)view->view_src->data + view->view_offs;
|
view->data = (char *)view->view_src->data + view->view_offs;
|
||||||
|
|
||||||
@ -469,7 +471,7 @@ static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
|
|||||||
|
|
||||||
if (node->data == NULL) {
|
if (node->data == NULL) {
|
||||||
if (ggml_is_view(node)) {
|
if (ggml_is_view(node)) {
|
||||||
init_view(galloc, node);
|
init_view(galloc, node, true);
|
||||||
} else {
|
} else {
|
||||||
// see if we can reuse a parent's buffer (inplace)
|
// see if we can reuse a parent's buffer (inplace)
|
||||||
if (ggml_op_can_inplace(node->op)) {
|
if (ggml_op_can_inplace(node->op)) {
|
||||||
@ -499,15 +501,14 @@ static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
|
|||||||
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
|
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
|
||||||
node->view_src = view_src;
|
node->view_src = view_src;
|
||||||
view_src_hn->n_views += 1;
|
view_src_hn->n_views += 1;
|
||||||
init_view(galloc, node);
|
init_view(galloc, node, false);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
} else {
|
||||||
else {
|
|
||||||
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
|
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
|
||||||
node->view_src = parent;
|
node->view_src = parent;
|
||||||
p_hn->n_views += 1;
|
p_hn->n_views += 1;
|
||||||
init_view(galloc, node);
|
init_view(galloc, node, false);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -537,7 +538,7 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
|
|||||||
hash_get(galloc, view_src)->n_views += 1;
|
hash_get(galloc, view_src)->n_views += 1;
|
||||||
if (node->buffer == NULL && node->data != NULL) {
|
if (node->buffer == NULL && node->data != NULL) {
|
||||||
// view of a pre-allocated tensor, didn't call init_view() yet
|
// view of a pre-allocated tensor, didn't call init_view() yet
|
||||||
init_view(galloc, node);
|
init_view(galloc, node, true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -548,7 +549,7 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
|
|||||||
}
|
}
|
||||||
hash_get(galloc, parent)->n_children += 1;
|
hash_get(galloc, parent)->n_children += 1;
|
||||||
if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
|
if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
|
||||||
init_view(galloc, parent);
|
init_view(galloc, parent, true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -663,7 +664,7 @@ size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, st
|
|||||||
return max_size;
|
return max_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_alloct) {
|
void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) {
|
||||||
const size_t hash_size = hash_set.size;
|
const size_t hash_size = hash_set.size;
|
||||||
|
|
||||||
GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
|
GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
|
||||||
@ -686,7 +687,7 @@ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * grap
|
|||||||
// reset hash values
|
// reset hash values
|
||||||
memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
|
memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
|
||||||
|
|
||||||
galloc->hash_allocs = hash_node_alloct;
|
galloc->hash_allocs = hash_node_talloc;
|
||||||
|
|
||||||
ggml_tallocr_alloc_graph_impl(galloc, graph);
|
ggml_tallocr_alloc_graph_impl(galloc, graph);
|
||||||
|
|
||||||
|
309
ggml-cuda.cu
309
ggml-cuda.cu
@ -39,7 +39,6 @@
|
|||||||
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
|
#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
|
||||||
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
|
#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
|
||||||
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
|
#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
|
||||||
#define cudaDeviceGetMemPool hipDeviceGetMemPool
|
|
||||||
#define cudaDeviceProp hipDeviceProp_t
|
#define cudaDeviceProp hipDeviceProp_t
|
||||||
#define cudaDeviceSynchronize hipDeviceSynchronize
|
#define cudaDeviceSynchronize hipDeviceSynchronize
|
||||||
#define cudaError_t hipError_t
|
#define cudaError_t hipError_t
|
||||||
@ -49,7 +48,6 @@
|
|||||||
#define cudaEvent_t hipEvent_t
|
#define cudaEvent_t hipEvent_t
|
||||||
#define cudaEventDestroy hipEventDestroy
|
#define cudaEventDestroy hipEventDestroy
|
||||||
#define cudaFree hipFree
|
#define cudaFree hipFree
|
||||||
#define cudaFreeAsync hipFreeAsync
|
|
||||||
#define cudaFreeHost hipHostFree
|
#define cudaFreeHost hipHostFree
|
||||||
#define cudaGetDevice hipGetDevice
|
#define cudaGetDevice hipGetDevice
|
||||||
#define cudaGetDeviceCount hipGetDeviceCount
|
#define cudaGetDeviceCount hipGetDeviceCount
|
||||||
@ -57,7 +55,6 @@
|
|||||||
#define cudaGetErrorString hipGetErrorString
|
#define cudaGetErrorString hipGetErrorString
|
||||||
#define cudaGetLastError hipGetLastError
|
#define cudaGetLastError hipGetLastError
|
||||||
#define cudaMalloc hipMalloc
|
#define cudaMalloc hipMalloc
|
||||||
#define cudaMallocFromPoolAsync hipMallocFromPoolAsync
|
|
||||||
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
|
||||||
#define cudaMemcpy hipMemcpy
|
#define cudaMemcpy hipMemcpy
|
||||||
#define cudaMemcpy2DAsync hipMemcpy2DAsync
|
#define cudaMemcpy2DAsync hipMemcpy2DAsync
|
||||||
@ -66,9 +63,6 @@
|
|||||||
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
|
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
|
||||||
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
|
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
|
||||||
#define cudaMemcpyKind hipMemcpyKind
|
#define cudaMemcpyKind hipMemcpyKind
|
||||||
#define cudaMemPool_t hipMemPool_t
|
|
||||||
#define cudaMemPoolAttrReleaseThreshold hipMemPoolAttrReleaseThreshold
|
|
||||||
#define cudaMemPoolSetAttribute hipMemPoolSetAttribute
|
|
||||||
#define cudaMemset hipMemset
|
#define cudaMemset hipMemset
|
||||||
#define cudaMemsetAsync hipMemsetAsync
|
#define cudaMemsetAsync hipMemsetAsync
|
||||||
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
|
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
|
||||||
@ -94,6 +88,8 @@
|
|||||||
#define CC_OFFSET_AMD 1000000
|
#define CC_OFFSET_AMD 1000000
|
||||||
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
|
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
|
||||||
|
|
||||||
|
#define GGML_CUDA_MAX_NODES 8192
|
||||||
|
|
||||||
// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
|
// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
|
||||||
// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
|
// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
|
||||||
// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
|
// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
|
||||||
@ -188,11 +184,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
|||||||
do { \
|
do { \
|
||||||
cudaError_t err_ = (err); \
|
cudaError_t err_ = (err); \
|
||||||
if (err_ != cudaSuccess) { \
|
if (err_ != cudaSuccess) { \
|
||||||
int dev_id; \
|
int id; \
|
||||||
cudaGetDevice(&dev_id); \
|
cudaGetDevice(&id); \
|
||||||
fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
|
fprintf(stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
|
||||||
cudaGetErrorString(err_)); \
|
cudaGetErrorString(err_)); \
|
||||||
fprintf(stderr, "current device: %d\n", dev_id); \
|
fprintf(stderr, "current device: %d\n", id); \
|
||||||
exit(1); \
|
exit(1); \
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
@ -202,11 +198,11 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
|
|||||||
do { \
|
do { \
|
||||||
cublasStatus_t err_ = (err); \
|
cublasStatus_t err_ = (err); \
|
||||||
if (err_ != CUBLAS_STATUS_SUCCESS) { \
|
if (err_ != CUBLAS_STATUS_SUCCESS) { \
|
||||||
int dev_id; \
|
int id; \
|
||||||
cudaGetDevice(&dev_id); \
|
cudaGetDevice(&id); \
|
||||||
fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
|
fprintf(stderr, "\ncuBLAS error %d at %s:%d: %s\n", \
|
||||||
err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
|
err_, __FILE__, __LINE__, cublasGetStatusString(err_)); \
|
||||||
fprintf(stderr, "current device: %d\n", dev_id); \
|
fprintf(stderr, "current device: %d\n", id); \
|
||||||
exit(1); \
|
exit(1); \
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
@ -440,6 +436,8 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
|
|||||||
#define CUDA_MUL_BLOCK_SIZE 256
|
#define CUDA_MUL_BLOCK_SIZE 256
|
||||||
#define CUDA_GELU_BLOCK_SIZE 256
|
#define CUDA_GELU_BLOCK_SIZE 256
|
||||||
#define CUDA_SILU_BLOCK_SIZE 256
|
#define CUDA_SILU_BLOCK_SIZE 256
|
||||||
|
#define CUDA_RELU_BLOCK_SIZE 256
|
||||||
|
#define CUDA_SQR_BLOCK_SIZE 256
|
||||||
#define CUDA_CPY_BLOCK_SIZE 32
|
#define CUDA_CPY_BLOCK_SIZE 32
|
||||||
#define CUDA_SCALE_BLOCK_SIZE 256
|
#define CUDA_SCALE_BLOCK_SIZE 256
|
||||||
#define CUDA_CLAMP_BLOCK_SIZE 256
|
#define CUDA_CLAMP_BLOCK_SIZE 256
|
||||||
@ -472,7 +470,6 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
|
|||||||
|
|
||||||
#define MAX_STREAMS 8
|
#define MAX_STREAMS 8
|
||||||
static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
|
static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { nullptr };
|
||||||
static cudaMemPool_t g_cudaMemPools[GGML_CUDA_MAX_DEVICES] = { nullptr };
|
|
||||||
|
|
||||||
struct ggml_tensor_extra_gpu {
|
struct ggml_tensor_extra_gpu {
|
||||||
void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
|
void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
|
||||||
@ -561,6 +558,24 @@ static __global__ void silu_f32(const float * x, float * dst, const int k) {
|
|||||||
dst[i] = x[i] / (1.0f + expf(-x[i]));
|
dst[i] = x[i] / (1.0f + expf(-x[i]));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static __global__ void relu_f32(const float * x, float * dst, const int k) {
|
||||||
|
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||||
|
|
||||||
|
if (i >= k) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
dst[i] = fmaxf(x[i], 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
static __global__ void sqr_f32(const float * x, float * dst, const int k) {
|
||||||
|
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||||
|
|
||||||
|
if (i >= k) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
dst[i] = x[i] * x[i];
|
||||||
|
}
|
||||||
|
|
||||||
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
|
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
|
||||||
#pragma unroll
|
#pragma unroll
|
||||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||||
@ -990,7 +1005,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
|||||||
|
|
||||||
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
||||||
|
|
||||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||||
if (row > nrows) return;
|
if (row > nrows) return;
|
||||||
|
|
||||||
const int num_blocks_per_row = ncols / QK_K;
|
const int num_blocks_per_row = ncols / QK_K;
|
||||||
@ -1094,7 +1109,7 @@ static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx,
|
|||||||
|
|
||||||
static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
||||||
|
|
||||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||||
if (row > nrows) return;
|
if (row > nrows) return;
|
||||||
|
|
||||||
const int num_blocks_per_row = ncols / QK_K;
|
const int num_blocks_per_row = ncols / QK_K;
|
||||||
@ -1198,7 +1213,7 @@ static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx,
|
|||||||
|
|
||||||
static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) {
|
||||||
|
|
||||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||||
if (row > nrows) return;
|
if (row > nrows) return;
|
||||||
const int num_blocks_per_row = ncols / QK_K;
|
const int num_blocks_per_row = ncols / QK_K;
|
||||||
const int ib0 = row*num_blocks_per_row;
|
const int ib0 = row*num_blocks_per_row;
|
||||||
@ -1452,7 +1467,7 @@ static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx,
|
|||||||
|
|
||||||
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
|
||||||
|
|
||||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||||
if (row > nrows) return;
|
if (row > nrows) return;
|
||||||
|
|
||||||
const int num_blocks_per_row = ncols / QK_K;
|
const int num_blocks_per_row = ncols / QK_K;
|
||||||
@ -4262,7 +4277,7 @@ template <bool need_check> static __global__ void
|
|||||||
|
|
||||||
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
|
template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
|
||||||
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
|
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) {
|
||||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||||
|
|
||||||
if (row >= nrows) {
|
if (row >= nrows) {
|
||||||
return;
|
return;
|
||||||
@ -4302,7 +4317,7 @@ template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
|
|||||||
static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
|
static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) {
|
||||||
// qk = quantized weights per x block
|
// qk = quantized weights per x block
|
||||||
// qr = number of quantized weights per data value in x block
|
// qr = number of quantized weights per data value in x block
|
||||||
const int row = blockIdx.y*blockDim.y + threadIdx.y;
|
const int row = blockIdx.x*blockDim.y + threadIdx.y;
|
||||||
|
|
||||||
if (row >= nrows) {
|
if (row >= nrows) {
|
||||||
return;
|
return;
|
||||||
@ -4741,7 +4756,7 @@ static __global__ void im2col_f32_f16(
|
|||||||
int ofs0, int ofs1, int IW, int IH, int CHW,
|
int ofs0, int ofs1, int IW, int IH, int CHW,
|
||||||
int s0, int s1, int p0, int p1, int d0, int d1) {
|
int s0, int s1, int p0, int p1, int d0, int d1) {
|
||||||
const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0;
|
const int iiw = blockIdx.z * s0 + threadIdx.z * d0 - p0;
|
||||||
const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1;
|
const int iih = blockIdx.y * s1 + threadIdx.y * d1 - p1;
|
||||||
|
|
||||||
const int offset_dst =
|
const int offset_dst =
|
||||||
(threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW +
|
(threadIdx.x * gridDim.y * gridDim.z + blockIdx.y * gridDim.z + blockIdx.z) * CHW +
|
||||||
@ -4793,6 +4808,16 @@ static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
|
|||||||
silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||||
|
const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
|
||||||
|
relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||||
|
const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
|
||||||
|
sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||||
|
}
|
||||||
|
|
||||||
static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
GGML_ASSERT(ncols % WARP_SIZE == 0);
|
||||||
if (ncols < 1024) {
|
if (ncols < 1024) {
|
||||||
@ -4901,7 +4926,8 @@ static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cu
|
|||||||
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
// the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
|
||||||
|
const dim3 block_nums(block_num_y, 1, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
|
dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||||
@ -4910,7 +4936,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y,
|
|||||||
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(block_num_y, 1, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
|
dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||||
@ -4919,7 +4945,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y,
|
|||||||
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(block_num_y, 1, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
|
dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||||
@ -4928,7 +4954,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y,
|
|||||||
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(block_num_y, 1, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
|
dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||||
@ -4937,7 +4963,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y,
|
|||||||
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(block_num_y, 1, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
|
dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||||
@ -4947,7 +4973,7 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, f
|
|||||||
GGML_ASSERT(ncols % QK_K == 0);
|
GGML_ASSERT(ncols % QK_K == 0);
|
||||||
const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
|
const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
|
||||||
const int block_num_y = (nrows + ny - 1) / ny;
|
const int block_num_y = (nrows + ny - 1) / ny;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(block_num_y, 1, 1);
|
||||||
const dim3 block_dims(32, ny, 1);
|
const dim3 block_dims(32, ny, 1);
|
||||||
dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
dequantize_mul_mat_vec_q2_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||||
}
|
}
|
||||||
@ -4956,7 +4982,7 @@ static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, f
|
|||||||
GGML_ASSERT(ncols % QK_K == 0);
|
GGML_ASSERT(ncols % QK_K == 0);
|
||||||
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
||||||
const int block_num_y = (nrows + ny - 1) / ny;
|
const int block_num_y = (nrows + ny - 1) / ny;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(block_num_y, 1, 1);
|
||||||
const dim3 block_dims(32, ny, 1);
|
const dim3 block_dims(32, ny, 1);
|
||||||
dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
dequantize_mul_mat_vec_q3_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||||
}
|
}
|
||||||
@ -4965,7 +4991,7 @@ static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, f
|
|||||||
GGML_ASSERT(ncols % QK_K == 0);
|
GGML_ASSERT(ncols % QK_K == 0);
|
||||||
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
||||||
const int block_num_y = (nrows + ny - 1) / ny;
|
const int block_num_y = (nrows + ny - 1) / ny;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(block_num_y, 1, 1);
|
||||||
const dim3 block_dims(32, ny, 1);
|
const dim3 block_dims(32, ny, 1);
|
||||||
dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
dequantize_mul_mat_vec_q4_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||||
}
|
}
|
||||||
@ -4980,7 +5006,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
|
|||||||
GGML_ASSERT(ncols % QK_K == 0);
|
GGML_ASSERT(ncols % QK_K == 0);
|
||||||
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
const int ny = 2 / K_QUANTS_PER_ITERATION;
|
||||||
const int block_num_y = (nrows + ny - 1) / ny;
|
const int block_num_y = (nrows + ny - 1) / ny;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(block_num_y, 1, 1);
|
||||||
const dim3 block_dims(32, ny, 1);
|
const dim3 block_dims(32, ny, 1);
|
||||||
dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
dequantize_mul_mat_vec_q6_k<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||||
}
|
}
|
||||||
@ -4988,7 +5014,7 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, f
|
|||||||
static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % QK4_0 == 0);
|
GGML_ASSERT(ncols % QK4_0 == 0);
|
||||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(block_num_y, 1, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
|
mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||||
@ -4997,7 +5023,7 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float *
|
|||||||
static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % QK4_1 == 0);
|
GGML_ASSERT(ncols % QK4_1 == 0);
|
||||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(block_num_y, 1, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
|
mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||||
@ -5006,7 +5032,7 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float *
|
|||||||
static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % QK5_0 == 0);
|
GGML_ASSERT(ncols % QK5_0 == 0);
|
||||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(block_num_y, 1, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
|
mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||||
@ -5015,7 +5041,7 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float *
|
|||||||
static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % QK5_1 == 0);
|
GGML_ASSERT(ncols % QK5_1 == 0);
|
||||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(block_num_y, 1, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
|
mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||||
@ -5024,7 +5050,7 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float *
|
|||||||
static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % QK8_0 == 0);
|
GGML_ASSERT(ncols % QK8_0 == 0);
|
||||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(block_num_y, 1, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
|
mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||||
@ -5033,7 +5059,7 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float *
|
|||||||
static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % QK_K == 0);
|
GGML_ASSERT(ncols % QK_K == 0);
|
||||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(block_num_y, 1, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
|
mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||||
@ -5042,7 +5068,7 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float *
|
|||||||
static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % QK_K == 0);
|
GGML_ASSERT(ncols % QK_K == 0);
|
||||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(block_num_y, 1, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
|
mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||||
@ -5051,7 +5077,7 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float *
|
|||||||
static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % QK_K == 0);
|
GGML_ASSERT(ncols % QK_K == 0);
|
||||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(block_num_y, 1, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
|
mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||||
@ -5060,7 +5086,7 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float *
|
|||||||
static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % QK_K == 0);
|
GGML_ASSERT(ncols % QK_K == 0);
|
||||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(block_num_y, 1, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
|
mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||||
@ -5069,7 +5095,7 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float *
|
|||||||
static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % QK_K == 0);
|
GGML_ASSERT(ncols % QK_K == 0);
|
||||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(block_num_y, 1, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
|
mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
<<<block_nums, block_dims, 0, stream>>>(vx, vy, dst, ncols, nrows);
|
||||||
@ -5088,7 +5114,7 @@ static void convert_fp32_to_fp16_cuda(const void * vx, half * y, const int k, cu
|
|||||||
static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
|
||||||
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
|
||||||
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
|
||||||
const dim3 block_nums(1, block_num_y, 1);
|
const dim3 block_nums(block_num_y, 1, 1);
|
||||||
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1);
|
||||||
dequantize_mul_mat_vec<1, 1, convert_f16>
|
dequantize_mul_mat_vec<1, 1, convert_f16>
|
||||||
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols, nrows);
|
||||||
@ -5825,16 +5851,6 @@ static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
|
|||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void * ggml_cuda_pool_malloc_async(size_t size, size_t * actual_size, int id, cudaStream_t stream) {
|
|
||||||
if (g_cudaMemPools[id] == nullptr) {
|
|
||||||
return ggml_cuda_pool_malloc(size, actual_size);
|
|
||||||
}
|
|
||||||
void *ptr;
|
|
||||||
CUDA_CHECK(cudaMallocFromPoolAsync(&ptr, size, g_cudaMemPools[id], stream));
|
|
||||||
*actual_size = size;
|
|
||||||
return ptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_cuda_pool_free(void * ptr, size_t size) {
|
static void ggml_cuda_pool_free(void * ptr, size_t size) {
|
||||||
scoped_spin_lock lock(g_cuda_pool_lock);
|
scoped_spin_lock lock(g_cuda_pool_lock);
|
||||||
int id;
|
int id;
|
||||||
@ -5852,12 +5868,10 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
|
|||||||
CUDA_CHECK(cudaFree(ptr));
|
CUDA_CHECK(cudaFree(ptr));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool g_cublas_loaded = false;
|
||||||
|
|
||||||
static void ggml_cuda_pool_free_async(void * ptr, size_t actual_size, int id, cudaStream_t stream) {
|
bool ggml_cublas_loaded(void) {
|
||||||
if (g_cudaMemPools[id] == nullptr) {
|
return g_cublas_loaded;
|
||||||
return ggml_cuda_pool_free(ptr, actual_size);
|
|
||||||
}
|
|
||||||
CUDA_CHECK(cudaFreeAsync(ptr, stream));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_init_cublas() {
|
void ggml_init_cublas() {
|
||||||
@ -5872,7 +5886,12 @@ void ggml_init_cublas() {
|
|||||||
CUDA_CHECK(cudaDeviceSynchronize());
|
CUDA_CHECK(cudaDeviceSynchronize());
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
CUDA_CHECK(cudaGetDeviceCount(&g_device_count));
|
if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
|
||||||
|
initialized = true;
|
||||||
|
g_cublas_loaded = false;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
|
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
|
||||||
int64_t total_vram = 0;
|
int64_t total_vram = 0;
|
||||||
#if defined(GGML_CUDA_FORCE_MMQ)
|
#if defined(GGML_CUDA_FORCE_MMQ)
|
||||||
@ -5914,19 +5933,13 @@ void ggml_init_cublas() {
|
|||||||
// create cublas handle
|
// create cublas handle
|
||||||
CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
|
CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id]));
|
||||||
CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
|
CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH));
|
||||||
|
|
||||||
// configure memory pool
|
|
||||||
cudaError_t err = cudaDeviceGetMemPool(&g_cudaMemPools[id], id);
|
|
||||||
if (err == cudaSuccess) {
|
|
||||||
size_t treshold = UINT64_MAX;
|
|
||||||
CUDA_CHECK(cudaMemPoolSetAttribute(g_cudaMemPools[id], cudaMemPoolAttrReleaseThreshold, &treshold));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// configure logging to stdout
|
// configure logging to stdout
|
||||||
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
|
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
|
||||||
|
|
||||||
initialized = true;
|
initialized = true;
|
||||||
|
g_cublas_loaded = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -6129,6 +6142,9 @@ inline void ggml_cuda_op_add(
|
|||||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||||
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
||||||
|
|
||||||
|
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||||
|
GGML_ASSERT(ggml_is_contiguous(src1));
|
||||||
|
|
||||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
const int64_t ne10 = src1->ne[0];
|
const int64_t ne10 = src1->ne[0];
|
||||||
@ -6193,6 +6209,34 @@ inline void ggml_cuda_op_silu(
|
|||||||
(void) src1_dd;
|
(void) src1_dd;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline void ggml_cuda_op_relu(
|
||||||
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||||
|
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
||||||
|
|
||||||
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||||
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
|
relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
|
||||||
|
|
||||||
|
(void) src1;
|
||||||
|
(void) dst;
|
||||||
|
(void) src1_dd;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void ggml_cuda_op_sqr(
|
||||||
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||||
|
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
||||||
|
|
||||||
|
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||||
|
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||||
|
|
||||||
|
sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
|
||||||
|
|
||||||
|
(void) src1;
|
||||||
|
(void) dst;
|
||||||
|
(void) src1_dd;
|
||||||
|
}
|
||||||
|
|
||||||
inline void ggml_cuda_op_norm(
|
inline void ggml_cuda_op_norm(
|
||||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
||||||
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
|
||||||
@ -6514,7 +6558,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
|||||||
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
|
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
|
||||||
GGML_ASSERT(to_fp16_cuda != nullptr);
|
GGML_ASSERT(to_fp16_cuda != nullptr);
|
||||||
size_t ne = row_diff*ne00;
|
size_t ne = row_diff*ne00;
|
||||||
src0_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &src0_as, id, stream);
|
src0_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src0_as);
|
||||||
to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
|
to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
|
||||||
}
|
}
|
||||||
const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
|
const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16;
|
||||||
@ -6525,12 +6569,12 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
|||||||
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
|
||||||
GGML_ASSERT(to_fp16_cuda != nullptr);
|
GGML_ASSERT(to_fp16_cuda != nullptr);
|
||||||
size_t ne = src1_ncols*ne10;
|
size_t ne = src1_ncols*ne10;
|
||||||
src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &src1_as, id, stream);
|
src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &src1_as);
|
||||||
to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
|
to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
|
||||||
}
|
}
|
||||||
const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
|
const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16;
|
||||||
size_t dst_f16_as = 0;
|
size_t dst_as = 0;
|
||||||
half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(row_diff*src1_ncols * sizeof(half), &dst_f16_as, id, stream);
|
half * dst_f16 = (half *) ggml_cuda_pool_malloc(row_diff*src1_ncols * sizeof(half), &dst_as);
|
||||||
|
|
||||||
const half alpha_f16 = 1.0f;
|
const half alpha_f16 = 1.0f;
|
||||||
const half beta_f16 = 0.0f;
|
const half beta_f16 = 0.0f;
|
||||||
@ -6548,15 +6592,14 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
|||||||
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
||||||
to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
|
to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
|
||||||
|
|
||||||
if (dst_f16_as != 0) {
|
ggml_cuda_pool_free(dst_f16, dst_as);
|
||||||
ggml_cuda_pool_free_async(dst_f16, dst_f16_as, id, stream);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (src0_as != 0) {
|
if (src0_as != 0) {
|
||||||
ggml_cuda_pool_free_async(src0_as_f16, src0_as, id, stream);
|
ggml_cuda_pool_free(src0_as_f16, src0_as);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (src1_as != 0) {
|
if (src1_as != 0) {
|
||||||
ggml_cuda_pool_free_async(src1_as_f16, src1_as, id, stream);
|
ggml_cuda_pool_free(src1_as_f16, src1_as);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
@ -6566,7 +6609,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
|||||||
if (src0->type != GGML_TYPE_F32) {
|
if (src0->type != GGML_TYPE_F32) {
|
||||||
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
|
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
|
||||||
GGML_ASSERT(to_fp32_cuda != nullptr);
|
GGML_ASSERT(to_fp32_cuda != nullptr);
|
||||||
src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc_async(row_diff*ne00 * sizeof(float), &src0_as, id, stream); // NOLINT
|
src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
|
||||||
to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
|
to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
|
||||||
}
|
}
|
||||||
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
|
const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
|
||||||
@ -6583,7 +6626,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
|||||||
&beta, dst_dd_i, ldc));
|
&beta, dst_dd_i, ldc));
|
||||||
|
|
||||||
if (src0_as != 0) {
|
if (src0_as != 0) {
|
||||||
ggml_cuda_pool_free_async(src0_ddq_as_f32, src0_as, id, stream);
|
ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -7008,6 +7051,8 @@ static void ggml_cuda_op_mul_mat(
|
|||||||
int64_t row_low[GGML_CUDA_MAX_DEVICES];
|
int64_t row_low[GGML_CUDA_MAX_DEVICES];
|
||||||
int64_t row_high[GGML_CUDA_MAX_DEVICES];
|
int64_t row_high[GGML_CUDA_MAX_DEVICES];
|
||||||
|
|
||||||
|
int used_devices = 0;
|
||||||
|
|
||||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||||
// by default, use all rows
|
// by default, use all rows
|
||||||
row_low[id] = 0;
|
row_low[id] = 0;
|
||||||
@ -7035,6 +7080,8 @@ static void ggml_cuda_op_mul_mat(
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
used_devices++;
|
||||||
|
|
||||||
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
|
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
|
||||||
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
|
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
|
||||||
|
|
||||||
@ -7045,22 +7092,21 @@ static void ggml_cuda_op_mul_mat(
|
|||||||
src0_dd[id] = (char *) src0_extra->data_device[id];
|
src0_dd[id] = (char *) src0_extra->data_device[id];
|
||||||
} else {
|
} else {
|
||||||
const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
|
const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
|
||||||
src0_dd[id] = (char *) ggml_cuda_pool_malloc_async(ggml_nbytes(src0), &src0_as[id], id, stream);
|
src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (src1_on_device && src1_is_contiguous) {
|
if (src1_on_device && src1_is_contiguous) {
|
||||||
src1_ddf[id] = (float *) src1_extra->data_device[id];
|
src1_ddf[id] = (float *) src1_extra->data_device[id];
|
||||||
} else {
|
} else {
|
||||||
src1_ddf[id] = (float *) ggml_cuda_pool_malloc_async(ggml_nbytes(src1), &src1_asf[id], id, stream);
|
src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (convert_src1_to_q8_1) {
|
if (convert_src1_to_q8_1) {
|
||||||
const size_t size_dst_ddq = nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs;
|
src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
|
||||||
src1_ddq[id] = (char *) ggml_cuda_pool_malloc_async(size_dst_ddq, &src1_asq[id], id, stream);
|
|
||||||
|
|
||||||
if (src1_on_device && src1_is_contiguous) {
|
if (src1_on_device && src1_is_contiguous) {
|
||||||
quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
|
quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
|
||||||
// CUDA_CHECK(cudaGetLastError());
|
CUDA_CHECK(cudaGetLastError());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -7068,18 +7114,18 @@ static void ggml_cuda_op_mul_mat(
|
|||||||
dst_dd[id] = (float *) dst_extra->data_device[id];
|
dst_dd[id] = (float *) dst_extra->data_device[id];
|
||||||
} else {
|
} else {
|
||||||
const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
|
const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
|
||||||
dst_dd[id] = (float *) ggml_cuda_pool_malloc_async(size_dst_ddf, &dst_as[id], id, stream);
|
dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// if multiple devices are used they need to wait for the main device
|
// if multiple devices are used they need to wait for the main device
|
||||||
// here an event is recorded that signals that the main device has finished calculating the input data
|
// here an event is recorded that signals that the main device has finished calculating the input data
|
||||||
if (split && g_device_count > 1) {
|
if (split && used_devices > 1) {
|
||||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
||||||
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
|
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
|
||||||
}
|
}
|
||||||
|
|
||||||
const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
|
const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
|
||||||
for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
|
for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
|
||||||
const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
|
const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
|
||||||
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
|
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
|
||||||
@ -7194,6 +7240,27 @@ static void ggml_cuda_op_mul_mat(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||||
|
if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
CUDA_CHECK(ggml_cuda_set_device(id));
|
||||||
|
|
||||||
|
// free buffers again when done
|
||||||
|
if (src0_as[id] > 0) {
|
||||||
|
ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
|
||||||
|
}
|
||||||
|
if (src1_asf[id] > 0) {
|
||||||
|
ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
|
||||||
|
}
|
||||||
|
if (src1_asq[id] > 0) {
|
||||||
|
ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
|
||||||
|
}
|
||||||
|
if (dst_as[id] > 0) {
|
||||||
|
ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// main device waits for all other devices to be finished
|
// main device waits for all other devices to be finished
|
||||||
if (split && g_device_count > 1) {
|
if (split && g_device_count > 1) {
|
||||||
int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
|
int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
|
||||||
@ -7201,6 +7268,9 @@ static void ggml_cuda_op_mul_mat(
|
|||||||
|
|
||||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
||||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||||
|
if (row_low[id] == row_high[id]) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
for (int64_t is = 0; is < is_max; ++is) {
|
for (int64_t is = 0; is < is_max; ++is) {
|
||||||
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
|
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
|
||||||
}
|
}
|
||||||
@ -7211,21 +7281,6 @@ static void ggml_cuda_op_mul_mat(
|
|||||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
||||||
CUDA_CHECK(cudaDeviceSynchronize());
|
CUDA_CHECK(cudaDeviceSynchronize());
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
|
||||||
if (src0_as[id] > 0) {
|
|
||||||
ggml_cuda_pool_free_async(src0_dd[id], src0_as[id], id, g_cudaStreams[id][0]);
|
|
||||||
}
|
|
||||||
if (src1_asf[id] > 0) {
|
|
||||||
ggml_cuda_pool_free_async(src1_ddf[id], src1_asf[id], id, g_cudaStreams[id][0]);
|
|
||||||
}
|
|
||||||
if (src1_asq[id] > 0) {
|
|
||||||
ggml_cuda_pool_free_async(src1_ddq[id], src1_asq[id], id, g_cudaStreams[id][0]);
|
|
||||||
}
|
|
||||||
if (dst_as[id] > 0) {
|
|
||||||
ggml_cuda_pool_free_async(dst_dd[id], dst_as[id], id, g_cudaStreams[id][0]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
@ -7252,6 +7307,14 @@ static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, g
|
|||||||
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
|
||||||
|
}
|
||||||
|
|
||||||
static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
|
||||||
}
|
}
|
||||||
@ -7261,6 +7324,8 @@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
||||||
|
if (!g_cublas_loaded) return false;
|
||||||
|
|
||||||
const int64_t ne10 = src1->ne[0];
|
const int64_t ne10 = src1->ne[0];
|
||||||
|
|
||||||
const int64_t ne0 = dst->ne[0];
|
const int64_t ne0 = dst->ne[0];
|
||||||
@ -7412,11 +7477,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||||||
GGML_ASSERT(to_fp16_cuda != nullptr);
|
GGML_ASSERT(to_fp16_cuda != nullptr);
|
||||||
|
|
||||||
size_t src1_as = 0;
|
size_t src1_as = 0;
|
||||||
half * src1_as_f16 = (half *) ggml_cuda_pool_malloc_async(ne1 * sizeof(half), &src1_as, id, main_stream);
|
half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
|
||||||
to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
|
to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
|
||||||
|
|
||||||
size_t dst_as = 0;
|
size_t dst_as = 0;
|
||||||
half * dst_f16 = (half *) ggml_cuda_pool_malloc_async(ne * sizeof(half), &dst_as, id, main_stream);
|
half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
|
||||||
|
|
||||||
GGML_ASSERT(ne12 % ne02 == 0);
|
GGML_ASSERT(ne12 % ne02 == 0);
|
||||||
GGML_ASSERT(ne13 % ne03 == 0);
|
GGML_ASSERT(ne13 % ne03 == 0);
|
||||||
@ -7470,8 +7535,8 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||||||
size_t ptrs_src_s = 0;
|
size_t ptrs_src_s = 0;
|
||||||
size_t ptrs_dst_s = 0;
|
size_t ptrs_dst_s = 0;
|
||||||
|
|
||||||
ptrs_src = (const void **) ggml_cuda_pool_malloc_async(2*ne23*sizeof(void *), &ptrs_src_s, id, main_stream);
|
ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
|
||||||
ptrs_dst = ( void **) ggml_cuda_pool_malloc_async(1*ne23*sizeof(void *), &ptrs_dst_s, id, main_stream);
|
ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
|
||||||
|
|
||||||
dim3 block_dims(ne13, ne12);
|
dim3 block_dims(ne13, ne12);
|
||||||
k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
|
k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>(
|
||||||
@ -7484,6 +7549,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||||||
dst->nb[2], dst->nb[3],
|
dst->nb[2], dst->nb[3],
|
||||||
r2, r3);
|
r2, r3);
|
||||||
CUDA_CHECK(cudaGetLastError());
|
CUDA_CHECK(cudaGetLastError());
|
||||||
|
|
||||||
CUBLAS_CHECK(
|
CUBLAS_CHECK(
|
||||||
cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N,
|
||||||
ne01, ne11, ne10,
|
ne01, ne11, ne10,
|
||||||
@ -7495,30 +7561,29 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
|||||||
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
|
||||||
|
|
||||||
if (ptrs_src_s != 0) {
|
if (ptrs_src_s != 0) {
|
||||||
ggml_cuda_pool_free_async(ptrs_src, ptrs_src_s, id, main_stream);
|
ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
|
||||||
}
|
}
|
||||||
if (ptrs_dst_s != 0) {
|
if (ptrs_dst_s != 0) {
|
||||||
ggml_cuda_pool_free_async(ptrs_dst, ptrs_dst_s, id, main_stream);
|
ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
|
||||||
to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
|
to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
|
||||||
if (src1_as != 0) {
|
|
||||||
ggml_cuda_pool_free_async(src1_as_f16, src1_as, id, main_stream);
|
ggml_cuda_pool_free(src1_as_f16, src1_as);
|
||||||
}
|
ggml_cuda_pool_free(dst_f16, dst_as);
|
||||||
if (dst_as != 0) {
|
|
||||||
ggml_cuda_pool_free_async(dst_f16, dst_as, id, main_stream);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
const bool all_on_device =
|
const bool all_on_device =
|
||||||
(src0->backend == GGML_BACKEND_GPU) &&
|
(src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
|
||||||
(src1->backend == GGML_BACKEND_GPU) &&
|
(src1->backend == GGML_BACKEND_GPU) &&
|
||||||
( dst->backend == GGML_BACKEND_GPU);
|
( dst->backend == GGML_BACKEND_GPU);
|
||||||
|
|
||||||
|
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
|
||||||
|
|
||||||
int64_t min_compute_capability = INT_MAX;
|
int64_t min_compute_capability = INT_MAX;
|
||||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||||
if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
|
if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
|
||||||
@ -7540,13 +7605,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|||||||
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
|
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
|
||||||
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
|
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
|
||||||
|
|
||||||
if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
||||||
// KQ single-batch
|
// KQ single-batch
|
||||||
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
|
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
|
||||||
} else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
} else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
||||||
// KQV single-batch
|
// KQV single-batch
|
||||||
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
||||||
} else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
|
} else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
|
||||||
// KQ + KQV multi-batch
|
// KQ + KQV multi-batch
|
||||||
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
|
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
|
||||||
} else if (src0->type == GGML_TYPE_F32) {
|
} else if (src0->type == GGML_TYPE_F32) {
|
||||||
@ -7667,7 +7732,7 @@ static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1,
|
|||||||
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
|
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -7782,11 +7847,11 @@ static size_t g_temp_tensor_extra_index = 0;
|
|||||||
|
|
||||||
static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
||||||
if (g_temp_tensor_extras == nullptr) {
|
if (g_temp_tensor_extras == nullptr) {
|
||||||
g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_DEFAULT_GRAPH_SIZE];
|
g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t alloc_index = g_temp_tensor_extra_index;
|
size_t alloc_index = g_temp_tensor_extra_index;
|
||||||
g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_DEFAULT_GRAPH_SIZE;
|
g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
|
||||||
ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
|
ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
|
||||||
memset(extra, 0, sizeof(*extra));
|
memset(extra, 0, sizeof(*extra));
|
||||||
|
|
||||||
@ -7953,6 +8018,8 @@ void ggml_cuda_free_scratch() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
||||||
|
if (!g_cublas_loaded) return false;
|
||||||
|
|
||||||
ggml_cuda_func_t func;
|
ggml_cuda_func_t func;
|
||||||
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|
||||||
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|
||||||
@ -7995,6 +8062,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|||||||
case GGML_UNARY_OP_SILU:
|
case GGML_UNARY_OP_SILU:
|
||||||
func = ggml_cuda_silu;
|
func = ggml_cuda_silu;
|
||||||
break;
|
break;
|
||||||
|
case GGML_UNARY_OP_RELU:
|
||||||
|
func = ggml_cuda_relu;
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
return false;
|
return false;
|
||||||
} break;
|
} break;
|
||||||
@ -8013,6 +8083,9 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
|
|||||||
case GGML_OP_SCALE:
|
case GGML_OP_SCALE:
|
||||||
func = ggml_cuda_scale;
|
func = ggml_cuda_scale;
|
||||||
break;
|
break;
|
||||||
|
case GGML_OP_SQR:
|
||||||
|
func = ggml_cuda_sqr;
|
||||||
|
break;
|
||||||
case GGML_OP_CLAMP:
|
case GGML_OP_CLAMP:
|
||||||
if (!any_on_device) {
|
if (!any_on_device) {
|
||||||
return false;
|
return false;
|
||||||
@ -8105,11 +8178,11 @@ struct ggml_backend_buffer_context_cuda {
|
|||||||
|
|
||||||
ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
|
||||||
if (temp_tensor_extras == nullptr) {
|
if (temp_tensor_extras == nullptr) {
|
||||||
temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_DEFAULT_GRAPH_SIZE];
|
temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t alloc_index = temp_tensor_extra_index;
|
size_t alloc_index = temp_tensor_extra_index;
|
||||||
temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_DEFAULT_GRAPH_SIZE;
|
temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
|
||||||
ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
|
ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
|
||||||
memset(extra, 0, sizeof(*extra));
|
memset(extra, 0, sizeof(*extra));
|
||||||
|
|
||||||
|
@ -17,7 +17,12 @@ extern "C" {
|
|||||||
|
|
||||||
#define GGML_CUDA_MAX_DEVICES 16
|
#define GGML_CUDA_MAX_DEVICES 16
|
||||||
|
|
||||||
|
// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
|
||||||
GGML_API void ggml_init_cublas(void);
|
GGML_API void ggml_init_cublas(void);
|
||||||
|
|
||||||
|
// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
|
||||||
|
GGML_API bool ggml_cublas_loaded(void);
|
||||||
|
|
||||||
GGML_API void * ggml_cuda_host_malloc(size_t size);
|
GGML_API void * ggml_cuda_host_malloc(size_t size);
|
||||||
GGML_API void ggml_cuda_host_free(void * ptr);
|
GGML_API void ggml_cuda_host_free(void * ptr);
|
||||||
|
|
||||||
|
18
ggml-metal.m
18
ggml-metal.m
@ -346,9 +346,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
||||||
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
|
||||||
if (ctx->device.maxTransferRate != 0) {
|
if (ctx->device.maxTransferRate != 0) {
|
||||||
GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
|
GGML_METAL_LOG_INFO("%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
|
||||||
} else {
|
} else {
|
||||||
GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__);
|
GGML_METAL_LOG_INFO("%s: maxTransferRate = built-in GPU\n", __func__);
|
||||||
}
|
}
|
||||||
@ -541,11 +541,11 @@ bool ggml_metal_add_buffer(
|
|||||||
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
||||||
|
|
||||||
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
||||||
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
|
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1e6);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
|
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1e6);
|
||||||
|
|
||||||
++ctx->n_buffers;
|
++ctx->n_buffers;
|
||||||
} else {
|
} else {
|
||||||
@ -565,11 +565,11 @@ bool ggml_metal_add_buffer(
|
|||||||
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
||||||
|
|
||||||
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
||||||
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
|
GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1e6);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
|
GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1e6, i);
|
||||||
if (i + size_step < size) {
|
if (i + size_step < size) {
|
||||||
GGML_METAL_LOG_INFO("\n");
|
GGML_METAL_LOG_INFO("\n");
|
||||||
}
|
}
|
||||||
@ -580,8 +580,8 @@ bool ggml_metal_add_buffer(
|
|||||||
|
|
||||||
#if TARGET_OS_OSX
|
#if TARGET_OS_OSX
|
||||||
GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
|
GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
|
||||||
ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
|
ctx->device.currentAllocatedSize / 1e6,
|
||||||
ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
ctx->device.recommendedMaxWorkingSetSize / 1e6);
|
||||||
|
|
||||||
if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
|
if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
|
||||||
GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
|
GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
|
||||||
@ -589,7 +589,7 @@ bool ggml_metal_add_buffer(
|
|||||||
GGML_METAL_LOG_INFO("\n");
|
GGML_METAL_LOG_INFO("\n");
|
||||||
}
|
}
|
||||||
#else
|
#else
|
||||||
GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
|
GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1e6);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1368,7 +1368,12 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
|
|||||||
float max = x[0];
|
float max = x[0];
|
||||||
float sum_w = weights[0];
|
float sum_w = weights[0];
|
||||||
float sum_x = sum_w * x[0];
|
float sum_x = sum_w * x[0];
|
||||||
|
#ifdef HAVE_BUGGY_APPLE_LINKER
|
||||||
|
// use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
|
||||||
|
for (volatile int i = 1; i < n; ++i) {
|
||||||
|
#else
|
||||||
for (int i = 1; i < n; ++i) {
|
for (int i = 1; i < n; ++i) {
|
||||||
|
#endif
|
||||||
if (x[i] < min) min = x[i];
|
if (x[i] < min) min = x[i];
|
||||||
if (x[i] > max) max = x[i];
|
if (x[i] > max) max = x[i];
|
||||||
float w = weights[i];
|
float w = weights[i];
|
||||||
|
341
ggml.c
341
ggml.c
@ -5024,8 +5024,13 @@ struct ggml_tensor * ggml_rope_back(
|
|||||||
int n_dims,
|
int n_dims,
|
||||||
int mode,
|
int mode,
|
||||||
int n_ctx,
|
int n_ctx,
|
||||||
|
int n_orig_ctx,
|
||||||
float freq_base,
|
float freq_base,
|
||||||
float freq_scale,
|
float freq_scale,
|
||||||
|
float ext_factor,
|
||||||
|
float attn_factor,
|
||||||
|
float beta_fast,
|
||||||
|
float beta_slow,
|
||||||
float xpos_base,
|
float xpos_base,
|
||||||
bool xpos_down) {
|
bool xpos_down) {
|
||||||
GGML_ASSERT(ggml_is_vector(b));
|
GGML_ASSERT(ggml_is_vector(b));
|
||||||
@ -5042,11 +5047,15 @@ struct ggml_tensor * ggml_rope_back(
|
|||||||
|
|
||||||
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
|
struct ggml_tensor * result = ggml_dup_tensor(ctx, a);
|
||||||
|
|
||||||
int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx };
|
int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
|
||||||
memcpy(params + 4, &freq_base, sizeof(float));
|
memcpy(params + 5, &freq_base, sizeof(float));
|
||||||
memcpy(params + 5, &freq_scale, sizeof(float));
|
memcpy(params + 6, &freq_scale, sizeof(float));
|
||||||
memcpy(params + 6, &xpos_base, sizeof(float));
|
memcpy(params + 7, &ext_factor, sizeof(float));
|
||||||
memcpy(params + 7, &xpos_down, sizeof(bool));
|
memcpy(params + 8, &attn_factor, sizeof(float));
|
||||||
|
memcpy(params + 9, &beta_fast, sizeof(float));
|
||||||
|
memcpy(params + 10, &beta_slow, sizeof(float));
|
||||||
|
memcpy(params + 11, &xpos_base, sizeof(float));
|
||||||
|
memcpy(params + 12, &xpos_down, sizeof(bool));
|
||||||
ggml_set_op_params(result, params, sizeof(params));
|
ggml_set_op_params(result, params, sizeof(params));
|
||||||
|
|
||||||
result->op = GGML_OP_ROPE_BACK;
|
result->op = GGML_OP_ROPE_BACK;
|
||||||
@ -9376,7 +9385,6 @@ static bool ggml_compute_forward_mul_mat_use_blas(
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
static void ggml_compute_forward_mul_mat(
|
static void ggml_compute_forward_mul_mat(
|
||||||
const struct ggml_compute_params * params,
|
const struct ggml_compute_params * params,
|
||||||
const struct ggml_tensor * src0,
|
const struct ggml_tensor * src0,
|
||||||
@ -10946,7 +10954,8 @@ static void ggml_compute_forward_rope_f32(
|
|||||||
const struct ggml_compute_params * params,
|
const struct ggml_compute_params * params,
|
||||||
const struct ggml_tensor * src0,
|
const struct ggml_tensor * src0,
|
||||||
const struct ggml_tensor * src1,
|
const struct ggml_tensor * src1,
|
||||||
struct ggml_tensor * dst) {
|
struct ggml_tensor * dst,
|
||||||
|
const bool forward) {
|
||||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -11005,6 +11014,11 @@ static void ggml_compute_forward_rope_f32(
|
|||||||
const bool is_neox = mode & 2;
|
const bool is_neox = mode & 2;
|
||||||
const bool is_glm = mode & 4;
|
const bool is_glm = mode & 4;
|
||||||
|
|
||||||
|
// backward process uses inverse rotation by cos and sin.
|
||||||
|
// cos and sin build a rotation matrix, where the inverse is the transpose.
|
||||||
|
// this essentially just switches the sign of sin.
|
||||||
|
const float sin_sign = forward ? 1.0f : -1.0f;
|
||||||
|
|
||||||
const int32_t * pos = (const int32_t *) src1->data;
|
const int32_t * pos = (const int32_t *) src1->data;
|
||||||
|
|
||||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||||
@ -11021,9 +11035,9 @@ static void ggml_compute_forward_rope_f32(
|
|||||||
float block_theta = MAX(p - (n_ctx - 2), 0);
|
float block_theta = MAX(p - (n_ctx - 2), 0);
|
||||||
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
|
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
|
||||||
const float cos_theta = cosf(theta_base);
|
const float cos_theta = cosf(theta_base);
|
||||||
const float sin_theta = sinf(theta_base);
|
const float sin_theta = sinf(theta_base) * sin_sign;
|
||||||
const float cos_block_theta = cosf(block_theta);
|
const float cos_block_theta = cosf(block_theta);
|
||||||
const float sin_block_theta = sinf(block_theta);
|
const float sin_block_theta = sinf(block_theta) * sin_sign;
|
||||||
|
|
||||||
theta_base *= theta_scale;
|
theta_base *= theta_scale;
|
||||||
block_theta *= theta_scale;
|
block_theta *= theta_scale;
|
||||||
@ -11047,6 +11061,7 @@ static void ggml_compute_forward_rope_f32(
|
|||||||
rope_yarn(
|
rope_yarn(
|
||||||
theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
|
theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
|
||||||
);
|
);
|
||||||
|
sin_theta *= sin_sign;
|
||||||
|
|
||||||
// zeta scaling for xPos only:
|
// zeta scaling for xPos only:
|
||||||
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
|
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
|
||||||
@ -11077,6 +11092,7 @@ static void ggml_compute_forward_rope_f32(
|
|||||||
theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
|
theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
|
||||||
&cos_theta, &sin_theta
|
&cos_theta, &sin_theta
|
||||||
);
|
);
|
||||||
|
sin_theta *= sin_sign;
|
||||||
|
|
||||||
theta_base *= theta_scale;
|
theta_base *= theta_scale;
|
||||||
|
|
||||||
@ -11102,7 +11118,8 @@ static void ggml_compute_forward_rope_f16(
|
|||||||
const struct ggml_compute_params * params,
|
const struct ggml_compute_params * params,
|
||||||
const struct ggml_tensor * src0,
|
const struct ggml_tensor * src0,
|
||||||
const struct ggml_tensor * src1,
|
const struct ggml_tensor * src1,
|
||||||
struct ggml_tensor * dst) {
|
struct ggml_tensor * dst,
|
||||||
|
const bool forward) {
|
||||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -11154,6 +11171,11 @@ static void ggml_compute_forward_rope_f16(
|
|||||||
const bool is_neox = mode & 2;
|
const bool is_neox = mode & 2;
|
||||||
const bool is_glm = mode & 4;
|
const bool is_glm = mode & 4;
|
||||||
|
|
||||||
|
// backward process uses inverse rotation by cos and sin.
|
||||||
|
// cos and sin build a rotation matrix, where the inverse is the transpose.
|
||||||
|
// this essentially just switches the sign of sin.
|
||||||
|
const float sin_sign = forward ? 1.0f : -1.0f;
|
||||||
|
|
||||||
const int32_t * pos = (const int32_t *) src1->data;
|
const int32_t * pos = (const int32_t *) src1->data;
|
||||||
|
|
||||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
||||||
@ -11170,9 +11192,9 @@ static void ggml_compute_forward_rope_f16(
|
|||||||
float block_theta = MAX(p - (n_ctx - 2), 0);
|
float block_theta = MAX(p - (n_ctx - 2), 0);
|
||||||
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
|
for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
|
||||||
const float cos_theta = cosf(theta_base);
|
const float cos_theta = cosf(theta_base);
|
||||||
const float sin_theta = sinf(theta_base);
|
const float sin_theta = sinf(theta_base) * sin_sign;
|
||||||
const float cos_block_theta = cosf(block_theta);
|
const float cos_block_theta = cosf(block_theta);
|
||||||
const float sin_block_theta = sinf(block_theta);
|
const float sin_block_theta = sinf(block_theta) * sin_sign;
|
||||||
|
|
||||||
theta_base *= theta_scale;
|
theta_base *= theta_scale;
|
||||||
block_theta *= theta_scale;
|
block_theta *= theta_scale;
|
||||||
@ -11196,6 +11218,7 @@ static void ggml_compute_forward_rope_f16(
|
|||||||
rope_yarn(
|
rope_yarn(
|
||||||
theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
|
theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
|
||||||
);
|
);
|
||||||
|
sin_theta *= sin_sign;
|
||||||
|
|
||||||
theta_base *= theta_scale;
|
theta_base *= theta_scale;
|
||||||
|
|
||||||
@ -11222,6 +11245,7 @@ static void ggml_compute_forward_rope_f16(
|
|||||||
theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
|
theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
|
||||||
&cos_theta, &sin_theta
|
&cos_theta, &sin_theta
|
||||||
);
|
);
|
||||||
|
sin_theta *= sin_sign;
|
||||||
|
|
||||||
theta_base *= theta_scale;
|
theta_base *= theta_scale;
|
||||||
|
|
||||||
@ -11251,11 +11275,11 @@ static void ggml_compute_forward_rope(
|
|||||||
switch (src0->type) {
|
switch (src0->type) {
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
{
|
{
|
||||||
ggml_compute_forward_rope_f16(params, src0, src1, dst);
|
ggml_compute_forward_rope_f16(params, src0, src1, dst, true);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
{
|
{
|
||||||
ggml_compute_forward_rope_f32(params, src0, src1, dst);
|
ggml_compute_forward_rope_f32(params, src0, src1, dst, true);
|
||||||
} break;
|
} break;
|
||||||
default:
|
default:
|
||||||
{
|
{
|
||||||
@ -11266,216 +11290,6 @@ static void ggml_compute_forward_rope(
|
|||||||
|
|
||||||
// ggml_compute_forward_rope_back
|
// ggml_compute_forward_rope_back
|
||||||
|
|
||||||
static void ggml_compute_forward_rope_back_f32(
|
|
||||||
const struct ggml_compute_params * params,
|
|
||||||
const struct ggml_tensor * src0,
|
|
||||||
const struct ggml_tensor * src1,
|
|
||||||
struct ggml_tensor * dst) {
|
|
||||||
|
|
||||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// y = rope(x, src1)
|
|
||||||
// dx = rope_back(dy, src1)
|
|
||||||
// src0 is dy, src1 contains options
|
|
||||||
|
|
||||||
float freq_base;
|
|
||||||
float freq_scale;
|
|
||||||
|
|
||||||
// these two only relevant for xPos RoPE:
|
|
||||||
float xpos_base;
|
|
||||||
bool xpos_down;
|
|
||||||
|
|
||||||
//const int n_past = ((int32_t *) dst->op_params)[0];
|
|
||||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
|
||||||
const int mode = ((int32_t *) dst->op_params)[2];
|
|
||||||
const int n_ctx = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
|
|
||||||
memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));
|
|
||||||
memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
|
|
||||||
memcpy(&xpos_base, (int32_t *) dst->op_params + 6, sizeof(float));
|
|
||||||
memcpy(&xpos_down, (int32_t *) dst->op_params + 7, sizeof(bool));
|
|
||||||
|
|
||||||
GGML_TENSOR_UNARY_OP_LOCALS
|
|
||||||
|
|
||||||
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
|
|
||||||
//printf("n_past = %d, ne2 = %d\n", n_past, ne2);
|
|
||||||
|
|
||||||
assert(nb0 == sizeof(float));
|
|
||||||
|
|
||||||
const int ith = params->ith;
|
|
||||||
const int nth = params->nth;
|
|
||||||
|
|
||||||
const int nr = ggml_nrows(dst);
|
|
||||||
|
|
||||||
// rows per thread
|
|
||||||
const int dr = (nr + nth - 1)/nth;
|
|
||||||
|
|
||||||
// row range for this thread
|
|
||||||
const int ir0 = dr*ith;
|
|
||||||
const int ir1 = MIN(ir0 + dr, nr);
|
|
||||||
|
|
||||||
// row index used to determine which thread to use
|
|
||||||
int ir = 0;
|
|
||||||
|
|
||||||
const float theta_scale = powf(freq_base, -2.0f/n_dims);
|
|
||||||
|
|
||||||
const bool is_neox = mode & 2;
|
|
||||||
|
|
||||||
const int32_t * pos = (const int32_t *) src1->data;
|
|
||||||
|
|
||||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
|
||||||
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
|
||||||
const int64_t p = pos[i2];
|
|
||||||
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
|
||||||
if (ir++ < ir0) continue;
|
|
||||||
if (ir > ir1) break;
|
|
||||||
|
|
||||||
float theta_base = freq_scale * (float)p;
|
|
||||||
|
|
||||||
if (!is_neox) {
|
|
||||||
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
|
||||||
const float cos_theta = cosf(theta_base);
|
|
||||||
const float sin_theta = sinf(theta_base);
|
|
||||||
|
|
||||||
// zeta scaling for xPos only:
|
|
||||||
float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
|
|
||||||
if (xpos_down) zeta = 1.0f / zeta;
|
|
||||||
|
|
||||||
theta_base *= theta_scale;
|
|
||||||
|
|
||||||
const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
|
||||||
float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
|
||||||
|
|
||||||
const float dy0 = dy[0];
|
|
||||||
const float dy1 = dy[1];
|
|
||||||
|
|
||||||
dx[0] = dy0*cos_theta*zeta + dy1*sin_theta*zeta;
|
|
||||||
dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
|
||||||
for (int64_t ic = 0; ic < n_dims; ic += 2) {
|
|
||||||
const float cos_theta = cosf(theta_base);
|
|
||||||
const float sin_theta = sinf(theta_base);
|
|
||||||
|
|
||||||
theta_base *= theta_scale;
|
|
||||||
|
|
||||||
const int64_t i0 = ib*n_dims + ic/2;
|
|
||||||
|
|
||||||
const float * const dy = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
|
||||||
float * dx = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
|
||||||
|
|
||||||
const float dy0 = dy[0];
|
|
||||||
const float dy1 = dy[n_dims/2];
|
|
||||||
|
|
||||||
dx[0] = dy0*cos_theta + dy1*sin_theta;
|
|
||||||
dx[n_dims/2] = - dy0*sin_theta + dy1*cos_theta;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_compute_forward_rope_back_f16(
|
|
||||||
const struct ggml_compute_params * params,
|
|
||||||
const struct ggml_tensor * src0,
|
|
||||||
const struct ggml_tensor * src1,
|
|
||||||
struct ggml_tensor * dst) {
|
|
||||||
|
|
||||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// y = rope(x, src1)
|
|
||||||
// dx = rope_back(dy, src1)
|
|
||||||
// src0 is dy, src1 contains options
|
|
||||||
|
|
||||||
//const int n_past = ((int32_t *) dst->op_params)[0];
|
|
||||||
const int n_dims = ((int32_t *) dst->op_params)[1];
|
|
||||||
const int mode = ((int32_t *) dst->op_params)[2];
|
|
||||||
|
|
||||||
GGML_TENSOR_UNARY_OP_LOCALS
|
|
||||||
|
|
||||||
//printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
|
|
||||||
//printf("n_past = %d, ne2 = %d\n", n_past, ne2);
|
|
||||||
|
|
||||||
assert(nb0 == sizeof(ggml_fp16_t));
|
|
||||||
|
|
||||||
const int ith = params->ith;
|
|
||||||
const int nth = params->nth;
|
|
||||||
|
|
||||||
const int nr = ggml_nrows(dst);
|
|
||||||
|
|
||||||
// rows per thread
|
|
||||||
const int dr = (nr + nth - 1)/nth;
|
|
||||||
|
|
||||||
// row range for this thread
|
|
||||||
const int ir0 = dr*ith;
|
|
||||||
const int ir1 = MIN(ir0 + dr, nr);
|
|
||||||
|
|
||||||
// row index used to determine which thread to use
|
|
||||||
int ir = 0;
|
|
||||||
|
|
||||||
const float theta_scale = powf(10000.0, -2.0f/n_dims);
|
|
||||||
|
|
||||||
const bool is_neox = mode & 2;
|
|
||||||
|
|
||||||
const int32_t * pos = (const int32_t *) src1->data;
|
|
||||||
|
|
||||||
for (int64_t i3 = 0; i3 < ne3; i3++) {
|
|
||||||
for (int64_t i2 = 0; i2 < ne2; i2++) {
|
|
||||||
const int64_t p = pos[i2];
|
|
||||||
for (int64_t i1 = 0; i1 < ne1; i1++) {
|
|
||||||
if (ir++ < ir0) continue;
|
|
||||||
if (ir > ir1) break;
|
|
||||||
|
|
||||||
float theta_base = (float)p;
|
|
||||||
|
|
||||||
if (!is_neox) {
|
|
||||||
for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
|
|
||||||
const float cos_theta = cosf(theta_base);
|
|
||||||
const float sin_theta = sinf(theta_base);
|
|
||||||
|
|
||||||
theta_base *= theta_scale;
|
|
||||||
|
|
||||||
const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
|
||||||
ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
|
||||||
|
|
||||||
const float dy0 = GGML_FP16_TO_FP32(dy[0]);
|
|
||||||
const float dy1 = GGML_FP16_TO_FP32(dy[1]);
|
|
||||||
|
|
||||||
dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
|
|
||||||
dx[1] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
|
||||||
for (int64_t ic = 0; ic < n_dims; ic += 2) {
|
|
||||||
const float cos_theta = cosf(theta_base);
|
|
||||||
const float sin_theta = sinf(theta_base);
|
|
||||||
|
|
||||||
theta_base *= theta_scale;
|
|
||||||
|
|
||||||
const int64_t i0 = ib*n_dims + ic/2;
|
|
||||||
|
|
||||||
const ggml_fp16_t * const dy = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
|
|
||||||
ggml_fp16_t * dx = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
|
|
||||||
|
|
||||||
const float dy0 = GGML_FP16_TO_FP32(dy[0]);
|
|
||||||
const float dy1 = GGML_FP16_TO_FP32(dy[n_dims/2]);
|
|
||||||
|
|
||||||
dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
|
|
||||||
dx[n_dims/2] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void ggml_compute_forward_rope_back(
|
static void ggml_compute_forward_rope_back(
|
||||||
const struct ggml_compute_params * params,
|
const struct ggml_compute_params * params,
|
||||||
const struct ggml_tensor * src0,
|
const struct ggml_tensor * src0,
|
||||||
@ -11484,11 +11298,11 @@ static void ggml_compute_forward_rope_back(
|
|||||||
switch (src0->type) {
|
switch (src0->type) {
|
||||||
case GGML_TYPE_F16:
|
case GGML_TYPE_F16:
|
||||||
{
|
{
|
||||||
ggml_compute_forward_rope_back_f16(params, src0, src1, dst);
|
ggml_compute_forward_rope_f16(params, src0, src1, dst, false);
|
||||||
} break;
|
} break;
|
||||||
case GGML_TYPE_F32:
|
case GGML_TYPE_F32:
|
||||||
{
|
{
|
||||||
ggml_compute_forward_rope_back_f32(params, src0, src1, dst);
|
ggml_compute_forward_rope_f32(params, src0, src1, dst, false);
|
||||||
} break;
|
} break;
|
||||||
default:
|
default:
|
||||||
{
|
{
|
||||||
@ -14923,17 +14737,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|||||||
// necessary for llama
|
// necessary for llama
|
||||||
if (src0->grad) {
|
if (src0->grad) {
|
||||||
//const int n_past = ((int32_t *) tensor->op_params)[0];
|
//const int n_past = ((int32_t *) tensor->op_params)[0];
|
||||||
const int n_dims = ((int32_t *) tensor->op_params)[1];
|
const int n_dims = ((int32_t *) tensor->op_params)[1];
|
||||||
const int mode = ((int32_t *) tensor->op_params)[2];
|
const int mode = ((int32_t *) tensor->op_params)[2];
|
||||||
const int n_ctx = ((int32_t *) tensor->op_params)[3];
|
const int n_ctx = ((int32_t *) tensor->op_params)[3];
|
||||||
float freq_base;
|
const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
|
||||||
float freq_scale;
|
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
|
||||||
float xpos_base;
|
|
||||||
bool xpos_down;
|
memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
|
||||||
memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float));
|
memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
|
||||||
memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
|
memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float));
|
||||||
memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float));
|
memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
|
||||||
memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool));
|
memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
|
||||||
|
memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
|
||||||
|
memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
|
||||||
|
memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
|
||||||
|
|
||||||
src0->grad = ggml_add_or_set(ctx,
|
src0->grad = ggml_add_or_set(ctx,
|
||||||
src0->grad,
|
src0->grad,
|
||||||
@ -14943,8 +14760,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|||||||
n_dims,
|
n_dims,
|
||||||
mode,
|
mode,
|
||||||
n_ctx,
|
n_ctx,
|
||||||
|
n_orig_ctx,
|
||||||
freq_base,
|
freq_base,
|
||||||
freq_scale,
|
freq_scale,
|
||||||
|
ext_factor,
|
||||||
|
attn_factor,
|
||||||
|
beta_fast,
|
||||||
|
beta_slow,
|
||||||
xpos_base,
|
xpos_base,
|
||||||
xpos_down),
|
xpos_down),
|
||||||
zero_table);
|
zero_table);
|
||||||
@ -14954,17 +14776,20 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|||||||
{
|
{
|
||||||
if (src0->grad) {
|
if (src0->grad) {
|
||||||
//const int n_past = ((int32_t *) tensor->op_params)[0];
|
//const int n_past = ((int32_t *) tensor->op_params)[0];
|
||||||
const int n_dims = ((int32_t *) tensor->op_params)[1];
|
const int n_dims = ((int32_t *) tensor->op_params)[1];
|
||||||
const int mode = ((int32_t *) tensor->op_params)[2];
|
const int mode = ((int32_t *) tensor->op_params)[2];
|
||||||
const int n_ctx = ((int32_t *) tensor->op_params)[3];
|
const int n_ctx = ((int32_t *) tensor->op_params)[3];
|
||||||
float freq_base;
|
const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
|
||||||
float freq_scale;
|
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
|
||||||
float xpos_base;
|
|
||||||
bool xpos_down;
|
memcpy(&freq_base, (int32_t *) tensor->op_params + 5, sizeof(float));
|
||||||
memcpy(&freq_base, (int32_t *) tensor->op_params + 4, sizeof(float));
|
memcpy(&freq_scale, (int32_t *) tensor->op_params + 6, sizeof(float));
|
||||||
memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
|
memcpy(&ext_factor, (int32_t *) tensor->op_params + 7, sizeof(float));
|
||||||
memcpy(&xpos_base, (int32_t *) tensor->op_params + 6, sizeof(float));
|
memcpy(&attn_factor, (int32_t *) tensor->op_params + 8, sizeof(float));
|
||||||
memcpy(&xpos_down, (int32_t *) tensor->op_params + 7, sizeof(bool));
|
memcpy(&beta_fast, (int32_t *) tensor->op_params + 9, sizeof(float));
|
||||||
|
memcpy(&beta_slow, (int32_t *) tensor->op_params + 10, sizeof(float));
|
||||||
|
memcpy(&xpos_base, (int32_t *) tensor->op_params + 11, sizeof(float));
|
||||||
|
memcpy(&xpos_down, (int32_t *) tensor->op_params + 12, sizeof(bool));
|
||||||
|
|
||||||
src0->grad = ggml_add_or_set(ctx,
|
src0->grad = ggml_add_or_set(ctx,
|
||||||
src0->grad,
|
src0->grad,
|
||||||
@ -14973,14 +14798,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|||||||
src1,
|
src1,
|
||||||
n_dims,
|
n_dims,
|
||||||
mode,
|
mode,
|
||||||
0,
|
|
||||||
n_ctx,
|
n_ctx,
|
||||||
|
n_orig_ctx,
|
||||||
freq_base,
|
freq_base,
|
||||||
freq_scale,
|
freq_scale,
|
||||||
0.0f,
|
ext_factor,
|
||||||
1.0f,
|
attn_factor,
|
||||||
0.0f,
|
beta_fast,
|
||||||
0.0f,
|
beta_slow,
|
||||||
xpos_base,
|
xpos_base,
|
||||||
xpos_down,
|
xpos_down,
|
||||||
false),
|
false),
|
||||||
@ -18248,7 +18073,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||||||
{
|
{
|
||||||
ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
|
ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));
|
||||||
|
|
||||||
for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
|
for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
|
||||||
struct gguf_kv * kv = &ctx->kv[i];
|
struct gguf_kv * kv = &ctx->kv[i];
|
||||||
|
|
||||||
//fprintf(stderr, "%s: reading kv %d\n", __func__, i);
|
//fprintf(stderr, "%s: reading kv %d\n", __func__, i);
|
||||||
@ -18295,7 +18120,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||||||
case GGUF_TYPE_STRING:
|
case GGUF_TYPE_STRING:
|
||||||
{
|
{
|
||||||
kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
|
kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
|
||||||
for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
|
for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
|
||||||
ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
|
ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
@ -18323,7 +18148,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||||||
{
|
{
|
||||||
ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
|
ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));
|
||||||
|
|
||||||
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
|
||||||
struct gguf_tensor_info * info = &ctx->infos[i];
|
struct gguf_tensor_info * info = &ctx->infos[i];
|
||||||
|
|
||||||
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
|
for (int j = 0; j < GGML_MAX_DIMS; ++j) {
|
||||||
@ -18370,7 +18195,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||||||
// compute the total size of the data section, taking into account the alignment
|
// compute the total size of the data section, taking into account the alignment
|
||||||
{
|
{
|
||||||
ctx->size = 0;
|
ctx->size = 0;
|
||||||
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
|
||||||
struct gguf_tensor_info * info = &ctx->infos[i];
|
struct gguf_tensor_info * info = &ctx->infos[i];
|
||||||
|
|
||||||
const int64_t ne =
|
const int64_t ne =
|
||||||
@ -18439,7 +18264,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|||||||
ggml_set_no_alloc(ctx_data, true);
|
ggml_set_no_alloc(ctx_data, true);
|
||||||
|
|
||||||
// create the tensors
|
// create the tensors
|
||||||
for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
|
for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
|
||||||
const int64_t ne[GGML_MAX_DIMS] = {
|
const int64_t ne[GGML_MAX_DIMS] = {
|
||||||
ctx->infos[i].ne[0],
|
ctx->infos[i].ne[0],
|
||||||
ctx->infos[i].ne[1],
|
ctx->infos[i].ne[1],
|
||||||
|
5
ggml.h
5
ggml.h
@ -1371,8 +1371,13 @@ extern "C" {
|
|||||||
int n_dims,
|
int n_dims,
|
||||||
int mode,
|
int mode,
|
||||||
int n_ctx,
|
int n_ctx,
|
||||||
|
int n_orig_ctx,
|
||||||
float freq_base,
|
float freq_base,
|
||||||
float freq_scale,
|
float freq_scale,
|
||||||
|
float ext_factor,
|
||||||
|
float attn_factor,
|
||||||
|
float beta_fast,
|
||||||
|
float beta_slow,
|
||||||
float xpos_base,
|
float xpos_base,
|
||||||
bool xpos_down);
|
bool xpos_down);
|
||||||
|
|
||||||
|
@ -39,19 +39,19 @@ https://huggingface.co/ggerganov/whisper.cpp/tree/main
|
|||||||
|
|
||||||
## Available models
|
## Available models
|
||||||
|
|
||||||
| Model | Disk | Mem | SHA |
|
| Model | Disk | SHA |
|
||||||
| --- | --- | --- | --- |
|
| --- | --- | --- |
|
||||||
| tiny | 75 MB | ~390 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
|
| tiny | 75 MiB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
|
||||||
| tiny.en | 75 MB | ~390 MB | `c78c86eb1a8faa21b369bcd33207cc90d64ae9df` |
|
| tiny.en | 75 MiB | `c78c86eb1a8faa21b369bcd33207cc90d64ae9df` |
|
||||||
| base | 142 MB | ~500 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
|
| base | 142 MiB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
|
||||||
| base.en | 142 MB | ~500 MB | `137c40403d78fd54d454da0f9bd998f78703390c` |
|
| base.en | 142 MiB | `137c40403d78fd54d454da0f9bd998f78703390c` |
|
||||||
| small | 466 MB | ~1.0 GB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
|
| small | 466 MiB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
|
||||||
| small.en | 466 MB | ~1.0 GB | `db8a495a91d927739e50b3fc1cc4c6b8f6c2d022` |
|
| small.en | 466 MiB | `db8a495a91d927739e50b3fc1cc4c6b8f6c2d022` |
|
||||||
| medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
|
| medium | 1.5 GiB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
|
||||||
| medium.en | 1.5 GB | ~2.6 GB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` |
|
| medium.en | 1.5 GiB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` |
|
||||||
| large-v1 | 2.9 GB | ~4.7 GB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
|
| large-v1 | 2.9 GiB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
|
||||||
| large-v2 | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
|
| large-v2 | 2.9 GiB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
|
||||||
| large | 2.9 GB | ~4.7 GB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
|
| large-v3 | 2.9 GiB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
|
||||||
|
|
||||||
## Model files for testing purposes
|
## Model files for testing purposes
|
||||||
|
|
||||||
@ -76,3 +76,27 @@ git clone https://huggingface.co/openai/whisper-medium
|
|||||||
# convert the model to ggml
|
# convert the model to ggml
|
||||||
python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-medium/ ./whisper .
|
python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-medium/ ./whisper .
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Distilled models
|
||||||
|
|
||||||
|
Initial support for https://huggingface.co/distil-whisper is available.
|
||||||
|
|
||||||
|
Currently, the chunk-based transcription strategy is not implemented, so there can be sub-optimal quality when using the distilled models with `whisper.cpp`.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# clone OpenAI whisper and whisper.cpp
|
||||||
|
git clone https://github.com/openai/whisper
|
||||||
|
git clone https://github.com/ggerganov/whisper.cpp
|
||||||
|
|
||||||
|
# get the models
|
||||||
|
cd whisper.cpp/models
|
||||||
|
git clone https://huggingface.co/distil-whisper/distil-medium.en
|
||||||
|
git clone https://huggingface.co/distil-whisper/distil-large-v2
|
||||||
|
|
||||||
|
# convert to ggml
|
||||||
|
python3 ./convert-h5-to-ggml.py ./distil-medium.en/ ../../whisper .
|
||||||
|
mv ggml-model.bin ggml-medium.en-distil.bin
|
||||||
|
|
||||||
|
python3 ./convert-h5-to-ggml.py ./distil-large-v2/ ../../whisper .
|
||||||
|
mv ggml-model.bin ggml-large-v2-distil.bin
|
||||||
|
```
|
||||||
|
@ -78,14 +78,14 @@ def convert_hf_whisper(hf_model_name_or_path: str, whisper_state_path: str):
|
|||||||
# Ported from models/convert-whisper-to-coreml.py
|
# Ported from models/convert-whisper-to-coreml.py
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
|
parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2, large-v3)", required=True)
|
||||||
parser.add_argument("--model-path", type=str, help="path to the model (e.g. if published on HuggingFace: Oblivion208/whisper-tiny-cantonese)", required=True)
|
parser.add_argument("--model-path", type=str, help="path to the model (e.g. if published on HuggingFace: Oblivion208/whisper-tiny-cantonese)", required=True)
|
||||||
parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
|
parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
|
||||||
parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
|
parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
|
||||||
parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
|
parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
|
if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large-v1", "large-v2", "large-v3"]:
|
||||||
raise ValueError("Invalid model name")
|
raise ValueError("Invalid model name")
|
||||||
|
|
||||||
pt_target_path = f"models/hf-{args.model_name}.pt"
|
pt_target_path = f"models/hf-{args.model_name}.pt"
|
||||||
|
@ -296,13 +296,13 @@ def convert_decoder(hparams, model, quantize=False):
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
|
parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2, large-v3)", required=True)
|
||||||
parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
|
parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
|
||||||
parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
|
parser.add_argument("--quantize", type=bool, help="quantize weights to F16", default=False)
|
||||||
parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
|
parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "small.en-tdrz", "medium", "medium.en", "large", "large-v1", "large-v2"]:
|
if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "small.en-tdrz", "medium", "medium.en", "large-v1", "large-v2", "large-v3"]:
|
||||||
raise ValueError("Invalid model name")
|
raise ValueError("Invalid model name")
|
||||||
|
|
||||||
whisper = load_model(args.model).cpu()
|
whisper = load_model(args.model).cpu()
|
||||||
|
@ -38,10 +38,10 @@ def convert_encoder(hparams, encoder, mname):
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
|
parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2, large-v3)", required=True)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
|
if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large-v1", "large-v2", "large-v3"]:
|
||||||
raise ValueError("Invalid model name")
|
raise ValueError("Invalid model name")
|
||||||
|
|
||||||
whisper = load_model(args.model).cpu()
|
whisper = load_model(args.model).cpu()
|
||||||
|
@ -19,7 +19,7 @@ function get_script_path() {
|
|||||||
models_path="$(get_script_path)"
|
models_path="$(get_script_path)"
|
||||||
|
|
||||||
# Whisper models
|
# Whisper models
|
||||||
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
|
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )
|
||||||
|
|
||||||
# list available models
|
# list available models
|
||||||
function list_models {
|
function list_models {
|
||||||
|
@ -8,7 +8,7 @@ popd
|
|||||||
set argc=0
|
set argc=0
|
||||||
for %%x in (%*) do set /A argc+=1
|
for %%x in (%*) do set /A argc+=1
|
||||||
|
|
||||||
set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large
|
set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3
|
||||||
|
|
||||||
if %argc% neq 1 (
|
if %argc% neq 1 (
|
||||||
echo.
|
echo.
|
||||||
|
@ -42,7 +42,7 @@ models=(
|
|||||||
"medium.en-q5_0"
|
"medium.en-q5_0"
|
||||||
"large-v1"
|
"large-v1"
|
||||||
"large-v2"
|
"large-v2"
|
||||||
"large"
|
"large-v3"
|
||||||
"large-q5_0"
|
"large-q5_0"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -19,7 +19,7 @@
|
|||||||
cd `dirname $0`
|
cd `dirname $0`
|
||||||
|
|
||||||
# Whisper models
|
# Whisper models
|
||||||
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
|
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )
|
||||||
|
|
||||||
# list available models
|
# list available models
|
||||||
function list_models {
|
function list_models {
|
||||||
|
1221
whisper.cpp
1221
whisper.cpp
File diff suppressed because it is too large
Load Diff
@ -50,7 +50,9 @@ extern "C" {
|
|||||||
//
|
//
|
||||||
// ...
|
// ...
|
||||||
//
|
//
|
||||||
// struct whisper_context * ctx = whisper_init_from_file("/path/to/ggml-base.en.bin");
|
// whisper_context_params cparams = whisper_context_default_params();
|
||||||
|
//
|
||||||
|
// struct whisper_context * ctx = whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
|
||||||
//
|
//
|
||||||
// if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
// if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
||||||
// fprintf(stderr, "failed to process audio\n");
|
// fprintf(stderr, "failed to process audio\n");
|
||||||
@ -78,7 +80,9 @@ extern "C" {
|
|||||||
struct whisper_state;
|
struct whisper_state;
|
||||||
struct whisper_full_params;
|
struct whisper_full_params;
|
||||||
|
|
||||||
typedef int whisper_token;
|
typedef int32_t whisper_pos;
|
||||||
|
typedef int32_t whisper_token;
|
||||||
|
typedef int32_t whisper_seq_id;
|
||||||
|
|
||||||
struct whisper_context_params {
|
struct whisper_context_params {
|
||||||
bool use_gpu;
|
bool use_gpu;
|
||||||
|
Reference in New Issue
Block a user