Compare commits


1 Commit

Author SHA1 Message Date
c6174cb868 wip 2023-05-02 21:47:12 +03:00
26 changed files with 2389 additions and 5269 deletions

View File

@ -236,61 +236,6 @@ jobs:
name: whisper-blas-bin-${{ matrix.arch }}
path: build/bin/${{ matrix.build }}
windows-cublas:
runs-on: windows-latest
strategy:
matrix:
build: [Release]
arch: [x64]
cublas: [ON]
sdl2: [ON]
include:
- arch: x64
s2arc: x64
- sdl2: ON
s2ver: 2.26.0
steps:
- name: Clone
uses: actions/checkout@v1
- name: Add msbuild to PATH
uses: microsoft/setup-msbuild@v1
- name: Install CUDA Toolkit
id: cuda-toolkit
uses: Jimver/cuda-toolkit@v0.2.10
- name: Fetch SDL2 and set SDL2_DIR
if: matrix.sdl2 == 'ON'
run: |
C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
7z x sdl2.zip
echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
- name: Configure
run: >
cmake -S . -B ./build -A ${{ matrix.arch }}
-DCMAKE_BUILD_TYPE=${{ matrix.build }}
-DWHISPER_CUBLAS=1
- name: Build
run: |
cd ./build
msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
- name: Copy SDL2.dll
if: matrix.sdl2 == 'ON'
run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
- name: Upload binaries
if: matrix.sdl2 == 'ON'
uses: actions/upload-artifact@v1
with:
name: whisper-cublas-bin-${{ matrix.arch }}
path: build/bin/${{ matrix.build }}
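For reference, outside of CI the windows-cublas job above boils down to roughly the following local build sequence (a sketch only: it assumes the CUDA Toolkit, MSBuild, and SDL2 are already installed, and substitutes the job's Release/x64 matrix values):
```
# configure with cuBLAS enabled (mirrors the job's Configure step)
cmake -S . -B ./build -A x64 -DCMAKE_BUILD_TYPE=Release -DWHISPER_CUBLAS=1
# build the generated Visual Studio solution (mirrors the job's Build step)
cd ./build
msbuild ALL_BUILD.vcxproj -t:build -p:configuration=Release -p:platform=x64
```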
emscripten:
runs-on: ubuntu-latest
@ -333,9 +278,7 @@ jobs:
uses: actions/checkout@v1
- name: Configure
run: |
cp models/for-tests-ggml-base.en.bin models/ggml-base.en.bin
mkdir models/ggml-base.en-encoder.mlmodelc
run: cp models/for-tests-ggml-base.en.bin models/ggml-base.en.bin
- name: Build objc example
run: xcodebuild -project examples/whisper.objc/whisper.objc.xcodeproj -scheme whisper.objc -configuration ${{ matrix.build }} -sdk iphonesimulator build

View File

@ -1,6 +1,10 @@
cmake_minimum_required (VERSION 3.0)
project(whisper.cpp VERSION 1.4.2)
project(whisper.cpp VERSION 1.4.1)
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
add_compile_options(/utf-8)
endif ()
# Add path to modules
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@ -49,19 +53,17 @@ option(WHISPER_BUILD_EXAMPLES "whisper: build examples" ${WHISPER_STANDA
option(WHISPER_SDL2 "whisper: support for libSDL2" OFF)
option(WHISPER_NO_AVX "whisper: disable AVX" OFF)
option(WHISPER_NO_AVX2 "whisper: disable AVX2" OFF)
option(WHISPER_NO_FMA "whisper: disable FMA" OFF)
option(WHISPER_NO_F16C "whisper: disable F16c" OFF)
if (APPLE)
option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF)
option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
option(WHISPER_NO_AVX "whisper: disable AVX" OFF)
option(WHISPER_NO_AVX2 "whisper: disable AVX2" OFF)
option(WHISPER_NO_FMA "whisper: disable FMA" OFF)
option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
else()
option(WHISPER_OPENBLAS "whisper: support for OpenBLAS" OFF)
option(WHISPER_CUBLAS "whisper: support for cuBLAS" OFF)
option(WHISPER_CLBLAST "whisper: use CLBlast" OFF)
option(WHISPER_CUBLAS "whisper: support for cuBLAS" OFF)
endif()
option(WHISPER_PERF "whisper: enable perf timings" OFF)
@ -121,7 +123,7 @@ if (APPLE)
endif()
if (WHISPER_COREML_ALLOW_FALLBACK)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_COREML_ALLOW_FALLBACK)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_COREML_ALLOW_FALLBACK)
endif()
endif()
endif()
@ -165,21 +167,6 @@ if (WHISPER_CUBLAS)
endif()
endif()
if (WHISPER_CLBLAST)
find_package(CLBlast)
if (CLBlast_FOUND)
message(STATUS "CLBlast found")
set(GGML_OPENCL_SOURCES ggml-opencl.c ggml-opencl.h)
add_compile_definitions(GGML_USE_CLBLAST)
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} clblast)
else()
message(WARNING "CLBlast not found")
endif()
endif()
# compiler flags
if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
@ -222,17 +209,9 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES
else()
message(STATUS "x86 detected")
if (MSVC)
if(NOT WHISPER_NO_AVX2)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
else()
if(NOT WHISPER_NO_AVX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX")
endif()
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
else()
if (EMSCRIPTEN)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
@ -295,7 +274,6 @@ add_library(${TARGET}
ggml.h
ggml.c
${GGML_CUDA_SOURCES}
${GGML_OPENCL_SOURCES}
whisper.h
whisper.cpp
)

View File

@ -171,22 +171,13 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
endif
ifdef WHISPER_CLBLAST
CFLAGS += -DGGML_USE_CLBLAST
LDFLAGS += -lclblast -lOpenCL
WHISPER_OBJ += ggml-opencl.o
ggml-opencl.o: ggml-opencl.c ggml-opencl.h
$(CC) $(CFLAGS) -c $< -o $@
endif
ifdef WHISPER_GPROF
CFLAGS += -pg
CXXFLAGS += -pg
endif
ifneq ($(filter aarch64%,$(UNAME_M)),)
CFLAGS += -mcpu=native
CFLAGS += -mcpu=native
CXXFLAGS += -mcpu=native
endif
@ -197,18 +188,15 @@ endif
ifneq ($(filter armv7%,$(UNAME_M)),)
# 32-bit ARM, for example on Armbian or possibly raspbian
#CFLAGS += -mfpu=neon -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
#CXXFLAGS += -mfpu=neon -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
# 64-bit ARM on 32-bit OS, use these (TODO: auto-detect 64-bit)
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
# 64-bit ARM, use these (TODO: auto-detect 64-bit)
# CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
endif
ifneq ($(filter armv8%,$(UNAME_M)),)
# Raspberry Pi 4
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
CFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif
#
@ -240,7 +228,7 @@ ifndef WHISPER_COREML
WHISPER_OBJ += whisper.o
else
whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
$(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder.mm -o whisper-encoder.o
$(CXX) -O3 -I . -c coreml/whisper-encoder.mm -o whisper-encoder.o
whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-impl.h
$(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder-impl.m -o whisper-encoder-impl.o

View File

@ -6,7 +6,7 @@
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)
Beta: [v1.4.2](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.4.2) / Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
Beta: [v1.4.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.4.1) / Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
@ -20,7 +20,6 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
- Zero memory allocations at runtime
- Runs on the CPU
- [Partial GPU support for NVIDIA via cuBLAS](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
- [Partial OpenCL GPU support via CLBlast](https://github.com/ggerganov/whisper.cpp#opencl-gpu-support-via-clblast)
- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
Supported platforms:
@ -71,8 +70,6 @@ Then, download one of the Whisper models converted in [ggml format](models). For
bash ./models/download-ggml-model.sh base.en
```
If you wish to convert the Whisper models to ggml format yourself, instructions are in [models/README.md](models/README.md).
Now build the [main](examples/main) example and transcribe an audio file like this:
```bash
@ -261,12 +258,6 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
pip install coremltools
```
- To ensure `coremltools` operates correctly, please confirm that [Xcode](https://developer.apple.com/xcode/) is installed and execute `xcode-select --install` to install the command-line tools.
- Python 3.10 is recommended.
- [OPTIONAL] It is recommended to utilize a Python version management system, such as [Miniconda](https://docs.conda.io/en/latest/miniconda.html) for this step:
- To create an environment, use: `conda create -n py310-whisper python=3.10 -y`
- To activate the environment, use: `conda activate py310-whisper`
- Generate a Core ML model. For example, to generate a `base.en` model, use:
```bash
@ -320,29 +311,6 @@ make clean
WHISPER_CUBLAS=1 make -j
```
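The CMake route works as well; a sketch using the `WHISPER_CUBLAS` option declared in CMakeLists.txt (shown in the diff above):
```bash
# configure with cuBLAS enabled, then build
cmake -S . -B build -DWHISPER_CUBLAS=ON
cmake --build build -j --config Release
```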
## OpenCL GPU support via CLBlast
For cards and integrated GPUs that support OpenCL, the Encoder processing can be largely offloaded to the GPU through CLBlast. This is especially useful for users with AMD APUs or low-end devices, giving up to a ~2x speedup.
First, make sure you have installed `CLBlast` for your OS or distribution: https://github.com/CNugteren/CLBlast
Now build `whisper.cpp` with CLBlast support:
```
Makefile:
cd whisper.cpp
make clean
WHISPER_CLBLAST=1 make -j
CMake:
cd whisper.cpp ; mkdir build ; cd build
cmake -DWHISPER_CLBLAST=ON ..
make clean
make -j
cp bin/* ../
```
Run all the examples as usual.
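For example, assuming the `base.en` model downloaded earlier and the bundled sample audio, a minimal transcription run looks like this (sketch):
```bash
# transcribe a sample WAV file with the base.en model
./main -m models/ggml-base.en.bin -f samples/jfk.wav
```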
## Limitations

View File

@ -1,6 +1,6 @@
{
"name": "whisper.cpp",
"version": "1.4.2",
"version": "1.4.1",
"description": "Whisper speech recognition",
"main": "whisper.js",
"scripts": {

View File

@ -1,9 +1,5 @@
#if !__has_feature(objc_arc)
#error This file must be compiled with automatic reference counting enabled (-fobjc-arc)
#endif
#import "whisper-encoder.h"
#import "whisper-encoder-impl.h"
#import "coreml/whisper-encoder.h"
#import "coreml/whisper-encoder-impl.h"
#import <CoreML/CoreML.h>
@ -55,7 +51,15 @@ void whisper_coreml_encode(
whisper_encoder_implOutput * outCoreML = [(__bridge id) ctx->data predictionFromLogmel_data:inMultiArray error:nil];
memcpy(out, outCoreML.output.dataPointer, outCoreML.output.count * sizeof(float));
MLMultiArray * outMA = outCoreML.output;
//NSArray<NSNumber *> * shape = outMA.shape;
//NSArray<NSNumber *> * strides = outMA.strides;
//printf("shape: %ld %ld %ld %ld\n", [shape[0] longValue], [shape[1] longValue], [shape[2] longValue], [shape[3] longValue]);
//printf("strides: %ld %ld %ld %ld\n", [strides[0] longValue], [strides[1] longValue], [strides[2] longValue], [strides[3] longValue]);
memcpy(out, outMA.dataPointer, outMA.count * sizeof(float));
}
#if __cplusplus

View File

@ -6,6 +6,7 @@
static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
{"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
{"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
{"q4_2", GGML_FTYPE_MOSTLY_Q4_2},
{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
@ -45,6 +46,7 @@ bool ggml_common_quantize_0(
switch (ftype) {
case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break;
case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
@ -169,6 +171,10 @@ bool ggml_common_quantize_0(
{
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q4_2:
{
cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q5_0:
{
cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());

View File

@ -38,20 +38,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
} else if (arg == "-h" || arg == "--help") {
gpt_print_usage(argc, argv, params);
exit(0);
} else if (arg == "-f" || arg == "--file") {
if (++i > argc) {
fprintf(stderr, "Invalid file param");
break;
}
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
break;
}
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
if (params.prompt.back() == '\n') {
params.prompt.pop_back();
}
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, params);
@ -71,8 +57,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
fprintf(stderr, " prompt to start generation with (default: random)\n");
fprintf(stderr, " -f FNAME, --file FNAME\n");
fprintf(stderr, " load prompt from a file\n");
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
@ -208,10 +192,6 @@ std::map<std::string, int32_t> json_parse(const std::string & fname) {
return result;
}
void gpt_vocab::add_special_token(const std::string & token) {
special_tokens.push_back(token);
}
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
std::vector<std::string> words;
@ -220,20 +200,6 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
std::string str = text;
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
// Generate the subpattern from the special_tokens vector if it's not empty
if (!vocab.special_tokens.empty()) {
std::string special_tokens_subpattern;
for (const auto & token : vocab.special_tokens) {
if (!special_tokens_subpattern.empty()) {
special_tokens_subpattern += "|";
}
special_tokens_subpattern += token;
}
// Modify the regex pattern with the generated special tokens subpattern
pat = special_tokens_subpattern + "|" + pat;
}
std::regex re(pat);
std::smatch m;

View File

@ -53,9 +53,6 @@ struct gpt_vocab {
std::map<token, id> token_to_id;
std::map<id, token> id_to_token;
std::vector<std::string> special_tokens;
void add_special_token(const std::string & token);
};
// poor-man's JSON parsing

View File

@ -191,7 +191,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "true" : "false");
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "false" : "true");
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
fprintf(stderr, " -dl, --detect-language [%-7s] exit after automatically detecting language\n", params.detect_language ? "true" : "false");
fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str());

View File

@ -25,7 +25,7 @@ struct whisper_hparams {
int32_t n_text_head = 6;
int32_t n_text_layer = 4;
int32_t n_mels = 80;
int32_t ftype = 1;
int32_t f16 = 1;
};
struct whisper_filters {
@ -79,10 +79,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
finp.read((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
finp.read((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
finp.read((char *) &hparams.n_mels, sizeof(hparams.n_mels));
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
finp.read((char *) &hparams.f16, sizeof(hparams.f16));
fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
@ -94,10 +91,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
fprintf(stderr, "%s: ftype (src) = %d\n", __func__, hparams.ftype);
fprintf(stderr, "%s: qntvr (src) = %d\n", __func__, qntvr_src);
fprintf(stderr, "%s: ftype (dst) = %d\n", __func__, ftype_dst);
fprintf(stderr, "%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
@ -109,7 +103,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
fout.write((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
fout.write((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
fout.write((char *) &hparams.n_mels, sizeof(hparams.n_mels));
fout.write((char *) &ftype_dst, sizeof(hparams.ftype));
fout.write((char *) &ftype, sizeof(hparams.f16));
}
// load mel filters

File diff suppressed because it is too large.

View File

@ -19,11 +19,11 @@
# define LLAMA_API
#endif
#define LLAMA_FILE_VERSION 2
#define LLAMA_FILE_VERSION 1
#define LLAMA_FILE_MAGIC 'ggjt'
#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
#define LLAMA_SESSION_MAGIC 'ggsn'
#define LLAMA_SESSION_VERSION 1
#define LLAMA_SESSION_VERSION 0
#ifdef __cplusplus
extern "C" {
@ -54,10 +54,9 @@ extern "C" {
typedef void (*llama_progress_callback)(float progress, void *ctx);
struct llama_context_params {
int n_ctx; // text context
int n_parts; // -1 for default
int n_gpu_layers; // number of layers to store in VRAM
int seed; // RNG seed, -1 for random
int n_ctx; // text context
int n_parts; // -1 for default
int seed; // RNG seed, 0 for random
bool f16_kv; // use fp16 for KV cache
bool logits_all; // the llama_eval() call computes all logits, not just the last one
@ -79,7 +78,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
// LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
@ -123,19 +122,18 @@ extern "C" {
int n_threads);
// Returns the number of tokens in the KV cache
LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
// Sets the current rng seed.
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
// Returns the maximum size in bytes of the state (rng, logits, embedding
// and kv_cache) - will often be smaller after compacting tokens
LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
// Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
// Copies the state to the specified destination address.
// Destination needs to have allocated enough memory.
// Returns the number of bytes copied
LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
// Set the state reading from the specified address
// Returns the number of bytes read
@ -144,7 +142,6 @@ extern "C" {
// Save/load session file
LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
// Run the llama inference to obtain the logits and probabilities for the next token.
// tokens + n_tokens is the provided batch of new tokens to process
// n_past is the number of tokens to use from previous eval calls
@ -168,9 +165,9 @@ extern "C" {
int n_max_tokens,
bool add_bos);
LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
LLAMA_API int llama_n_vocab(struct llama_context * ctx);
LLAMA_API int llama_n_ctx (struct llama_context * ctx);
LLAMA_API int llama_n_embd (struct llama_context * ctx);
// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
@ -184,7 +181,7 @@ extern "C" {
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
// Token Id -> String. Uses the vocabulary in the provided context
LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
// Special tokens
LLAMA_API llama_token llama_token_bos();
@ -194,25 +191,25 @@ extern "C" {
// Sampling functions
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty);
/// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1);
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1);
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.

View File

@ -609,8 +609,8 @@ int main(int argc, char ** argv) {
id = llama_sample_token_greedy(ctx_llama, &candidates_p);
} else {
// Temperature sampling
llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
llama_sample_top_k(ctx_llama, &candidates_p, top_k);
llama_sample_top_p(ctx_llama, &candidates_p, top_p);
llama_sample_temperature(ctx_llama, &candidates_p, temp);
id = llama_sample_token(ctx_llama, &candidates_p);
}

View File

@ -14,24 +14,15 @@ https://user-images.githubusercontent.com/1991296/204126266-ce4177c6-6eca-4bd9-b
```java
git clone https://github.com/ggerganov/whisper.cpp
open whisper.cpp/examples/whisper.objc/whisper.objc.xcodeproj/
// If you don't want to convert a Core ML model, you can skip this step by creating a dummy model
mkdir models/ggml-base.en-encoder.mlmodelc
```
Make sure to build the project in `Release`:
<img width="947" alt="image" src="https://user-images.githubusercontent.com/1991296/197382607-9e1e6d1b-79fa-496f-9d16-b71dc1535701.png">
Also, don't forget to add the `-DGGML_USE_ACCELERATE` compiler flag for `ggml.c` in Build Phases.
Also, don't forget to add the `-DGGML_USE_ACCELERATE` compiler flag in Build Phases.
This can significantly improve the performance of the transcription:
<img width="1072" alt="image" src="https://user-images.githubusercontent.com/1991296/208511239-8d7cdbd1-aa48-41b5-becd-ca288d53cc07.png">
If you want to enable Core ML support, you can add the `-DWHISPER_USE_COREML -DWHISPER_COREML_ALLOW_FALLBACK` compiler flag for `whisper.cpp` in Build Phases:
<img width="1072" alt="image" src="https://github.com/ggerganov/whisper.cpp/assets/3001525/103e8f57-6eb6-490d-a60c-f6cf6c319324">
Then follow the [`Core ML support` section of the readme](../../README.md#core-ml-support) to convert the model.
This project also adds `-O3 -DNDEBUG` to `Other C Flags`, but adding flags at the app-project level is not ideal in the real world (they apply to all C/C++ files); in your own project, consider splitting the xcodeproj into a workspace instead.
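Putting the setup steps above together, one possible pre-build sequence from a fresh checkout (a sketch that skips Core ML conversion and relies on the dummy model directory) is:
```bash
git clone https://github.com/ggerganov/whisper.cpp
cd whisper.cpp
# fetch the base.en ggml model used by the demo
bash ./models/download-ggml-model.sh base.en
# dummy Core ML model directory, so the app builds without a converted model
mkdir models/ggml-base.en-encoder.mlmodelc
# open the Xcode project and build in Release
open examples/whisper.objc/whisper.objc.xcodeproj/
```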

View File

@ -14,13 +14,9 @@
18627C8629052BE000BD2A04 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 18627C8529052BE000BD2A04 /* Assets.xcassets */; };
18627C8929052BE000BD2A04 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 18627C8729052BE000BD2A04 /* LaunchScreen.storyboard */; };
18627C8C29052BE000BD2A04 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C8B29052BE000BD2A04 /* main.m */; };
18627C9429052C4900BD2A04 /* whisper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9329052C4900BD2A04 /* whisper.cpp */; settings = {COMPILER_FLAGS = "-DWHISPER_USE_COREML -DWHISPER_COREML_ALLOW_FALLBACK"; }; };
18627C9429052C4900BD2A04 /* whisper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9329052C4900BD2A04 /* whisper.cpp */; };
18627C9629052C5800BD2A04 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9529052C5800BD2A04 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE"; }; };
18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; };
7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; };
7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */; };
7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE3424A2A0C3FA20015A058 /* whisper-decoder-impl.m */; };
7FE3424F2A0C418A0015A058 /* ggml-base.en-encoder.mlmodelc in Resources */ = {isa = PBXBuildFile; fileRef = 7FE3424E2A0C418A0015A058 /* ggml-base.en-encoder.mlmodelc */; };
/* End PBXBuildFile section */
/* Begin PBXFileReference section */
@ -41,13 +37,6 @@
18627C9529052C5800BD2A04 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../../ggml.c; sourceTree = "<group>"; };
18627C9729052C6600BD2A04 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../../ggml.h; sourceTree = "<group>"; };
18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */ = {isa = PBXFileReference; lastKnownFileType = archive.macbinary; name = "ggml-base.en.bin"; path = "../../../models/ggml-base.en.bin"; sourceTree = "<group>"; };
7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-encoder-impl.m"; sourceTree = "<group>"; };
7FE342462A0C3FA20015A058 /* whisper-encoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "whisper-encoder.h"; sourceTree = "<group>"; };
7FE342472A0C3FA20015A058 /* whisper-encoder.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = "whisper-encoder.mm"; sourceTree = "<group>"; };
7FE342482A0C3FA20015A058 /* whisper-decoder-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "whisper-decoder-impl.h"; sourceTree = "<group>"; };
7FE342492A0C3FA20015A058 /* whisper-encoder-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "whisper-encoder-impl.h"; sourceTree = "<group>"; };
7FE3424A2A0C3FA20015A058 /* whisper-decoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-decoder-impl.m"; sourceTree = "<group>"; };
7FE3424E2A0C418A0015A058 /* ggml-base.en-encoder.mlmodelc */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = "ggml-base.en-encoder.mlmodelc"; path = "../../../models/ggml-base.en-encoder.mlmodelc"; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
@ -80,8 +69,6 @@
18627C7829052BDF00BD2A04 /* whisper.objc */ = {
isa = PBXGroup;
children = (
7FE3424E2A0C418A0015A058 /* ggml-base.en-encoder.mlmodelc */,
7FE342442A0C3FA20015A058 /* coreml */,
18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */,
18627C9729052C6600BD2A04 /* ggml.h */,
18627C9529052C5800BD2A04 /* ggml.c */,
@ -102,20 +89,6 @@
path = whisper.objc;
sourceTree = "<group>";
};
7FE342442A0C3FA20015A058 /* coreml */ = {
isa = PBXGroup;
children = (
7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */,
7FE342462A0C3FA20015A058 /* whisper-encoder.h */,
7FE342472A0C3FA20015A058 /* whisper-encoder.mm */,
7FE342482A0C3FA20015A058 /* whisper-decoder-impl.h */,
7FE342492A0C3FA20015A058 /* whisper-encoder-impl.h */,
7FE3424A2A0C3FA20015A058 /* whisper-decoder-impl.m */,
);
name = coreml;
path = ../../../coreml;
sourceTree = "<group>";
};
/* End PBXGroup section */
/* Begin PBXNativeTarget section */
@ -174,7 +147,6 @@
buildActionMask = 2147483647;
files = (
18627C8929052BE000BD2A04 /* LaunchScreen.storyboard in Resources */,
7FE3424F2A0C418A0015A058 /* ggml-base.en-encoder.mlmodelc in Resources */,
18627C8629052BE000BD2A04 /* Assets.xcassets in Resources */,
18627C8429052BDF00BD2A04 /* Main.storyboard in Resources */,
18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */,
@ -189,14 +161,11 @@
buildActionMask = 2147483647;
files = (
18627C8129052BDF00BD2A04 /* ViewController.m in Sources */,
7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */,
18627C9429052C4900BD2A04 /* whisper.cpp in Sources */,
18627C9629052C5800BD2A04 /* ggml.c in Sources */,
18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */,
7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */,
18627C8C29052BE000BD2A04 /* main.m in Sources */,
18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */,
7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};

View File

@ -10,9 +10,3 @@ cp -rpv ../ggml/examples/common.h ./examples/common.h
cp -rpv ../ggml/examples/common.cpp ./examples/common.cpp
cp -rpv ../ggml/examples/common-ggml.h ./examples/common-ggml.h
cp -rpv ../ggml/examples/common-ggml.cpp ./examples/common-ggml.cpp
cp -rpv ../ggml/examples/whisper/whisper.h ./whisper.h
cp -rpv ../ggml/examples/whisper/whisper.cpp ./whisper.cpp
cp -rpv ../ggml/examples/whisper/main.cpp ./examples/main/main.cpp
cp -rpv ../ggml/examples/whisper/quantize.cpp ./examples/quantize/quantize.cpp

View File

@ -32,15 +32,9 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
} \
} while (0)
typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
typedef void (*dequantize_mul_mat_vec_cuda_t)(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream);
// QK = number of values after dequantization
// QR = QK / number of values before dequantization
#define QK4_0 32
#define QR4_0 2
typedef struct {
float d; // delta
uint8_t qs[QK4_0 / 2]; // nibbles / quants
@ -48,7 +42,6 @@ typedef struct {
static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
#define QK4_1 32
#define QR4_1 2
typedef struct {
float d; // delta
float m; // min
@ -56,8 +49,14 @@ typedef struct {
} block_q4_1;
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
#define QK4_2 16
typedef struct {
half d; // delta
uint8_t qs[QK4_2 / 2]; // nibbles / quants
} block_q4_2;
static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
#define QK5_0 32
#define QR5_0 2
typedef struct {
half d; // delta
uint8_t qh[4]; // 5-th bit of quants
@ -66,7 +65,6 @@ typedef struct {
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
#define QK5_1 32
#define QR5_1 2
typedef struct {
half d; // delta
half m; // min
@ -76,121 +74,36 @@ typedef struct {
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
#define QK8_0 32
#define QR8_0 1
typedef struct {
float d; // delta
int8_t qs[QK8_0]; // quants
} block_q8_0;
static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
#define CUDA_DMMV_BLOCK_SIZE 32
static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
const block_q4_0 * x = (const block_q4_0 *) vx;
const float d = x[ib].d;
const uint8_t vui = x[ib].qs[iqs];
const int8_t vi0 = vui & 0xF;
const int8_t vi1 = vui >> 4;
v0 = (vi0 - 8)*d;
v1 = (vi1 - 8)*d;
}
static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
const block_q4_1 * x = (const block_q4_1 *) vx;
const float d = x[ib].d;
const float m = x[ib].m;
const uint8_t vui = x[ib].qs[iqs];
const int8_t vi0 = vui & 0xF;
const int8_t vi1 = vui >> 4;
v0 = vi0*d + m;
v1 = vi1*d + m;
}
static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
const block_q5_0 * x = (const block_q5_0 *) vx;
const float d = x[ib].d;
uint32_t qh;
memcpy(&qh, x[ib].qh, sizeof(qh));
const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0) - 16;
const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1) - 16;
v0 = x0*d;
v1 = x1*d;
}
static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
const block_q5_1 * x = (const block_q5_1 *) vx;
const float d = x[ib].d;
const float m = x[ib].m;
uint32_t qh;
memcpy(&qh, x[ib].qh, sizeof(qh));
const uint8_t xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10;
const uint8_t xh_1 = ((qh >> (iqs + 12)) ) & 0x10;
const int32_t x0 = ((x[ib].qs[iqs] & 0xf) | xh_0);
const int32_t x1 = ((x[ib].qs[iqs] >> 4) | xh_1);
v0 = x0*d + m;
v1 = x1*d + m;
}
static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
const block_q8_0 * x = (const block_q8_0 *) vx;
const float d = x[ib].d;
const int8_t vi0 = x[ib].qs[iqs + 0];
const int8_t vi1 = x[ib].qs[iqs + 1];
v0 = vi0*d;
v1 = vi1*d;
}
static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){
const half * x = (const half *) vx;
v0 = __half2float(x[ib + 0]);
v1 = __half2float(x[ib + 1]);
}
static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
static const int qk = QK4_0;
const block_q4_0 * x = (const block_q4_0 *) vx;
const int i = blockIdx.x;
const float d = x[i].d;
for (int j = 0; j < qk/2; ++j) {
const int x0 = (x[i].qs[j] & 0xf) - 8;
const int x1 = (x[i].qs[j] >> 4) - 8;
const uint8_t * pp = x[i].qs;
y[i*qk + j + 0 ] = x0*d;
y[i*qk + j + qk/2] = x1*d;
for (int l = 0; l < QK4_0; l += 2) {
const uint8_t vi = pp[l/2];
const int8_t vi0 = vi & 0xf;
const int8_t vi1 = vi >> 4;
const float v0 = (vi0 - 8)*d;
const float v1 = (vi1 - 8)*d;
y[i*QK4_0 + l + 0] = v0;
y[i*QK4_0 + l + 1] = v1;
}
}
static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
static const int qk = QK4_1;
const block_q4_1 * x = (const block_q4_1 *) vx;
const int i = blockIdx.x;
@ -198,42 +111,75 @@ static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
const float d = x[i].d;
const float m = x[i].m;
for (int j = 0; j < qk/2; ++j) {
const int x0 = (x[i].qs[j] & 0xf);
const int x1 = (x[i].qs[j] >> 4);
const uint8_t * pp = x[i].qs;
y[i*qk + j + 0 ] = x0*d + m;
y[i*qk + j + qk/2] = x1*d + m;
for (int l = 0; l < QK4_1; l += 2) {
const uint8_t vi = pp[l/2];
const int8_t vi0 = vi & 0xf;
const int8_t vi1 = vi >> 4;
const float v0 = vi0*d + m;
const float v1 = vi1*d + m;
y[i*QK4_1 + l + 0] = v0;
y[i*QK4_1 + l + 1] = v1;
}
}
static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
const block_q4_2 * x = (const block_q4_2 *) vx;
const int i = blockIdx.x;
const float d = x[i].d;
const uint8_t * pp = x[i].qs;
for (int l = 0; l < QK4_2; l += 2) {
const uint8_t vi = pp[l/2];
const int8_t vi0 = vi & 0xf;
const int8_t vi1 = vi >> 4;
const float v0 = (vi0 - 8)*d;
const float v1 = (vi1 - 8)*d;
y[i*QK4_2 + l + 0] = v0;
y[i*QK4_2 + l + 1] = v1;
}
}
static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
static const int qk = QK5_0;
const block_q5_0 * x = (const block_q5_0 *) vx;
const int i = blockIdx.x;
const float d = x[i].d;
const uint8_t * pp = x[i].qs;
uint32_t qh;
memcpy(&qh, x[i].qh, sizeof(qh));
for (int j = 0; j < qk/2; ++j) {
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
for (int l = 0; l < QK5_0; l += 2) {
const uint8_t vi = pp[l/2];
const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
y[i*qk + j + 0 ] = x0*d;
y[i*qk + j + qk/2] = x1*d;
const int8_t vi0 = ((vi & 0xf) | vh0);
const int8_t vi1 = ((vi >> 4) | vh1);
const float v0 = (vi0 - 16)*d;
const float v1 = (vi1 - 16)*d;
y[i*QK5_0 + l + 0] = v0;
y[i*QK5_0 + l + 1] = v1;
}
}
static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
static const int qk = QK5_1;
const block_q5_1 * x = (const block_q5_1 *) vx;
const int i = blockIdx.x;
@ -241,70 +187,41 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
const float d = x[i].d;
const float m = x[i].m;
const uint8_t * pp = x[i].qs;
uint32_t qh;
memcpy(&qh, x[i].qh, sizeof(qh));
for (int j = 0; j < qk/2; ++j) {
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
for (int l = 0; l < QK5_1; l += 2) {
const uint8_t vi = pp[l/2];
const int x0 = (x[i].qs[j] & 0xf) | xh_0;
const int x1 = (x[i].qs[j] >> 4) | xh_1;
const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
y[i*qk + j + 0 ] = x0*d + m;
y[i*qk + j + qk/2] = x1*d + m;
const int8_t vi0 = (vi & 0xf) | vh0;
const int8_t vi1 = (vi >> 4) | vh1;
const float v0 = vi0*d + m;
const float v1 = vi1*d + m;
y[i*QK5_1 + l + 0] = v0;
y[i*QK5_1 + l + 1] = v1;
}
}
static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
static const int qk = QK8_0;
const block_q8_0 * x = (const block_q8_0 *) vx;
const int i = blockIdx.x;
const float d = x[i].d;
for (int j = 0; j < qk; ++j) {
y[i*qk + j] = x[i].qs[j]*d;
}
}
const int8_t * pp = x[i].qs;
template <int block_size, int qk, int qr, dequantize_kernel_t dequantize_kernel>
static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols) {
const int row = blockIdx.x;
const int tid = threadIdx.x;
for (int l = 0; l < QK8_0; l++) {
const int8_t vi = pp[l];
const int y_offset = qr == 1 ? 1 : qk/2;
__shared__ float tmp[block_size]; // separate sum for each thread
tmp[tid] = 0;
for (int i = 0; i < ncols/block_size; i += 2) {
const int col = i*block_size + 2*tid;
const int ib = (row*ncols + col)/qk; // block index
const int iqs = (col%qk)/qr; // quant index
const int iybs = col - col%qk; // y block start index
// dequantize
float v0, v1;
dequantize_kernel(vx, ib, iqs, v0, v1);
// matrix multiplication
tmp[tid] += v0 * y[iybs + iqs + 0];
tmp[tid] += v1 * y[iybs + iqs + y_offset];
}
// sum up partial sums and write back result
__syncthreads();
for (int s=block_size/2; s>0; s>>=1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
__syncthreads();
}
if (tid == 0) {
dst[row] = tmp[0];
y[i*QK8_0 + l] = vi*d;
}
}
@ -318,6 +235,11 @@ static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStre
dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
}
static void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK4_2;
dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
}
static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK5_0;
dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
@ -333,36 +255,6 @@ static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStre
dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
}
static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_0, QR4_0, dequantize_q4_0>
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
}
static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_1, QR4_1, dequantize_q4_1>
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
}
static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK5_0, QR5_0, dequantize_q5_0>
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
}
static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK5_1, QR5_1, dequantize_q5_1>
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
}
static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK8_0, QR8_0, dequantize_q8_0>
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
}
// TODO: optimize
static __global__ void convert_fp16_to_fp32(const void * vx, float * y) {
const half * x = (const half *) vx;
@ -376,18 +268,14 @@ static void convert_fp16_to_fp32_cuda(const void * x, float * y, int k, cudaStre
convert_fp16_to_fp32<<<k, 1, 0, stream>>>(x, y);
}
static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, 32, 1, convert_f16>
<<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
}
static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_Q4_0:
return dequantize_row_q4_0_cuda;
case GGML_TYPE_Q4_1:
return dequantize_row_q4_1_cuda;
case GGML_TYPE_Q4_2:
return dequantize_row_q4_2_cuda;
case GGML_TYPE_Q5_0:
return dequantize_row_q5_0_cuda;
case GGML_TYPE_Q5_1:
@ -401,27 +289,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
}
}
static dequantize_mul_mat_vec_cuda_t ggml_get_dequantize_mul_mat_vec_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_Q4_0:
return dequantize_mul_mat_vec_q4_0_cuda;
case GGML_TYPE_Q4_1:
return dequantize_mul_mat_vec_q4_1_cuda;
case GGML_TYPE_Q5_0:
return dequantize_mul_mat_vec_q5_0_cuda;
case GGML_TYPE_Q5_1:
return dequantize_mul_mat_vec_q5_1_cuda;
case GGML_TYPE_Q8_0:
return dequantize_mul_mat_vec_q8_0_cuda;
case GGML_TYPE_F16:
return convert_mul_mat_vec_f16_cuda;
default:
return nullptr;
}
}
// buffer pool for cuda
#define MAX_CUDA_BUFFERS 256
#define MAX_CUDA_BUFFERS 16
struct scoped_spin_lock {
std::atomic_flag& lock;
@ -479,7 +348,7 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
CUDA_CHECK(cudaFree(ptr));
}
#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
#define GGML_CUDA_MAX_STREAMS 8
#define GGML_CUDA_MAX_EVENTS 64
static cublasHandle_t g_cublasH = nullptr;
static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_STREAMS] = { nullptr };
@ -718,7 +587,6 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
const int nb2 = dst->nb[2];
const int nb3 = dst->nb[3];
const ggml_type type = src0->type;
const bool mul_mat_vec = ne11 == 1;
const float alpha = 1.0f;
const float beta = 0.0f;
@ -729,16 +597,12 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
size_t x_size, y_size, d_size, q_size;
float * d_X = nullptr;
if (!mul_mat_vec) {
d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
}
float * d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size);
float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
char * d_Q = (char *) ggml_cuda_pool_malloc(n_mm * q_sz, &q_size);
const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(type);
dequantize_mul_mat_vec_cuda_t dmmv = ggml_get_dequantize_mul_mat_vec_cuda(type);
GGML_ASSERT(to_fp32_cuda != nullptr);
for (int64_t i03 = 0; i03 < ne03; i03++) {
@ -748,54 +612,31 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
cudaStream_t cudaStream2 = g_cudaStreams2[i % GGML_CUDA_MAX_STREAMS];
cudaEvent_t cudaEvent = g_cudaEvents[i % GGML_CUDA_MAX_EVENTS];
float * c_X = d_X + i * x_ne;
float * c_Y = d_Y + i * y_ne;
float * c_D = d_D + i * d_ne;
char * c_Q = d_Q + i * q_sz;
// copy src0 to device if necessary
if (src0->backend == GGML_BACKEND_CPU) {
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Q, src0, i03, i02, cudaStream2));
} else if (src0->backend == GGML_BACKEND_CUDA) {
c_Q = ((char *) src0->data) + i * q_sz;
} else {
GGML_ASSERT(false);
}
if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
// copy src0 and convert to fp32 on device
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Q, src0, i03, i02, cudaStream2));
to_fp32_cuda(c_Q, c_X, x_ne, cudaStream2);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
// copy src1 to device
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
// copy src1 to device
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
// wait for data
CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
// wait for conversion
CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
// compute
dmmv(c_Q, c_Y, c_D, ne00, ne01, cudaStream);
CUDA_CHECK(cudaGetLastError());
} else { // general dequantization kernel + cuBLAS matrix matrix multiplication
float * c_X = d_X + i * x_ne;
// convert src0 to fp32 on device
to_fp32_cuda(c_Q, c_X, x_ne, cudaStream2);
CUDA_CHECK(cudaGetLastError());
CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
// copy src1 to device
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
// wait for conversion
CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
// compute
CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
CUBLAS_CHECK(
cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
&alpha, c_X, ne00,
c_Y, ne10,
&beta, c_D, ne01));
}
// compute
CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
CUBLAS_CHECK(
cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
ne01, ne11, ne10,
&alpha, c_X, ne00,
c_Y, ne10,
&beta, c_D, ne01));
// copy dst to host
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
@ -804,9 +645,7 @@ static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor
}
CUDA_CHECK(cudaDeviceSynchronize());
if (!mul_mat_vec) {
ggml_cuda_pool_free(d_X, x_size);
}
ggml_cuda_pool_free(d_X, x_size);
ggml_cuda_pool_free(d_Y, y_size);
ggml_cuda_pool_free(d_D, d_size);
ggml_cuda_pool_free(d_Q, q_size);
@ -822,7 +661,8 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
src1->type == GGML_TYPE_F32 &&
dst->type == GGML_TYPE_F32 &&
((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CUDA)) {
(ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
return true;
}
@ -874,25 +714,3 @@ size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct
return 0;
}
}
void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
const int64_t ne0 = tensor->ne[0];
const int64_t ne1 = tensor->ne[1];
const int64_t ne2 = tensor->ne[2];
const int64_t ne3 = tensor->ne[3];
const ggml_type type = tensor->type;
const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
size_t q_size;
char * d_Q = (char *) ggml_cuda_pool_malloc(q_sz, &q_size);
cudaStream_t cudaStream2 = g_cudaStreams2[0];
// copy tensor to device
CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, tensor, 0, 0, cudaStream2));
CUDA_CHECK(cudaDeviceSynchronize());
tensor->data = d_Q;
tensor->backend = GGML_BACKEND_CUDA;
}

View File

@ -14,8 +14,6 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
void * ggml_cuda_host_malloc(size_t size);
void ggml_cuda_host_free(void * ptr);
void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
#ifdef __cplusplus
}
#endif

View File

@ -12,129 +12,129 @@
#define MULTILINE_QUOTE(...) #__VA_ARGS__
const char * clblast_dequant = MULTILINE_QUOTE(
typedef uchar uint8_t;
typedef int int32_t;
typedef uint uint32_t;
constant uint QK4_0 = 32;
struct block_q4_0
{
float d;
uint8_t qs[QK4_0 / 2];
uchar qs[16];
};
constant uint QK4_1 = 32;
__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);
const float d = blocks[i].d;
const uchar vi = blocks[i].qs[l];
const uint index = i*32 + l*2;
result[index + 0] = ((vi & 0xf) - 8)*d;
result[index + 1] = ((vi >> 4) - 8)*d;
}
struct block_q4_1
{
float d;
float m;
uint8_t qs[QK4_1 / 2];
uchar qs[16];
};
constant uint QK5_0 = 32;
struct __attribute__ ((packed)) block_q5_0
__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);
const float d = blocks[i].d;
const float m = blocks[i].m;
const uchar vi = blocks[i].qs[l];
const uint index = i*32 + l*2;
result[index + 0] = (vi & 0xf) * d + m;
result[index + 1] = (vi >> 4) * d + m;
}
struct block_q4_2
{
half d;
uint32_t qh;
uint8_t qs[QK5_0 / 2];
ushort d;
uchar qs[8];
};
constant uint QK5_1 = 32;
__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) {
const uint i = get_global_id(0) / 16;
const uint l = get_local_id(0);
const float d = vload_half(0, (__global half*) &blocks[i].d);
const uchar vi = blocks[i].qs[l];
const uint index = i*16 + l*2;
result[index + 0] = ((vi & 0xf) - 8)*d;
result[index + 1] = ((vi >> 4) - 8)*d;
}
struct block_q5_0
{
float d;
uint qh;
uchar qs[16];
};
__kernel void dequantize_row_q5_0(__global struct block_q5_0* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);
const float d = blocks[i].d;
const uchar vi = blocks[i].qs[l];
const uint l2 = l * 2;
const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
const uint index = i*32 + l2;
result[index + 0] = (((vi & 0xf) | vh0) - 16)*d;
result[index + 1] = (((vi >> 4) | vh1) - 16)*d;
}
struct block_q5_1
{
half d;
half m;
uint32_t qh;
uint8_t qs[QK5_1 / 2];
ushort d;
ushort m;
uint qh;
uchar qs[16];
};
constant uint QK8_0 = 32;
__kernel void dequantize_row_q5_1(__global struct block_q5_1* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);
const float d = vload_half(0, (__global half*) &blocks[i].d);
const float m = vload_half(0, (__global half*) &blocks[i].m);
const uchar vi = blocks[i].qs[l];
const uint l2 = l * 2;
const uchar vh0 = ((blocks[i].qh & (1 << (l2 + 0))) >> (l2 + 0)) << 4;
const uchar vh1 = ((blocks[i].qh & (1 << (l2 + 1))) >> (l2 + 1)) << 4;
const uint index = i*32 + l2;
result[index + 0] = ((vi & 0xf) | vh0)*d + m;
result[index + 1] = ((vi >> 4) | vh1)*d + m;
}
struct block_q8_0
{
float d;
uint8_t qs[QK8_0];
char qs[32];
};
__kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);
__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
constant uint qk = QK4_0;
const uint i = get_global_id(0) / qk;
const uint j = get_local_id(0);
const float d = x[i].d;
const int x0 = (x[i].qs[j] & 0xf) - 8;
const int x1 = (x[i].qs[j] >> 4) - 8;
y[i*qk + j + 0 ] = x0*d;
y[i*qk + j + qk/2] = x1*d;
}
__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
constant uint qk = QK4_1;
const uint i = get_global_id(0) / qk;
const uint j = get_local_id(0);
const float d = x[i].d;
const float m = x[i].m;
const int x0 = (x[i].qs[j] & 0xf);
const int x1 = (x[i].qs[j] >> 4);
y[i*qk + j + 0 ] = x0*d + m;
y[i*qk + j + qk/2] = x1*d + m;
}
__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
constant uint qk = QK5_0;
const uint i = get_global_id(0) / qk;
const uint j = get_local_id(0);
const float d = vload_half(0, (__global half*) &x[i].d);
uint32_t qh = x[i].qh;
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
const int32_t x1 = ((x[i].qs[j] >> 4) | xh_1) - 16;
y[i*qk + j + 0 ] = x0*d;
y[i*qk + j + qk/2] = x1*d;
}
__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
constant uint qk = QK5_1;
const uint i = get_global_id(0) / qk;
const uint j = get_local_id(0);
const float d = vload_half(0, (__global half*) &x[i].d);
const float m = vload_half(0, (__global half*) &x[i].m);
uint32_t qh = x[i].qh;
const uint8_t xh_0 = ((qh >> (j + 0)) << 4) & 0x10;
const uint8_t xh_1 = ((qh >> (j + 12)) ) & 0x10;
const int x0 = (x[i].qs[j] & 0xf) | xh_0;
const int x1 = (x[i].qs[j] >> 4) | xh_1;
y[i*qk + j + 0 ] = x0*d + m;
y[i*qk + j + qk/2] = x1*d + m;
}
__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
constant uint qk = QK8_0;
const uint i = get_global_id(0) / qk;
const uint j = get_local_id(0);
const float d = x[i].d;
y[i*qk + j] = x[i].qs[j]*d;
result[i*32 + l] = blocks[i].qs[l] * blocks[i].d;
}
);
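
For readers following the kernel changes above, here is a minimal host-side sketch (not part of the diff) of the same Q4_0 dequantization in plain C. It assumes the block layout used by the `x`/`y`-style kernel, where the low nibbles fill the first half of each 32-element block and the high nibbles the second half; the `blocks`/`result` variant interleaves the two nibbles instead:

```c
#include <stdint.h>
#include <stdio.h>

#define QK4_0 32

/* Same layout as the OpenCL-side struct: one float scale plus 16 packed nibbles per 32 weights. */
typedef struct {
    float   d;               /* scale */
    uint8_t qs[QK4_0 / 2];   /* two 4-bit quants per byte */
} block_q4_0_ref;

/* Reference dequantization of nb blocks into nb*QK4_0 output floats. */
static void dequantize_q4_0_ref(const block_q4_0_ref * x, float * y, int nb) {
    for (int i = 0; i < nb; ++i) {
        const float d = x[i].d;
        for (int j = 0; j < QK4_0 / 2; ++j) {
            const int x0 = (x[i].qs[j] & 0x0F) - 8;  /* low nibble  */
            const int x1 = (x[i].qs[j] >>   4) - 8;  /* high nibble */
            y[i*QK4_0 + j]           = x0*d;         /* first half of the block  */
            y[i*QK4_0 + j + QK4_0/2] = x1*d;         /* second half of the block */
        }
    }
}

int main(void) {
    block_q4_0_ref b = { 0.5f, {0} };
    for (int j = 0; j < QK4_0/2; ++j) {
        b.qs[j] = (uint8_t)(j | ((15 - j) << 4));    /* low nibble j, high nibble 15-j */
    }
    float y[QK4_0];
    dequantize_q4_0_ref(&b, y, 1);
    printf("y[0] = %.2f, y[16] = %.2f\n", y[0], y[16]);  /* (0-8)*0.5 = -4.00, (15-8)*0.5 = 3.50 */
    return 0;
}
```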
@ -148,12 +148,26 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float*
} \
} while (0)
#define QK5_0 32
typedef struct {
ggml_fp16_t d; // delta
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_0 / 2]; // nibbles / quants
} block_q5_0;
typedef struct {
float d; // delta
uint32_t qh; // 5-th bit of quants
uint8_t qs[QK5_0 / 2]; // nibbles / quants
} cl_block_q5_0;
static cl_platform_id platform;
static cl_device_id device;
static cl_context context;
static cl_command_queue queue;
static cl_program program;
static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q5_0, kernel_q5_1, kernel_q8_0;
static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
@ -224,6 +238,8 @@ void ggml_cl_init(void) {
CL_CHECK(err, "clCreateKernel");
kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
CL_CHECK(err, "clCreateKernel");
kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
CL_CHECK(err, "clCreateKernel");
kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
CL_CHECK(err, "clCreateKernel");
kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
@ -258,6 +274,7 @@ void ggml_cl_sgemm_wrapper(
cl_kernel kernel;
size_t global = n * k, local, size_qb;
bool dequant;
cl_block_q5_0* cl_host_b;
switch (btype) {
case GGML_TYPE_F32:
@ -275,11 +292,28 @@ void ggml_cl_sgemm_wrapper(
local = 16;
size_qb = global * (sizeof(float) * 2 + local) / 32;
break;
case GGML_TYPE_Q4_2:
dequant = true;
kernel = kernel_q4_2;
local = 8;
size_qb = global * (sizeof(ggml_fp16_t) + local) / 16;
break;
case GGML_TYPE_Q5_0:
dequant = true;
kernel = kernel_q5_0;
local = 16;
size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
// For some reason OpenCL seems to be incapable of working with structs of size 22.
// 20 and 24 bytes are fine. Workaround to do the fp16 to fp32 step on CPU...
// TODO Find the reason, fix and remove workaround.
const block_q5_0* b = (const block_q5_0*) host_b;
cl_host_b = (cl_block_q5_0*) malloc(sizeof(cl_block_q5_0) * global / 32);
for (size_t i = 0; i < global / 32; i++) {
cl_host_b[i].d = ggml_fp16_to_fp32(b[i].d);
memcpy(&cl_host_b[i].qh, b[i].qh, sizeof(uint32_t));
memcpy(&cl_host_b[i].qs, b[i].qs, QK5_0 / 2);
}
host_b = (const float*) cl_host_b;
size_qb = global * (sizeof(float) + sizeof(uint32_t) + local) / 32;
break;
case GGML_TYPE_Q5_1:
dequant = true;
@ -358,4 +392,7 @@ void ggml_cl_sgemm_wrapper(
clWaitForEvents(1, &ev_c);
clReleaseEvent(ev_sgemm);
clReleaseEvent(ev_c);
if (btype == GGML_TYPE_Q5_0) {
free((void*) cl_host_b);
}
}
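
As a rough cross-check of the `size_qb` arithmetic in the switch above, here is a standalone sketch (not from the diff; the element count is hypothetical, and the block sizes are the ones implied by the structs in this file): each type stores one block per 32 weights, and `size_qb` is simply the number of blocks times the per-block byte size.

```c
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const size_t global = 4096;   /* n*k elements of the quantized matrix, hypothetical */

    /* Q4_1: one block per 32 weights, 2 floats (d, m) + 16 nibble bytes = 24 bytes per block */
    const size_t q4_1_bytes = global * (sizeof(float)*2 + 16) / 32;

    /* Q5_0 (workaround path): repacked on the CPU as float d + uint32_t qh + 16 nibble bytes
       = 24 bytes per block; the packed fp16 struct would be 22 bytes, which the OpenCL path
       could not handle (see the comment in the switch above) */
    const size_t q5_0_bytes = global * (sizeof(float) + sizeof(uint32_t) + 16) / 32;

    printf("Q4_1 buffer: %zu bytes, Q5_0 buffer: %zu bytes\n", q4_1_bytes, q5_0_bytes);
    return 0;
}
```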

5817 ggml.c
File diff suppressed because it is too large (Load Diff)

217 ggml.h
View File

@ -190,12 +190,9 @@
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
#define GGML_FILE_VERSION 1
#define GGML_QNT_VERSION 1 // bump this on quantization format changes
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
#define GGML_MAX_DIMS 4
#define GGML_MAX_NODES 4096
#define GGML_MAX_PARAMS 256
#define GGML_MAX_PARAMS 16
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_OPT 4
#define GGML_DEFAULT_N_THREADS 4
@ -234,7 +231,7 @@ extern "C" {
GGML_TYPE_F16 = 1,
GGML_TYPE_Q4_0 = 2,
GGML_TYPE_Q4_1 = 3,
// GGML_TYPE_Q4_2 = 4, support has been removed
GGML_TYPE_Q4_2 = 4,
// GGML_TYPE_Q4_3 (5) support has been removed
GGML_TYPE_Q5_0 = 6,
GGML_TYPE_Q5_1 = 7,
@ -246,11 +243,6 @@ extern "C" {
GGML_TYPE_COUNT,
};
enum ggml_backend {
GGML_BACKEND_CPU = 0,
GGML_BACKEND_CUDA = 1,
};
// model file types
enum ggml_ftype {
GGML_FTYPE_UNKNOWN = -1,
@ -259,6 +251,7 @@ extern "C" {
GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
@ -270,16 +263,12 @@ extern "C" {
GGML_OP_DUP,
GGML_OP_ADD,
GGML_OP_ADD1,
GGML_OP_ACC,
GGML_OP_SUB,
GGML_OP_MUL,
GGML_OP_DIV,
GGML_OP_SQR,
GGML_OP_SQRT,
GGML_OP_LOG,
GGML_OP_SUM,
GGML_OP_SUM_ROWS,
GGML_OP_MEAN,
GGML_OP_REPEAT,
GGML_OP_ABS,
@ -289,15 +278,12 @@ extern "C" {
GGML_OP_RELU,
GGML_OP_GELU,
GGML_OP_SILU,
GGML_OP_SILU_BACK,
GGML_OP_NORM, // normalize
GGML_OP_RMS_NORM,
GGML_OP_RMS_NORM_BACK,
GGML_OP_MUL_MAT,
GGML_OP_SCALE,
GGML_OP_SET,
GGML_OP_CPY,
GGML_OP_CONT,
GGML_OP_RESHAPE,
@ -305,13 +291,9 @@ extern "C" {
GGML_OP_PERMUTE,
GGML_OP_TRANSPOSE,
GGML_OP_GET_ROWS,
GGML_OP_GET_ROWS_BACK,
GGML_OP_DIAG,
GGML_OP_DIAG_MASK_INF,
GGML_OP_DIAG_MASK_ZERO,
GGML_OP_SOFT_MAX,
GGML_OP_ROPE,
GGML_OP_ROPE_BACK,
GGML_OP_ALIBI,
GGML_OP_CONV_1D_1S,
GGML_OP_CONV_1D_2S,
@ -340,8 +322,7 @@ extern "C" {
// n-dimensional tensor
struct ggml_tensor {
enum ggml_type type;
enum ggml_backend backend;
enum ggml_type type;
int n_dims;
int64_t ne[GGML_MAX_DIMS]; // number of elements
@ -372,7 +353,7 @@ extern "C" {
char name[32];
char padding[16];
char padding[8]; // TODO: remove and add padding to name?
};
// computation graph
@ -516,29 +497,6 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_add1(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_acc(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t nb2,
size_t nb3,
size_t offset);
GGML_API struct ggml_tensor * ggml_acc_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t nb2,
size_t nb3,
size_t offset);
GGML_API struct ggml_tensor * ggml_sub(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -562,24 +520,12 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_log(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_log_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
// return scalar
// TODO: compute sum along rows
GGML_API struct ggml_tensor * ggml_sum(
struct ggml_context * ctx,
struct ggml_tensor * a);
// sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
GGML_API struct ggml_tensor * ggml_sum_rows(
struct ggml_context * ctx,
struct ggml_tensor * a);
// mean along rows
GGML_API struct ggml_tensor * ggml_mean(
struct ggml_context * ctx,
@ -621,13 +567,6 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
// a - x
// b - dy
GGML_API struct ggml_tensor * ggml_silu_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
// normalize along rows
// TODO: eps is hardcoded to 1e-5 for now
GGML_API struct ggml_tensor * ggml_norm(
@ -638,13 +577,6 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
// a - x
// b - dy
GGML_API struct ggml_tensor * ggml_rms_norm_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
// A: m rows, n columns
// B: p rows, n columns (i.e. we transpose it internally)
// result is m columns, p rows
@ -657,66 +589,12 @@ extern "C" {
// operations on tensors without backpropagation
//
// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_scale(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_scale_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
// b -> view(a,offset,nb1,nb2,3), return modified a
GGML_API struct ggml_tensor * ggml_set(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t nb2,
size_t nb3,
size_t offset);
// b -> view(a,offset,nb1,nb2,3), return view(a)
GGML_API struct ggml_tensor * ggml_set_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t nb2,
size_t nb3,
size_t offset);
GGML_API struct ggml_tensor * ggml_set_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t offset);
GGML_API struct ggml_tensor * ggml_set_1d_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t offset);
// b -> view(a,offset,nb1,nb2,3), return modified a
GGML_API struct ggml_tensor * ggml_set_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t offset);
// b -> view(a,offset,nb1,nb2,3), return view(a)
GGML_API struct ggml_tensor * ggml_set_2d_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t offset);
// a -> b, return view(b)
GGML_API struct ggml_tensor * ggml_cpy(
struct ggml_context * ctx,
@ -737,11 +615,6 @@ extern "C" {
// return view(a)
// TODO: when we start computing gradient, make a copy instead of view
GGML_API struct ggml_tensor * ggml_reshape_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0);
GGML_API struct ggml_tensor * ggml_reshape_2d(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -757,14 +630,6 @@ extern "C" {
int64_t ne1,
int64_t ne2);
GGML_API struct ggml_tensor * ggml_reshape_4d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1,
int64_t ne2,
int64_t ne3);
// offset in bytes
GGML_API struct ggml_tensor * ggml_view_1d(
struct ggml_context * ctx,
@ -790,18 +655,6 @@ extern "C" {
size_t nb2, // slice stride in bytes
size_t offset);
GGML_API struct ggml_tensor * ggml_view_4d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int64_t ne0,
int64_t ne1,
int64_t ne2,
int64_t ne3,
size_t nb1, // row stride in bytes
size_t nb2, // slice stride in bytes
size_t nb3,
size_t offset);
GGML_API struct ggml_tensor * ggml_permute(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -820,50 +673,20 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_get_rows_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c);
GGML_API struct ggml_tensor * ggml_diag(
struct ggml_context * ctx,
struct ggml_tensor * a);
// set elements above the diagonal to -INF
// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_diag_mask_inf(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past);
// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past);
// set elements above the diagonal to 0
GGML_API struct ggml_tensor * ggml_diag_mask_zero(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past);
// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past);
GGML_API struct ggml_tensor * ggml_soft_max(
struct ggml_context * ctx,
struct ggml_tensor * a);
// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_soft_max_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
// rotary position embedding
// in-place, returns view(a)
// if mode & 1 == 1, skip n_past elements
// if mode & 2 == 1, GPT-NeoX style
// TODO: avoid creating a new tensor every time
@ -874,23 +697,6 @@ extern "C" {
int n_dims,
int mode);
// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_rope_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode);
// rotary position embedding backward, i.e. compute dx from dy
// a - dy
GGML_API struct ggml_tensor * ggml_rope_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_dims,
int mode);
// alibi position embedding
// in-place, returns view(a)
struct ggml_tensor * ggml_alibi(
@ -935,13 +741,13 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_map_unary_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
ggml_unary_op_f32_t fun);
const ggml_unary_op_f32_t fun);
GGML_API struct ggml_tensor * ggml_map_binary_f32(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
ggml_binary_op_f32_t fun);
const ggml_binary_op_f32_t fun);
//
// automatic differentiation
@ -1070,6 +876,7 @@ extern "C" {
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
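
A hedged usage sketch for the quantization helpers declared above (not from this diff). The destination buffer is sized like the source, which is safely larger than any of the quantized formats need; the exact per-block sizes vary between ggml quantization versions:

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include "ggml.h"

int main(void) {
    const int n = 32 * 64;   /* total number of values (multiple of the 32-wide block size) */
    const int k = 32 * 8;    /* row length */

    float * src = malloc(n * sizeof(float));
    for (int i = 0; i < n; ++i) {
        src[i] = (float)(i % 7) - 3.0f;   /* some made-up data */
    }

    /* the quantized data needs roughly n/2 bytes plus per-block scales, so sizing
       the destination like the source is safely generous */
    void * dst = malloc(n * sizeof(float));
    int64_t hist[16] = {0};   /* histogram of the produced 4-bit quant values */

    const size_t bytes = ggml_quantize_q4_0(src, dst, n, k, hist);
    printf("quantized %d floats into %zu bytes\n", n, bytes);

    free(dst);
    free(src);
    return 0;
}
```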

View File

@ -1,17 +1,15 @@
## Whisper model files in custom ggml format
The [original Whisper PyTorch models provided by OpenAI](https://github.com/openai/whisper/blob/main/whisper/__init__.py#L17-L27)
are converted to custom `ggml` format in order to be able to load them in C/C++.
Conversion is performed using the [convert-pt-to-ggml.py](convert-pt-to-ggml.py) script.
You can either obtain the original models and generate the `ggml` files yourself using the conversion script,
or you can use the [download-ggml-model.sh](download-ggml-model.sh) script to download the already converted models.
Currently, they are hosted on the following locations:
have been converted to custom `ggml` format in order to be able to load them in C/C++. The conversion has been performed
using the [convert-pt-to-ggml.py](convert-pt-to-ggml.py) script. You can either obtain the original models and generate
the `ggml` files yourself using the conversion script, or you can use the [download-ggml-model.sh](download-ggml-model.sh)
script to download the already converted models. Currently, they are hosted on the following locations:
- https://huggingface.co/ggerganov/whisper.cpp
- https://ggml.ggerganov.com
Sample download:
Sample usage:
```java
$ ./download-ggml-model.sh base.en
@ -23,16 +21,6 @@ You can now use it like this:
$ ./main -m models/ggml-base.en.bin -f samples/jfk.wav
```
To convert the files yourself, use the convert-pt-to-ggml.py script. Here is an example usage.
The original PyTorch files are assumed to have been downloaded into ~/.cache/whisper.
Change `~/path/to/repo/whisper/` to the location for your copy of the Whisper source:
```
mkdir models/whisper-medium
python models/convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
mv ./models/whisper-medium/ggml-model.bin models/ggml-medium.bin
rmdir models/whisper-medium
```
A third option to obtain the model files is to download them from Hugging Face:
https://huggingface.co/ggerganov/whisper.cpp/tree/main

View File

@ -62,7 +62,7 @@ if [ -f "ggml-$model.bin" ]; then
fi
if [ -x "$(command -v wget)" ]; then
wget --no-config --quiet --show-progress -O ggml-$model.bin $src/$pfx-$model.bin
wget --quiet --show-progress -O ggml-$model.bin $src/$pfx-$model.bin
elif [ -x "$(command -v curl)" ]; then
curl -L --output ggml-$model.bin $src/$pfx-$model.bin
else

View File

@ -291,6 +291,15 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
{ MODEL_LARGE, 1124ull*MB },
},
},
{ GGML_TYPE_Q4_2,
{
{ MODEL_TINY, 26ull*MB },
{ MODEL_BASE, 50ull*MB },
{ MODEL_SMALL, 154ull*MB },
{ MODEL_MEDIUM, 470ull*MB },
{ MODEL_LARGE, 940ull*MB },
},
},
{ GGML_TYPE_Q5_0,
{
{ MODEL_TINY, 30ull*MB },
@ -852,10 +861,6 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
model.type = e_model::MODEL_LARGE;
}
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
hparams.ftype %= GGML_QNT_VERSION_FACTOR;
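
For context on the `qntvr` computation above, a small illustrative snippet (the stored value 2002 is hypothetical): newer model files pack the quantization version into `ftype` as `qntvr * GGML_QNT_VERSION_FACTOR + ftype`, so dividing and taking the remainder recovers both fields:

```c
#include <stdint.h>
#include <stdio.h>

#define GGML_QNT_VERSION_FACTOR 1000   /* as defined in ggml.h */

int main(void) {
    const int32_t ftype_full = 2002;                              /* hypothetical value read from a model file */
    const int32_t qntvr = ftype_full / GGML_QNT_VERSION_FACTOR;   /* -> 2, quantization format version */
    const int32_t ftype = ftype_full % GGML_QNT_VERSION_FACTOR;   /* -> 2, i.e. GGML_FTYPE_MOSTLY_Q4_0 */
    printf("qntvr = %d, ftype = %d\n", (int) qntvr, (int) ftype);
    return 0;
}
```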
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
// in order to save memory and also to speed up the computation
wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
@ -877,7 +882,6 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
fprintf(stderr, "%s: ftype = %d\n", __func__, model.hparams.ftype);
fprintf(stderr, "%s: qntvr = %d\n", __func__, qntvr);
fprintf(stderr, "%s: type = %d\n", __func__, model.type);
// print memory requirements
@ -1102,7 +1106,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
}
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*512; // object overhead
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
fprintf(stderr, "%s: model ctx = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
@ -1550,14 +1554,14 @@ static bool whisper_encode_internal(
Qcur),
Qcur);
//Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
//Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
// note: no bias for Key
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
layer.attn_k_w,
cur);
//Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
//Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
layer.attn_v_w,
@ -1617,12 +1621,12 @@ static bool whisper_encode_internal(
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
struct ggml_tensor * KQ_scaled =
ggml_scale_inplace(ctx0,
ggml_scale(ctx0,
KQ,
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
);
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled);
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
struct ggml_tensor * V =
ggml_cpy(ctx0,
@ -1805,7 +1809,7 @@ static bool whisper_encode_internal(
layer.cross_attn_k_w,
cur);
Kcross = ggml_scale_inplace(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
Kcross = ggml_scale(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
wstate.use_buf(ctx0, 1);
@ -1952,14 +1956,14 @@ static bool whisper_decode_internal(
Qcur),
Qcur);
Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
// note: no bias for Key
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
layer.attn_k_w,
cur);
Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
// store key and value to memory
{
@ -2008,14 +2012,14 @@ static bool whisper_decode_internal(
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
//struct ggml_tensor * KQ_scaled =
// ggml_scale_inplace(ctx0,
// ggml_scale(ctx0,
// KQ,
// ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
// );
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past);
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
struct ggml_tensor * V =
ggml_view_3d(ctx0, kv_self.v,
@ -2079,7 +2083,7 @@ static bool whisper_decode_internal(
Qcur),
Qcur);
Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
// Kcross is already scaled
struct ggml_tensor * Kcross =
@ -2119,15 +2123,15 @@ static bool whisper_decode_internal(
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
//struct ggml_tensor * KQ_scaled =
// ggml_scale_inplace(ctx0,
// ggml_scale(ctx0,
// KQ,
// ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
// );
// no masking for cross-attention
//struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
//struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ);
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
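Informally, the `ggml_scale`/`ggml_soft_max`/`ggml_mul_mat` sequence in these hunks is standard scaled dot-product attention, KQV = softmax(Kᵀ·Q / sqrt(n_state/n_head)) · V, where n_state/n_head is the per-head dimension; the substantive difference between the two sides of this diff is only whether the scale, mask, and softmax run in place or allocate a new tensor.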
@ -2598,15 +2602,6 @@ static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
path_bin = path_bin.substr(0, pos);
}
// match "-qx_x"
pos = path_bin.rfind('-');
if (pos != std::string::npos) {
auto sub = path_bin.substr(pos);
if (sub.size() == 5 && sub[1] == 'q' && sub[3] == '_') {
path_bin = path_bin.substr(0, pos);
}
}
path_bin += "-encoder.mlmodelc";
return path_bin;
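The suffix-matching block above (present on only one side of this diff) strips a quantization tag of the form `-qX_Y` before appending `-encoder.mlmodelc`, so that, for example, `ggml-base.en-q5_0.bin` resolves to `ggml-base.en-encoder.mlmodelc` rather than `ggml-base.en-q5_0-encoder.mlmodelc`.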
@ -4908,7 +4903,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
// b: N*N*sizeof(float)
// c: N*N*sizeof(float)
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*512);
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
// put a bunch of random data in the buffer
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
@ -4916,6 +4911,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
for (int j = 0; j < (int) sizes.size(); j++) {
int n_q4_0 = 0;
int n_q4_1 = 0;
int n_q4_2 = 0;
int n_q5_0 = 0;
int n_q5_1 = 0;
int n_q8_0 = 0;
@ -4925,6 +4921,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
// GFLOPS/s
double s_q4_0 = 0.0;
double s_q4_1 = 0.0;
double s_q4_2 = 0.0;
double s_q5_0 = 0.0;
double s_q5_1 = 0.0;
double s_q8_0 = 0.0;
@ -4933,17 +4930,18 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
const size_t N = sizes[j];
for (int k = 0; k < 7; ++k) {
for (int k = 0; k < 8; ++k) {
const ggml_type wtype =
k == 0 ? GGML_TYPE_Q4_0 :
k == 1 ? GGML_TYPE_Q4_1 :
k == 2 ? GGML_TYPE_Q5_0 :
k == 3 ? GGML_TYPE_Q5_1 :
k == 4 ? GGML_TYPE_Q8_0 :
k == 5 ? GGML_TYPE_F16 : GGML_TYPE_F32;
k == 2 ? GGML_TYPE_Q4_2 :
k == 3 ? GGML_TYPE_Q5_0 :
k == 4 ? GGML_TYPE_Q5_1 :
k == 5 ? GGML_TYPE_Q8_0 :
k == 6 ? GGML_TYPE_F16 : GGML_TYPE_F32;
double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q5_0 : k == 3 ? s_q5_1 : k == 4 ? s_q8_0 : k == 5 ? s_fp16 : /*k == 6*/ s_fp32;
int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q5_0 : k == 3 ? n_q5_1 : k == 4 ? n_q8_0 : k == 5 ? n_fp16 : /*k == 6*/ n_fp32;
double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q4_2 : k == 3 ? s_q5_0 : k == 4 ? s_q5_1 : k == 5 ? s_q8_0 : k == 6 ? s_fp16 : /*k == 7*/ s_fp32;
int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q4_2 : k == 3 ? n_q5_0 : k == 4 ? n_q5_1 : k == 5 ? n_q8_0 : k == 6 ? n_fp16 : /*k == 7*/ n_fp32;
struct ggml_init_params gparams = {
/*.mem_size =*/ buf.size(),
@ -4987,9 +4985,9 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
s = ((2.0*N*N*N*n)/tsum)*1e-9;
}
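The GFLOPS figure above follows from the usual matrix-multiply cost: each of the N*N output elements needs N multiplies and N adds, i.e. about 2*N^3 floating-point operations per multiplication, so n runs completed in tsum seconds give (2*N^3*n / tsum) * 1e-9 GFLOPS.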
// Q4_0 | Q4_1
snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs)\n",
N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1);
// Q4_0 | Q4_1 | Q4_2
snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs) | Q4_2 %7.1f GFLOPS (%3d runs)\n",
N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_q4_2, n_q4_2);
s += strbuf;
// Q5_0 | Q5_1 | Q8_0