cuda : use CUBLAS_COMPTE_F32 insted of CUBLAS_COMPUTE_F16

CI : Rectify the Clang-Related workflow issues (#1551 )
* fix bugs in workflow * fix missing clang in workflow * Update build.yml
2025-07-05 00:41:06 +02:00 · 2023-11-27 11:57:07 +02:00 · 2023-11-27 11:35:37 +02:00 · 2023-11-27 11:28:34 +02:00 · 2023-11-24 13:13:21 +02:00 · 2023-11-24 12:41:55 +02:00
45 changed files with 36040 additions and 1042 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -25,6 +25,7 @@ jobs:
          docker run --platform ${{ matrix.arch }} --rm \
            -v ${{ github.workspace }}:/workspace \
            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
+            set -e
            apt update
            apt install -y build-essential libsdl2-dev
            make
@ -86,6 +87,7 @@ jobs:
          docker run --platform ${{ matrix.arch }} --rm \
            -v ${{ github.workspace }}:/workspace \
            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
+            set -e
            apt update
            apt install -y build-essential cmake libsdl2-dev
            cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
@ -113,8 +115,10 @@ jobs:
          docker run --platform ${{ matrix.arch }} --rm \
            -v ${{ github.workspace }}:/workspace \
            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
+            set -e
            apt update
-            apt install -y build-essential cmake libsdl2-dev
+            apt install -y clang
+            apt install -y clang build-essential cmake libsdl2-dev
            cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
            make
            ctest -L gh --output-on-failure'
@ -140,6 +144,7 @@ jobs:
          docker run --platform ${{ matrix.arch }} --rm \
            -v ${{ github.workspace }}:/workspace \
            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
+            set -e
            apt update
            apt install -y build-essential cmake
            cmake . -DCMAKE_BUILD_TYPE=Debug -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON
@ -217,10 +222,10 @@ jobs:
        sdl2: [ON]
        include:
          - arch: Win32
-            obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.24/OpenBLAS-0.3.24-x86.zip
+            obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x86.zip
            s2arc: x86
          - arch: x64
-            obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.24/OpenBLAS-0.3.24-x64.zip
+            obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x64.zip
            s2arc: x64
          - sdl2: ON
            s2ver: 2.26.0
@ -320,6 +325,13 @@ jobs:
          cd ./build
          msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}

+      - name: Copy CUDA DLLs
+        run: >
+          Copy-Item -PassThru
+          -Path "${{ steps.cuda-toolkit.outputs.CUDA_PATH }}/bin/*.dll"
+          -Include cudart64_*,cublas64_*,cublasLt64_*
+          -Destination build/bin/${{ matrix.build }}
+
      - name: Copy SDL2.dll
        if: matrix.sdl2 == 'ON'
        run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
--- a/.gitignore
+++ b/.gitignore
@ -31,6 +31,7 @@ build-sanitize-thread/
 /talk-llama
 /bench
 /quantize
+/server
 /lsp

 arm_neon.h
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,6 +1,6 @@
 cmake_minimum_required (VERSION 3.5)

-project(whisper.cpp VERSION 1.4.3)
+project(whisper.cpp VERSION 1.5.1)

 # Add path to modules
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
--- a/11
+++ b/11
@ -1,4 +1,4 @@
-default: main bench quantize
+default: main bench quantize server

 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
@ -338,7 +338,7 @@ libwhisper.so: $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so $(WHISPER_OBJ) $(LDFLAGS)

 clean:
-	rm -f *.o main stream command talk talk-llama bench quantize lsp libwhisper.a libwhisper.so
+	rm -f *.o main stream command talk talk-llama bench quantize server lsp libwhisper.a libwhisper.so

 #
 # Examples
@ -359,6 +359,9 @@ bench: examples/bench/bench.cpp $(WHISPER_OBJ)
 quantize: examples/quantize/quantize.cpp $(WHISPER_OBJ) $(SRC_COMMON)
 	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o quantize $(LDFLAGS)

+server: examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ)
+	$(CXX) $(CXXFLAGS) examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o server $(LDFLAGS)
+
 stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)

@ -418,9 +421,9 @@ samples:
 .PHONY: medium
 .PHONY: large-v1
 .PHONY: large-v2
-.PHONY: large
+.PHONY: large-v3

-tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large: main
+tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3: main
 	bash ./models/download-ggml-model.sh $@
 	@echo ""
 	@echo "==============================================="
--- a/README.md
+++ b/README.md
@ -6,7 +6,7 @@
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)

-Beta: [v1.4.3](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.4.3) / Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+Stable: [v1.5.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.5.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)

 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

@ -231,18 +231,18 @@ make medium.en
 make medium
 make large-v1
 make large-v2
-make large
+make large-v3
 ```

 ## Memory usage

-| Model  | Disk   | Mem     | SHA                                        |
-| ---    | ---    | ---     | ---                                        |
-| tiny   |  75 MB | ~125 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
-| base   | 142 MB | ~210 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
-| small  | 466 MB | ~600 MB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
-| medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
-| large  | 2.9 GB | ~3.3 GB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
+| Model  | Disk    | Mem      |
+| ---    | ---     | ---      |
+| tiny   |  75 MiB | ~273 MB |
+| base   | 142 MiB | ~388 MB |
+| small  | 466 MiB | ~852 MB |
+| medium | 1.5 GiB | ~2.1 GB |
+| large  | 2.9 GiB | ~3.9 GB |

 ## Quantization

@ -777,6 +777,7 @@ Some of the examples are even ported to run in the browser using WebAssembly. Ch
 | [generate-karaoke.sh](examples/generate-karaoke.sh) | | Helper script to easily [generate a karaoke video](https://youtu.be/uj7hVta4blM) of raw audio capture |
 | [livestream.sh](examples/livestream.sh) | | [Livestream audio transcription](https://github.com/ggerganov/whisper.cpp/issues/185) |
 | [yt-wsp.sh](examples/yt-wsp.sh) | | Download + transcribe and/or translate any VOD [(original)](https://gist.github.com/DaniruKun/96f763ec1a037cc92fe1a059b643b818) |
+| [server](examples/server) | | HTTP transcription server with OAI-like API |

 ## [Discussions](https://github.com/ggerganov/whisper.cpp/discussions)

--- a/bindings/go/Makefile
+++ b/bindings/go/Makefile
@ -1,9 +1,26 @@
+ifndef UNAME_S
+UNAME_S := $(shell uname -s)
+endif
+
+ifndef UNAME_P
+UNAME_P := $(shell uname -p)
+endif
+
+ifndef UNAME_M
+UNAME_M := $(shell uname -m)
+endif
+
+GGML_METAL_PATH_RESOURCES := $(abspath ../..)
 BUILD_DIR := build
 MODELS_DIR := models
 EXAMPLES_DIR := $(wildcard examples/*)
 INCLUDE_PATH := $(abspath ../..)
 LIBRARY_PATH := $(abspath ../..)

+ifeq ($(UNAME_S),Darwin)
+	EXT_LDFLAGS := -framework Foundation -framework Metal -framework MetalKit
+endif
+
 all: clean whisper examples

 whisper: mkdir
@ -11,8 +28,13 @@ whisper: mkdir
 	@${MAKE} -C ../.. libwhisper.a

 test: model-small whisper modtidy
+ifeq ($(UNAME_S),Darwin)
+	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} GGML_METAL_PATH_RESOURCES=${GGML_METAL_PATH_RESOURCES} go test -ldflags "-extldflags '$(EXT_LDFLAGS)'" -v .
+	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} GGML_METAL_PATH_RESOURCES=${GGML_METAL_PATH_RESOURCES} go test -ldflags "-extldflags '$(EXT_LDFLAGS)'" -v ./pkg/whisper/...
+else
 	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} go test -v .
 	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} go test -v ./pkg/whisper/...
+endif

 examples: $(EXAMPLES_DIR)

@ -21,7 +43,11 @@ model-small: mkdir examples/go-model-download

 $(EXAMPLES_DIR): mkdir whisper modtidy
 	@echo Build example $(notdir $@)
+ifeq ($(UNAME_S),Darwin)
+	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} GGML_METAL_PATH_RESOURCES=${GGML_METAL_PATH_RESOURCES} go build ${BUILD_FLAGS} -ldflags "-extldflags '$(EXT_LDFLAGS)'" -o ${BUILD_DIR}/$(notdir $@) ./$@
+else
 	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} go build ${BUILD_FLAGS} -o ${BUILD_DIR}/$(notdir $@) ./$@
+endif

 mkdir:
 	@echo Mkdir ${BUILD_DIR}
--- a/bindings/go/examples/go-model-download/main.go
+++ b/bindings/go/examples/go-model-download/main.go
@ -24,7 +24,7 @@ const (

 var (
 	// The models which will be downloaded, if no model is specified as an argument
-	modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large"}
+	modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large-v3"}
 )

 var (
--- a/bindings/ios
+++ b/bindings/ios
--- a/bindings/java/src/test/java/io/github/ggerganov/whispercpp/WhisperCppTest.java
+++ b/bindings/java/src/test/java/io/github/ggerganov/whispercpp/WhisperCppTest.java
@ -45,7 +45,7 @@ class WhisperCppTest {
        assertEquals(16384, params.n_max_text_ctx);
        assertFalse(params.translate);
        assertEquals(0.01f, params.thold_pt);
-        assertEquals(2, params.beam_search.beam_size);
+        assertEquals(5, params.beam_search.beam_size);
        assertEquals(-1.0f, params.beam_search.patience);
    }

@ -58,7 +58,7 @@ class WhisperCppTest {
        assertEquals(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY.ordinal(), params.strategy);
        assertNotEquals(0, params.n_threads);
        assertEquals(16384, params.n_max_text_ctx);
-        assertEquals(2, params.greedy.best_of);
+        assertEquals(5, params.greedy.best_of);
    }

    @Test
--- a/bindings/javascript/package.json
+++ b/bindings/javascript/package.json
@ -1,6 +1,6 @@
 {
  "name": "whisper.cpp",
-  "version": "1.4.3",
+  "version": "1.5.1",
  "description": "Whisper speech recognition",
  "main": "whisper.js",
  "scripts": {
--- a/bindings/javascript/whisper.js
+++ b/bindings/javascript/whisper.js
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -65,6 +65,7 @@ elseif(CMAKE_JS_VERSION)
 else()
    add_subdirectory(main)
    add_subdirectory(stream)
+    add_subdirectory(server)
    add_subdirectory(command)
    add_subdirectory(bench)
    add_subdirectory(quantize)
--- a/examples/bench/bench.cpp
+++ b/examples/bench/bench.cpp
@ -81,7 +81,7 @@ int whisper_bench_full(const whisper_params & params) {
    }
    // heat encoder
    if (int ret = whisper_encode(ctx, 0, params.n_threads) != 0) {
-        fprintf(stderr, "error: failed to encode model: %d\n", ret);
+        fprintf(stderr, "error: failed to encode: %d\n", ret);
        return 4;
    }

@ -90,13 +90,13 @@ int whisper_bench_full(const whisper_params & params) {

    // prompt heat
    if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {
-        fprintf(stderr, "error: failed to encode model: %d\n", ret);
+        fprintf(stderr, "error: failed to decode: %d\n", ret);
        return 4;
    }

    // text-generation heat
    if (int ret = whisper_decode(ctx, tokens, 1, 256, params.n_threads) != 0) {
-        fprintf(stderr, "error: failed to encode model: %d\n", ret);
+        fprintf(stderr, "error: failed to decode: %d\n", ret);
        return 4;
    }

@ -104,20 +104,30 @@ int whisper_bench_full(const whisper_params & params) {

    // actual run
    if (int ret = whisper_encode(ctx, 0, params.n_threads) != 0) {
-        fprintf(stderr, "error: failed to encode model: %d\n", ret);
+        fprintf(stderr, "error: failed to encode: %d\n", ret);
        return 4;
    }

-    for (int i = 0; i < 16; i++) {
-        if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {
-            fprintf(stderr, "error: failed to encode model: %d\n", ret);
-            return 4;
-        }
-    }
-
+    // text-generation
    for (int i = 0; i < 256; i++) {
        if (int ret = whisper_decode(ctx, tokens, 1, i, params.n_threads) != 0) {
-            fprintf(stderr, "error: failed to encode model: %d\n", ret);
+            fprintf(stderr, "error: failed to decode: %d\n", ret);
+            return 4;
+        }
+    }
+
+    // batched decoding
+    for (int i = 0; i < 64; i++) {
+        if (int ret = whisper_decode(ctx, tokens, 5, 0, params.n_threads) != 0) {
+            fprintf(stderr, "error: failed to decode: %d\n", ret);
+            return 4;
+        }
+    }
+
+    // prompt processing
+    for (int i = 0; i < 16; i++) {
+        if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {
+            fprintf(stderr, "error: failed to decode: %d\n", ret);
            return 4;
        }
    }
--- a/examples/common-ggml.cpp
+++ b/examples/common-ggml.cpp
@ -9,6 +9,11 @@ static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
    {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
    {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
    {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
+    {"q2_k", GGML_FTYPE_MOSTLY_Q2_K},
+    {"q3_k", GGML_FTYPE_MOSTLY_Q3_K},
+    {"q4_k", GGML_FTYPE_MOSTLY_Q4_K},
+    {"q5_k", GGML_FTYPE_MOSTLY_Q5_K},
+    {"q6_k", GGML_FTYPE_MOSTLY_Q6_K},
 };

 void ggml_print_ftypes(FILE * fp) {
@ -48,15 +53,15 @@ bool ggml_common_quantize_0(
        case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
        case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
        case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
+        case GGML_FTYPE_MOSTLY_Q2_K: qtype = GGML_TYPE_Q2_K; break;
+        case GGML_FTYPE_MOSTLY_Q3_K: qtype = GGML_TYPE_Q3_K; break;
+        case GGML_FTYPE_MOSTLY_Q4_K: qtype = GGML_TYPE_Q4_K; break;
+        case GGML_FTYPE_MOSTLY_Q5_K: qtype = GGML_TYPE_Q5_K; break;
+        case GGML_FTYPE_MOSTLY_Q6_K: qtype = GGML_TYPE_Q6_K; break;
        case GGML_FTYPE_UNKNOWN:
        case GGML_FTYPE_ALL_F32:
        case GGML_FTYPE_MOSTLY_F16:
        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
-        case GGML_FTYPE_MOSTLY_Q2_K:
-        case GGML_FTYPE_MOSTLY_Q3_K:
-        case GGML_FTYPE_MOSTLY_Q4_K:
-        case GGML_FTYPE_MOSTLY_Q5_K:
-        case GGML_FTYPE_MOSTLY_Q6_K:
                {
                    fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
                    return false;
@ -167,24 +172,17 @@ bool ggml_common_quantize_0(

            switch ((ggml_type) ttype) {
                case GGML_TYPE_Q4_0:
-                    {
-                        cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
-                    } break;
                case GGML_TYPE_Q4_1:
-                    {
-                        cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
-                    } break;
                case GGML_TYPE_Q5_0:
-                    {
-                        cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
-                    } break;
                case GGML_TYPE_Q5_1:
-                    {
-                        cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
-                    } break;
                case GGML_TYPE_Q8_0:
+                case GGML_TYPE_Q2_K:
+                case GGML_TYPE_Q3_K:
+                case GGML_TYPE_Q4_K:
+                case GGML_TYPE_Q5_K:
+                case GGML_TYPE_Q6_K:
                    {
-                        cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                        cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements, hist_cur.data());
                    } break;
                case GGML_TYPE_F32:
                case GGML_TYPE_F16:
@ -192,11 +190,6 @@ bool ggml_common_quantize_0(
                case GGML_TYPE_I16:
                case GGML_TYPE_I32:
                case GGML_TYPE_Q8_1:
-                case GGML_TYPE_Q2_K:
-                case GGML_TYPE_Q3_K:
-                case GGML_TYPE_Q4_K:
-                case GGML_TYPE_Q5_K:
-                case GGML_TYPE_Q6_K:
                case GGML_TYPE_Q8_K:
                case GGML_TYPE_COUNT:
                    {
--- a/examples/common-sdl.cpp
+++ b/examples/common-sdl.cpp
@ -139,10 +139,13 @@ void audio_async::callback(uint8_t * stream, int len) {
        return;
    }

-    const size_t n_samples = len / sizeof(float);
+    size_t n_samples = len / sizeof(float);

-    m_audio_new.resize(n_samples);
-    memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
+    if (n_samples > m_audio.size()) {
+        n_samples = m_audio.size();
+
+        stream += (len - (n_samples * sizeof(float)));
+    }

    //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);

@ -153,7 +156,7 @@ void audio_async::callback(uint8_t * stream, int len) {
            const size_t n0 = m_audio.size() - m_audio_pos;

            memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
-            memcpy(&m_audio[0], &stream[n0], (n_samples - n0) * sizeof(float));
+            memcpy(&m_audio[0], stream + n0 * sizeof(float), (n_samples - n0) * sizeof(float));

            m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
            m_audio_len = m_audio.size();
--- a/examples/common-sdl.h
+++ b/examples/common-sdl.h
@ -41,7 +41,6 @@ private:
    std::mutex       m_mutex;

    std::vector<float> m_audio;
-    std::vector<float> m_audio_new;
    size_t             m_audio_pos = 0;
    size_t             m_audio_len = 0;
 };
--- a/examples/livestream.sh
+++ b/examples/livestream.sh
@ -48,7 +48,7 @@ if [ -n "$3" ]; then
 fi

 # Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )

 # list available models
 function list_models {
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -62,8 +62,8 @@ struct whisper_params {
    int32_t progress_step =  5;
    int32_t max_context  = -1;
    int32_t max_len      =  0;
-    int32_t best_of      =  2;
-    int32_t beam_size    = -1;
+    int32_t best_of      = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of;
+    int32_t beam_size    = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size;

    float word_thold    =  0.01f;
    float entropy_thold =  2.40f;
@ -925,9 +925,9 @@ int main(int argc, char ** argv) {
            if (params.detect_language) {
                params.language = "auto";
            }
-            fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, %stimestamps = %d ...\n",
+            fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, %d beams + best of %d, lang = %s, task = %s, %stimestamps = %d ...\n",
                    __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
-                    params.n_threads, params.n_processors,
+                    params.n_threads, params.n_processors, params.beam_size, params.best_of,
                    params.language.c_str(),
                    params.translate ? "translate" : "transcribe",
                    params.tinydiarize ? "tdrz = 1, " : "",
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@ -0,0 +1,6 @@
+set(TARGET server)
+add_executable(${TARGET} server.cpp httplib.h json.hpp)
+
+include(DefaultTargetOptions)
+
+target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -0,0 +1,68 @@
+# whisper.cpp http server
+
+Simple http server. WAV Files are passed to the inference model via http requests.
+
+https://github.com/ggerganov/whisper.cpp/assets/1991296/e983ee53-8741-4eb5-9048-afe5e4594b8f
+
+## Usage
+
+```
+./server -h
+
+usage: ./bin/server [options]
+
+options:
+  -h,        --help              [default] show this help message and exit
+  -t N,      --threads N         [4      ] number of threads to use during computation
+  -p N,      --processors N      [1      ] number of processors to use during computation
+  -ot N,     --offset-t N        [0      ] time offset in milliseconds
+  -on N,     --offset-n N        [0      ] segment index offset
+  -d  N,     --duration N        [0      ] duration of audio to process in milliseconds
+  -mc N,     --max-context N     [-1     ] maximum number of text context tokens to store
+  -ml N,     --max-len N         [0      ] maximum segment length in characters
+  -sow,      --split-on-word     [false  ] split on word rather than on token
+  -bo N,     --best-of N         [2      ] number of best candidates to keep
+  -bs N,     --beam-size N       [-1     ] beam size for beam search
+  -wt N,     --word-thold N      [0.01   ] word timestamp probability threshold
+  -et N,     --entropy-thold N   [2.40   ] entropy threshold for decoder fail
+  -lpt N,    --logprob-thold N   [-1.00  ] log probability threshold for decoder fail
+  -debug,    --debug-mode        [false  ] enable debug mode (eg. dump log_mel)
+  -tr,       --translate         [false  ] translate from source language to english
+  -di,       --diarize           [false  ] stereo audio diarization
+  -tdrz,     --tinydiarize       [false  ] enable tinydiarize (requires a tdrz model)
+  -nf,       --no-fallback       [false  ] do not use temperature fallback while decoding
+  -ps,       --print-special     [false  ] print special tokens
+  -pc,       --print-colors      [false  ] print colors
+  -pr,       --print-realtime    [false  ] print output in realtime
+  -pp,       --print-progress    [false  ] print progress
+  -nt,       --no-timestamps     [false  ] do not print timestamps
+  -l LANG,   --language LANG     [en     ] spoken language ('auto' for auto-detect)
+  -dl,       --detect-language   [false  ] exit after automatically detecting language
+             --prompt PROMPT     [       ] initial prompt
+  -m FNAME,  --model FNAME       [models/ggml-base.en.bin] model path
+  -oved D,   --ov-e-device DNAME [CPU    ] the OpenVINO device used for encode inference
+  --host HOST,                   [127.0.0.1] Hostname/ip-adress for the server
+  --port PORT,                   [8080   ] Port number for the server
+  --convert,                     [false  ] Convert audio to WAV, requires ffmpeg on the server
+```
+
+> [!WARNING]  
+> **Do not run the server example with administrative privileges and ensure it's operated in a sandbox environment, especially since it involves risky operations like accepting user file uploads and using ffmpeg for format conversions. Always validate and sanitize inputs to guard against potential security threats.**
+
+## request examples
+
+**/inference**
+```
+curl 127.0.0.1:8080/inference \
+-H "Content-Type: multipart/form-data" \
+-F file="@<file-path>" \
+-F temperature="0.2" \
+-F response-format="json"
+```
+
+**/load**
+```
+curl 127.0.0.1:8080/load \
+-H "Content-Type: multipart/form-data" \
+-F model="<path-to-model-file>"
+```
--- a/examples/server/httplib.h
+++ b/examples/server/httplib.h
--- a/examples/server/json.hpp
+++ b/examples/server/json.hpp
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -0,0 +1,762 @@
+#include "common.h"
+
+#include "whisper.h"
+#include "httplib.h"
+#include "json.hpp"
+
+#include <cmath>
+#include <fstream>
+#include <cstdio>
+#include <string>
+#include <thread>
+#include <vector>
+#include <cstring>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+using namespace httplib;
+using json = nlohmann::json;
+
+namespace {
+
+// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
+// Lowest is red, middle is yellow, highest is green.
+const std::vector<std::string> k_colors = {
+    "\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
+    "\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
+};
+
+// output formats
+const std::string json_format   = "json";
+const std::string text_format   = "text";
+const std::string srt_format    = "srt";
+const std::string vjson_format  = "verbose_json";
+const std::string vtt_format    = "vtt";
+
+struct server_params
+{
+    std::string hostname = "127.0.0.1";
+    std::string public_path = "examples/server/public";
+
+    int32_t port          = 8080;
+    int32_t read_timeout  = 600;
+    int32_t write_timeout = 600;
+    
+    bool ffmpeg_converter = false;
+};
+
+struct whisper_params {
+    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_processors =  1;
+    int32_t offset_t_ms  =  0;
+    int32_t offset_n     =  0;
+    int32_t duration_ms  =  0;
+    int32_t progress_step =  5;
+    int32_t max_context  = -1;
+    int32_t max_len      =  0;
+    int32_t best_of      =  2;
+    int32_t beam_size    = -1;
+
+    float word_thold    =  0.01f;
+    float entropy_thold =  2.40f;
+    float logprob_thold = -1.00f;
+    float userdef_temp  =  0.20f;
+
+    bool speed_up        = false;
+    bool debug_mode      = false;
+    bool translate       = false;
+    bool detect_language = false;
+    bool diarize         = false;
+    bool tinydiarize     = false;
+    bool split_on_word   = false;
+    bool no_fallback     = false;
+    bool print_special   = false;
+    bool print_colors    = false;
+    bool print_realtime  = false;
+    bool print_progress  = false;
+    bool no_timestamps   = false;
+    bool use_gpu         = true;
+
+    std::string language        = "en";
+    std::string prompt          = "";
+    std::string font_path       = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
+    std::string model           = "models/ggml-base.en.bin";
+
+    std::string response_format     = json_format;
+
+    // [TDRZ] speaker turn string
+    std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line
+
+    std::string openvino_encode_device = "CPU";
+};
+
+//  500 -> 00:05.000
+// 6000 -> 01:00.000
+std::string to_timestamp(int64_t t, bool comma = false) {
+    int64_t msec = t * 10;
+    int64_t hr = msec / (1000 * 60 * 60);
+    msec = msec - hr * (1000 * 60 * 60);
+    int64_t min = msec / (1000 * 60);
+    msec = msec - min * (1000 * 60);
+    int64_t sec = msec / 1000;
+    msec = msec - sec * 1000;
+
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
+
+    return std::string(buf);
+}
+
+int timestamp_to_sample(int64_t t, int n_samples) {
+    return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
+}
+
+bool is_file_exist(const char *fileName)
+{
+    std::ifstream infile(fileName);
+    return infile.good();
+}
+
+void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params,
+                         const server_params& sparams) {
+    fprintf(stderr, "\n");
+    fprintf(stderr, "usage: %s [options] \n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h,        --help              [default] show this help message and exit\n");
+    fprintf(stderr, "  -t N,      --threads N         [%-7d] number of threads to use during computation\n",    params.n_threads);
+    fprintf(stderr, "  -p N,      --processors N      [%-7d] number of processors to use during computation\n", params.n_processors);
+    fprintf(stderr, "  -ot N,     --offset-t N        [%-7d] time offset in milliseconds\n",                    params.offset_t_ms);
+    fprintf(stderr, "  -on N,     --offset-n N        [%-7d] segment index offset\n",                           params.offset_n);
+    fprintf(stderr, "  -d  N,     --duration N        [%-7d] duration of audio to process in milliseconds\n",   params.duration_ms);
+    fprintf(stderr, "  -mc N,     --max-context N     [%-7d] maximum number of text context tokens to store\n", params.max_context);
+    fprintf(stderr, "  -ml N,     --max-len N         [%-7d] maximum segment length in characters\n",           params.max_len);
+    fprintf(stderr, "  -sow,      --split-on-word     [%-7s] split on word rather than on token\n",             params.split_on_word ? "true" : "false");
+    fprintf(stderr, "  -bo N,     --best-of N         [%-7d] number of best candidates to keep\n",              params.best_of);
+    fprintf(stderr, "  -bs N,     --beam-size N       [%-7d] beam size for beam search\n",                      params.beam_size);
+    fprintf(stderr, "  -wt N,     --word-thold N      [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
+    fprintf(stderr, "  -et N,     --entropy-thold N   [%-7.2f] entropy threshold for decoder fail\n",           params.entropy_thold);
+    fprintf(stderr, "  -lpt N,    --logprob-thold N   [%-7.2f] log probability threshold for decoder fail\n",   params.logprob_thold);
+    // fprintf(stderr, "  -su,       --speed-up          [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
+    fprintf(stderr, "  -debug,    --debug-mode        [%-7s] enable debug mode (eg. dump log_mel)\n",           params.debug_mode ? "true" : "false");
+    fprintf(stderr, "  -tr,       --translate         [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
+    fprintf(stderr, "  -di,       --diarize           [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
+    fprintf(stderr, "  -tdrz,     --tinydiarize       [%-7s] enable tinydiarize (requires a tdrz model)\n",     params.tinydiarize ? "true" : "false");
+    fprintf(stderr, "  -nf,       --no-fallback       [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
+    fprintf(stderr, "  -ps,       --print-special     [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
+    fprintf(stderr, "  -pc,       --print-colors      [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
+    fprintf(stderr, "  -pr,       --print-realtime    [%-7s] print output in realtime\n",                       params.print_realtime ? "true" : "false");
+    fprintf(stderr, "  -pp,       --print-progress    [%-7s] print progress\n",                                 params.print_progress ? "true" : "false");
+    fprintf(stderr, "  -nt,       --no-timestamps     [%-7s] do not print timestamps\n",                        params.no_timestamps ? "true" : "false");
+    fprintf(stderr, "  -l LANG,   --language LANG     [%-7s] spoken language ('auto' for auto-detect)\n",       params.language.c_str());
+    fprintf(stderr, "  -dl,       --detect-language   [%-7s] exit after automatically detecting language\n",    params.detect_language ? "true" : "false");
+    fprintf(stderr, "             --prompt PROMPT     [%-7s] initial prompt\n",                                 params.prompt.c_str());
+    fprintf(stderr, "  -m FNAME,  --model FNAME       [%-7s] model path\n",                                     params.model.c_str());
+    fprintf(stderr, "  -oved D,   --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n",  params.openvino_encode_device.c_str());
+    // server params
+    fprintf(stderr, "  --host HOST,                   [%-7s] Hostname/ip-adress for the server\n", sparams.hostname.c_str());
+    fprintf(stderr, "  --port PORT,                   [%-7d] Port number for the server\n", sparams.port);
+    fprintf(stderr, "  --public PATH,                 [%-7s] Path to the public folder\n", sparams.public_path.c_str());
+    fprintf(stderr, "  --convert,                     [%-7s] Convert audio to WAV, requires ffmpeg on the server", sparams.ffmpeg_converter ? "true" : "false");
+    fprintf(stderr, "\n");
+}
+
+bool whisper_params_parse(int argc, char ** argv, whisper_params & params, server_params & sparams) {
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+
+        if (arg == "-h" || arg == "--help") {
+            whisper_print_usage(argc, argv, params, sparams);
+            exit(0);
+        }
+        else if (arg == "-t"    || arg == "--threads")         { params.n_threads       = std::stoi(argv[++i]); }
+        else if (arg == "-p"    || arg == "--processors")      { params.n_processors    = std::stoi(argv[++i]); }
+        else if (arg == "-ot"   || arg == "--offset-t")        { params.offset_t_ms     = std::stoi(argv[++i]); }
+        else if (arg == "-on"   || arg == "--offset-n")        { params.offset_n        = std::stoi(argv[++i]); }
+        else if (arg == "-d"    || arg == "--duration")        { params.duration_ms     = std::stoi(argv[++i]); }
+        else if (arg == "-mc"   || arg == "--max-context")     { params.max_context     = std::stoi(argv[++i]); }
+        else if (arg == "-ml"   || arg == "--max-len")         { params.max_len         = std::stoi(argv[++i]); }
+        else if (arg == "-bo"   || arg == "--best-of")         { params.best_of         = std::stoi(argv[++i]); }
+        else if (arg == "-bs"   || arg == "--beam-size")       { params.beam_size       = std::stoi(argv[++i]); }
+        else if (arg == "-wt"   || arg == "--word-thold")      { params.word_thold      = std::stof(argv[++i]); }
+        else if (arg == "-et"   || arg == "--entropy-thold")   { params.entropy_thold   = std::stof(argv[++i]); }
+        else if (arg == "-lpt"  || arg == "--logprob-thold")   { params.logprob_thold   = std::stof(argv[++i]); }
+        // else if (arg == "-su"   || arg == "--speed-up")        { params.speed_up        = true; }
+        else if (arg == "-debug"|| arg == "--debug-mode")      { params.debug_mode      = true; }
+        else if (arg == "-tr"   || arg == "--translate")       { params.translate       = true; }
+        else if (arg == "-di"   || arg == "--diarize")         { params.diarize         = true; }
+        else if (arg == "-tdrz" || arg == "--tinydiarize")     { params.tinydiarize     = true; }
+        else if (arg == "-sow"  || arg == "--split-on-word")   { params.split_on_word   = true; }
+        else if (arg == "-nf"   || arg == "--no-fallback")     { params.no_fallback     = true; }
+        else if (arg == "-fp"   || arg == "--font-path")       { params.font_path       = argv[++i]; }
+        else if (arg == "-ps"   || arg == "--print-special")   { params.print_special   = true; }
+        else if (arg == "-pc"   || arg == "--print-colors")    { params.print_colors    = true; }
+        else if (arg == "-pr"   || arg == "--print-realtime")  { params.print_realtime  = true; }
+        else if (arg == "-pp"   || arg == "--print-progress")  { params.print_progress  = true; }
+        else if (arg == "-nt"   || arg == "--no-timestamps")   { params.no_timestamps   = true; }
+        else if (arg == "-l"    || arg == "--language")        { params.language        = argv[++i]; }
+        else if (arg == "-dl"   || arg == "--detect-language") { params.detect_language = true; }
+        else if (                  arg == "--prompt")          { params.prompt          = argv[++i]; }
+        else if (arg == "-m"    || arg == "--model")           { params.model           = argv[++i]; }
+        else if (arg == "-oved" || arg == "--ov-e-device")     { params.openvino_encode_device = argv[++i]; }
+        else if (arg == "-ng"   || arg == "--no-gpu")          { params.use_gpu         = false; }
+        // server params
+        else if (                  arg == "--port")            { sparams.port        = std::stoi(argv[++i]); }
+        else if (                  arg == "--host")            { sparams.hostname    = argv[++i]; }
+        else if (                  arg == "--public")          { sparams.public_path = argv[++i]; }
+        else if (                  arg == "--convert")         { sparams.ffmpeg_converter     = true; }
+        else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            whisper_print_usage(argc, argv, params, sparams);
+            exit(0);
+        }
+    }
+
+    return true;
+}
+
+struct whisper_print_user_data {
+    const whisper_params * params;
+
+    const std::vector<std::vector<float>> * pcmf32s;
+    int progress_prev;
+};
+
+void check_ffmpeg_availibility() {
+    int result = system("ffmpeg -version");
+
+    if (result == 0) {
+        std::cout << "ffmpeg is available." << std::endl;
+    } else {
+        // ffmpeg is not available
+        std::cout << "ffmpeg is not found. Please ensure that ffmpeg is installed ";
+        std::cout << "and that its executable is included in your system's PATH. ";
+        exit(0);
+    }
+}
+
+bool convert_to_wav(const std::string & temp_filename, std::string & error_resp) {
+    std::ostringstream cmd_stream;
+    std::string converted_filename_temp = temp_filename + "_temp.wav";
+    cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
+    std::string cmd = cmd_stream.str();
+
+    int status = std::system(cmd.c_str());
+    if (status != 0) {
+        error_resp = "{\"error\":\"FFmpeg conversion failed.\"}";
+        return false;
+    }
+
+    // Remove the original file
+    if (remove(temp_filename.c_str()) != 0) {
+        error_resp = "{\"error\":\"Failed to remove the original file.\"}";
+        return false;
+    }
+
+    // Rename the temporary file to match the original filename
+    if (rename(converted_filename_temp.c_str(), temp_filename.c_str()) != 0) {
+        error_resp = "{\"error\":\"Failed to rename the temporary file.\"}";
+        return false;
+    }
+    return true;
+}
+
+std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
+    std::string speaker = "";
+    const int64_t n_samples = pcmf32s[0].size();
+
+    const int64_t is0 = timestamp_to_sample(t0, n_samples);
+    const int64_t is1 = timestamp_to_sample(t1, n_samples);
+
+    double energy0 = 0.0f;
+    double energy1 = 0.0f;
+
+    for (int64_t j = is0; j < is1; j++) {
+        energy0 += fabs(pcmf32s[0][j]);
+        energy1 += fabs(pcmf32s[1][j]);
+    }
+
+    if (energy0 > 1.1*energy1) {
+        speaker = "0";
+    } else if (energy1 > 1.1*energy0) {
+        speaker = "1";
+    } else {
+        speaker = "?";
+    }
+
+    //printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, speaker = %s\n", is0, is1, energy0, energy1, speaker.c_str());
+
+    if (!id_only) {
+        speaker.insert(0, "(speaker ");
+        speaker.append(")");
+    }
+
+    return speaker;
+}
+
+void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
+    int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
+    int * progress_prev  = &(((whisper_print_user_data *) user_data)->progress_prev);
+    if (progress >= *progress_prev + progress_step) {
+        *progress_prev += progress_step;
+        fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress);
+    }
+}
+
+void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
+    const auto & params  = *((whisper_print_user_data *) user_data)->params;
+    const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;
+
+    const int n_segments = whisper_full_n_segments(ctx);
+
+    std::string speaker = "";
+
+    int64_t t0 = 0;
+    int64_t t1 = 0;
+
+    // print the last n_new segments
+    const int s0 = n_segments - n_new;
+
+    if (s0 == 0) {
+        printf("\n");
+    }
+
+    for (int i = s0; i < n_segments; i++) {
+        if (!params.no_timestamps || params.diarize) {
+            t0 = whisper_full_get_segment_t0(ctx, i);
+            t1 = whisper_full_get_segment_t1(ctx, i);
+        }
+
+        if (!params.no_timestamps) {
+            printf("[%s --> %s]  ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
+        }
+
+        if (params.diarize && pcmf32s.size() == 2) {
+            speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
+        }
+
+        if (params.print_colors) {
+            for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
+                if (params.print_special == false) {
+                    const whisper_token id = whisper_full_get_token_id(ctx, i, j);
+                    if (id >= whisper_token_eot(ctx)) {
+                        continue;
+                    }
+                }
+
+                const char * text = whisper_full_get_token_text(ctx, i, j);
+                const float  p    = whisper_full_get_token_p   (ctx, i, j);
+
+                const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) (std::pow(p, 3)*float(k_colors.size()))));
+
+                printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
+            }
+        } else {
+            const char * text = whisper_full_get_segment_text(ctx, i);
+
+            printf("%s%s", speaker.c_str(), text);
+        }
+
+        if (params.tinydiarize) {
+            if (whisper_full_get_segment_speaker_turn_next(ctx, i)) {
+                printf("%s", params.tdrz_speaker_turn.c_str());
+            }
+        }
+
+        // with timestamps or speakers: each segment on new line
+        if (!params.no_timestamps || params.diarize) {
+            printf("\n");
+        }
+        fflush(stdout);
+    }
+}
+
+std::string output_str(struct whisper_context * ctx, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
+    std::stringstream result;
+    const int n_segments = whisper_full_n_segments(ctx);
+    for (int i = 0; i < n_segments; ++i) {
+        const char * text = whisper_full_get_segment_text(ctx, i);
+        std::string speaker = "";
+
+        if (params.diarize && pcmf32s.size() == 2)
+        {
+            const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
+            const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+            speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
+        }
+
+        result << speaker << text << "\n";
+    }
+    return result.str();
+}
+
+void get_req_parameters(const Request & req, whisper_params & params)
+{
+    // user model configu.has_fileion
+    if (req.has_file("offset-t"))
+    {
+        params.offset_t_ms = std::stoi(req.get_file_value("offset-t").content);
+    }
+    if (req.has_file("offset-n"))
+    {
+        params.offset_n = std::stoi(req.get_file_value("offset-n").content);
+    }
+    if (req.has_file("duration"))
+    {
+        params.duration_ms = std::stoi(req.get_file_value("duration").content);
+    }
+    if (req.has_file("max-context"))
+    {
+        params.max_context = std::stoi(req.get_file_value("max-context").content);
+    }
+    if (req.has_file("prompt"))
+    {
+        params.prompt = req.get_file_value("prompt").content;
+    }
+    if (req.has_file("response-format"))
+    {
+        params.response_format = req.get_file_value("response-format").content;
+    }
+    if (req.has_file("temperature"))
+    {
+        params.userdef_temp = std::stof(req.get_file_value("temperature").content);
+    }
+}
+
+}  // namespace
+
+int main(int argc, char ** argv) {
+    whisper_params params;
+    server_params sparams;
+
+    std::mutex whisper_mutex;
+
+    if (whisper_params_parse(argc, argv, params, sparams) == false) {
+        whisper_print_usage(argc, argv, params, sparams);
+        return 1;
+    }
+
+    if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
+        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
+        whisper_print_usage(argc, argv, params, sparams);
+        exit(0);
+    }
+
+    if (params.diarize && params.tinydiarize) {
+        fprintf(stderr, "error: cannot use both --diarize and --tinydiarize\n");
+        whisper_print_usage(argc, argv, params, sparams);
+        exit(0);
+    }
+
+    if (sparams.ffmpeg_converter) {
+        check_ffmpeg_availibility();
+    }
+    // whisper init
+    struct whisper_context_params cparams;
+    cparams.use_gpu = params.use_gpu;
+
+    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
+
+    if (ctx == nullptr) {
+        fprintf(stderr, "error: failed to initialize whisper context\n");
+        return 3;
+    }
+
+    // initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
+    whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);
+
+    Server svr;
+
+    std::string const default_content = "<html>hello</html>";
+
+    // this is only called if no index.html is found in the public --path
+    svr.Get("/", [&default_content](const Request &, Response &res){
+        res.set_content(default_content, "text/html");
+        return false;
+    });
+
+    svr.Post("/inference", [&](const Request &req, Response &res){
+        // acquire whisper model mutex lock
+        whisper_mutex.lock();
+
+        // first check user requested fields of the request
+        if (!req.has_file("file"))
+        {
+            fprintf(stderr, "error: no 'file' field in the request\n");
+            const std::string error_resp = "{\"error\":\"no 'file' field in the request\"}";
+            res.set_content(error_resp, "application/json");
+            whisper_mutex.unlock();
+            return;
+        }
+        auto audio_file = req.get_file_value("file");
+
+        // check non-required fields
+        get_req_parameters(req, params);
+
+        std::string filename{audio_file.filename};
+        printf("Received request: %s\n", filename.c_str());
+
+        // audio arrays
+        std::vector<float> pcmf32;               // mono-channel F32 PCM
+        std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
+
+        // write to temporary file
+        const std::string temp_filename = "whisper_server_temp_file.wav";
+        std::ofstream temp_file{temp_filename, std::ios::binary};
+        temp_file << audio_file.content;
+        temp_file.close();
+
+        // if file is not wav, convert to wav
+        
+        if (sparams.ffmpeg_converter) {
+            std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
+            const bool is_converted = convert_to_wav(temp_filename, error_resp);
+            if (!is_converted) {
+                res.set_content(error_resp, "application/json");
+                whisper_mutex.unlock();
+                return;
+            }
+        }
+
+        // read wav content into pcmf32
+        if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize)) {
+            fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
+            const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
+            res.set_content(error_resp, "application/json");
+            std::remove(temp_filename.c_str());
+            whisper_mutex.unlock();
+            return;
+        }
+        // remove temp file
+        std::remove(temp_filename.c_str());
+
+        printf("Successfully loaded %s\n", filename.c_str());
+
+        // print system information
+        {
+            fprintf(stderr, "\n");
+            fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
+                    params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
+        }
+
+        // print some info about the processing
+        {
+            fprintf(stderr, "\n");
+            if (!whisper_is_multilingual(ctx)) {
+                if (params.language != "en" || params.translate) {
+                    params.language = "en";
+                    params.translate = false;
+                    fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
+                }
+            }
+            if (params.detect_language) {
+                params.language = "auto";
+            }
+            fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, %stimestamps = %d ...\n",
+                    __func__, filename.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
+                    params.n_threads, params.n_processors,
+                    params.language.c_str(),
+                    params.translate ? "translate" : "transcribe",
+                    params.tinydiarize ? "tdrz = 1, " : "",
+                    params.no_timestamps ? 0 : 1);
+
+            fprintf(stderr, "\n");
+        }
+
+        // run the inference
+        {
+            printf("Running whisper.cpp inference on %s\n", filename.c_str());
+            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
+
+            wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
+
+            wparams.print_realtime   = false;
+            wparams.print_progress   = params.print_progress;
+            wparams.print_timestamps = !params.no_timestamps;
+            wparams.print_special    = params.print_special;
+            wparams.translate        = params.translate;
+            wparams.language         = params.language.c_str();
+            wparams.detect_language  = params.detect_language;
+            wparams.n_threads        = params.n_threads;
+            wparams.n_max_text_ctx   = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
+            wparams.offset_ms        = params.offset_t_ms;
+            wparams.duration_ms      = params.duration_ms;
+
+            wparams.thold_pt         = params.word_thold;
+            wparams.split_on_word    = params.split_on_word;
+
+            wparams.speed_up         = params.speed_up;
+            wparams.debug_mode       = params.debug_mode;
+
+            wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]
+
+            wparams.initial_prompt   = params.prompt.c_str();
+
+            wparams.greedy.best_of        = params.best_of;
+            wparams.beam_search.beam_size = params.beam_size;
+
+            wparams.temperature_inc  = params.userdef_temp;
+            wparams.entropy_thold    = params.entropy_thold;
+            wparams.logprob_thold    = params.logprob_thold;
+
+            whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
+
+            // this callback is called on each new segment
+            if (params.print_realtime) {
+                wparams.new_segment_callback           = whisper_print_segment_callback;
+                wparams.new_segment_callback_user_data = &user_data;
+            }
+
+            if (wparams.print_progress) {
+                wparams.progress_callback           = whisper_print_progress_callback;
+                wparams.progress_callback_user_data = &user_data;
+            }
+
+            // examples for abort mechanism
+            // in examples below, we do not abort the processing, but we could if the flag is set to true
+
+            // the callback is called before every encoder run - if it returns false, the processing is aborted
+            {
+                static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
+
+                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
+                    bool is_aborted = *(bool*)user_data;
+                    return !is_aborted;
+                };
+                wparams.encoder_begin_callback_user_data = &is_aborted;
+            }
+
+            // the callback is called before every computation - if it returns true, the computation is aborted
+            {
+                static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
+
+                wparams.abort_callback = [](void * user_data) {
+                    bool is_aborted = *(bool*)user_data;
+                    return is_aborted;
+                };
+                wparams.abort_callback_user_data = &is_aborted;
+            }
+
+            if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
+                fprintf(stderr, "%s: failed to process audio\n", argv[0]);
+                const std::string error_resp = "{\"error\":\"failed to process audio\"}";
+                res.set_content(error_resp, "application/json");
+                whisper_mutex.unlock();
+                return;
+            }
+        }
+
+        // return results to user
+        if (params.response_format == text_format)
+        {
+            std::string results = output_str(ctx, params, pcmf32s);
+            res.set_content(results.c_str(), "text/html");
+        }
+        // TODO add more output formats
+        else
+        {
+            std::string results = output_str(ctx, params, pcmf32s);
+            json jres = json{
+                {"text", results}
+            };
+            res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
+                            "application/json");
+        }
+
+        // return whisper model mutex lock
+        whisper_mutex.unlock();
+    });
+    svr.Post("/load", [&](const Request &req, Response &res){
+        whisper_mutex.lock();
+        if (!req.has_file("model"))
+        {
+            fprintf(stderr, "error: no 'model' field in the request\n");
+            const std::string error_resp = "{\"error\":\"no 'model' field in the request\"}";
+            res.set_content(error_resp, "application/json");
+            whisper_mutex.unlock();
+            return;
+        }
+        std::string model = req.get_file_value("model").content;
+        if (!is_file_exist(model.c_str()))
+        {
+            fprintf(stderr, "error: 'model': %s not found!\n", model.c_str());
+            const std::string error_resp = "{\"error\":\"model not found!\"}";
+            res.set_content(error_resp, "application/json");
+            whisper_mutex.unlock();
+            return;
+        }
+
+        // clean up
+        whisper_free(ctx);
+
+        // whisper init
+        ctx = whisper_init_from_file_with_params(model.c_str(), cparams);
+
+        // TODO perhaps load prior model here instead of exit
+        if (ctx == nullptr) {
+            fprintf(stderr, "error: model init  failed, no model loaded must exit\n");
+            exit(1);
+        }
+
+        // initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
+        whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);
+
+        const std::string success = "Load was successful!";
+        res.set_content(success, "application/text");
+
+        // check if the model is in the file system
+        whisper_mutex.unlock();
+    });
+
+    svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
+        const char fmt[] = "500 Internal Server Error\n%s";
+        char buf[BUFSIZ];
+        try {
+            std::rethrow_exception(std::move(ep));
+        } catch (std::exception &e) {
+            snprintf(buf, sizeof(buf), fmt, e.what());
+        } catch (...) {
+            snprintf(buf, sizeof(buf), fmt, "Unknown Exception");
+        }
+        res.set_content(buf, "text/plain");
+        res.status = 500;
+    });
+
+    svr.set_error_handler([](const Request &, Response &res) {
+        if (res.status == 400) {
+            res.set_content("Invalid request", "text/plain");
+        } else if (res.status != 500) {
+            res.set_content("File Not Found", "text/plain");
+            res.status = 404;
+        }
+    });
+
+    // set timeouts and change hostname and port
+    svr.set_read_timeout(sparams.read_timeout);
+    svr.set_write_timeout(sparams.write_timeout);
+
+    if (!svr.bind_to_port(sparams.hostname, sparams.port))
+    {
+        fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n",
+                sparams.hostname.c_str(), sparams.port);
+        return 1;
+    }
+
+    // Set the base directory for serving static files
+    svr.set_base_dir(sparams.public_path);
+
+    // to make it ctrl+clickable:
+    printf("\nwhisper server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
+
+    if (!svr.listen_after_bind())
+    {
+        return 1;
+    }
+
+    whisper_print_timings(ctx);
+    whisper_free(ctx);
+
+    return 0;
+}
--- a/examples/talk-llama/talk-llama.cpp
+++ b/examples/talk-llama/talk-llama.cpp
@ -53,7 +53,7 @@ struct whisper_params {
    int32_t capture_id = -1;
    int32_t max_tokens = 32;
    int32_t audio_ctx  = 0;
-    int32_t n_gpu_layers = 0;
+    int32_t n_gpu_layers = 999;

    float vad_thold  = 0.6f;
    float freq_thold = 100.0f;
@ -136,7 +136,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -c ID,    --capture ID     [%-7d] capture device ID\n",                           params.capture_id);
    fprintf(stderr, "  -mt N,    --max-tokens N   [%-7d] maximum number of tokens per audio chunk\n",    params.max_tokens);
    fprintf(stderr, "  -ac N,    --audio-ctx N    [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
-    fprintf(stderr, "  -ngl N,   --n-gpu-layers N [%-7s] number of layers to store in VRAM\n",           params.n_gpu_layers);
+    fprintf(stderr, "  -ngl N,   --n-gpu-layers N [%-7d] number of layers to store in VRAM\n",           params.n_gpu_layers);
    fprintf(stderr, "  -vth N,   --vad-thold N    [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
    fprintf(stderr, "  -fth N,   --freq-thold N   [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
    fprintf(stderr, "  -su,      --speed-up       [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
@ -686,8 +686,8 @@ int main(int argc, char ** argv) {
                    }
                }

-                text_to_speak = ::replace(text_to_speak, "\"", "");
-                int ret = system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+                text_to_speak = ::replace(text_to_speak, "'", "'\"'\"'");
+                int ret = system((params.speak + " " + std::to_string(voice_id) + " '" + text_to_speak + "'").c_str());
                if (ret != 0) {
                    fprintf(stderr, "%s: failed to speak\n", __func__);
                }
--- a/examples/twitch.sh
+++ b/examples/twitch.sh
@ -21,7 +21,7 @@ help()
    echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]"
    echo "options:"
    echo "-s       Step in seconds (default is $step)."
-    echo "-m       Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large' (default is '$model')."
+    echo "-m       Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large-v3' (default is '$model')."
    echo "-t       Number of threads to use."
    echo "-h       Print this help page."
    echo
--- a/extra/bench-all.sh
+++ b/extra/bench-all.sh
@ -21,8 +21,8 @@ models=(                                               \
      "tiny"     "tiny-q4_0"     "tiny-q4_1"     "tiny-q5_0"     "tiny-q5_1"     "tiny-q8_0"                \
      "base"     "base-q4_0"     "base-q4_1"     "base-q5_0"     "base-q5_1"     "base-q8_0"                \
     "small"    "small-q4_0"    "small-q4_1"    "small-q5_0"    "small-q5_1"    "small-q8_0"                \
-    "medium" "medium-q4_0" "medium-q4_1" "medium-q5_0" "medium-q5_1" "medium-q8_0" \
-     "large"  "large-q4_0"  "large-q4_1"  "large-q5_0"  "large-q5_1"  "large-q8_0" \
+    "medium"   "medium-q4_0"   "medium-q4_1"   "medium-q5_0"   "medium-q5_1"   "medium-q8_0"   "medium-dis" \
+  "large-v2" "large-v2-q4_0" "large-v2-q4_1" "large-v2-q5_0" "large-v2-q5_1" "large-v2-q8_0" "large-v2-dis" \
 )

 if [ "$encoder_only" -eq 0 ]; then
@ -44,8 +44,8 @@ if [ "$encoder_only" -eq 0 ]; then
    printf "\n"
 fi

-printf "| %6s | %6s | %16s | %11s | %3s | %7s | %7s | %7s | %7s |\n" "CPU" "OS" "Config" "Model" "Th" "Enc." "Dec." "PP" "Commit"
-printf "| %6s | %6s | %16s | %11s | %3s | %7s | %7s | %7s | %7s |\n" "---" "---" "---" "---" "---" "---" "---" "---" "---"
+printf "| %6s | %6s | %16s | %13s | %3s | %7s | %7s | %7s | %7s | %7s |\n" "CPU" "OS" "Config" "Model" "Th" "Enc." "Dec." "Bch5" "PP" "Commit"
+printf "| %6s | %6s | %16s | %13s | %3s | %7s | %7s | %7s | %7s | %7s |\n" "---" "---" "---" "---" "---" "---" "---" "---" "---" "---"

 for model in "${models[@]}"; do
    # actual run
@ -56,6 +56,7 @@ for model in "${models[@]}"; do
    # parse the output:
    encode_time=$(echo "$output" | grep "encode time" | awk '{print $11}')
    decode_time=$(echo "$output" | grep "decode time" | awk '{print $11}')
+    batchd_time=$(echo "$output" | grep "batchd time" | awk '{print $11}')
    prompt_time=$(echo "$output" | grep "prompt time" | awk '{print $11}')
    system_info=$(echo "$output" | grep "system_info")
    n_threads=$(echo "$output" | grep "system_info" | awk '{print $4}')
@ -94,6 +95,6 @@ for model in "${models[@]}"; do
    commit=$(git rev-parse --short HEAD)

    if [ $ret -eq 0 ]; then
-        printf "| <todo> | <todo> | %16s | %11s | %3s | %7s | %7s | %7s | %7s |\n" "$config" "$model" "$n_threads" "$encode_time" "$decode_time" "$prompt_time" "$commit"
+        printf "| <todo> | <todo> | %16s | %13s | %3s | %7s | %7s | %7s | %7s | %7s |\n" "$config" "$model" "$n_threads" "$encode_time" "$decode_time" "$batchd_time" "$prompt_time" "$commit"
    fi
 done
--- a/extra/convert-all.sh
+++ b/extra/convert-all.sh
@ -1,6 +1,6 @@
 #!/bin/bash

-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )

 for model in "${models[@]}"; do
    python3 models/convert-pt-to-ggml.py ~/.cache/whisper/$model.pt ../whisper models/
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@ -446,12 +446,14 @@ static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * n
    return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)];
 }

-static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view) {
+static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
    ggml_tallocr_t alloc = node_tallocr(galloc, view);

    //printf("init_view: %s from src %s\n", view->name, view->view_src->name);
    GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
+    if (update_backend) {
        view->backend = view->view_src->backend;
+    }
    view->buffer  = view->view_src->buffer;
    view->data    = (char *)view->view_src->data + view->view_offs;

@ -469,7 +471,7 @@ static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {

    if (node->data == NULL) {
        if (ggml_is_view(node)) {
-            init_view(galloc, node);
+            init_view(galloc, node, true);
        } else {
            // see if we can reuse a parent's buffer (inplace)
            if (ggml_op_can_inplace(node->op)) {
@ -499,15 +501,14 @@ static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
                                AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
                                node->view_src = view_src;
                                view_src_hn->n_views += 1;
-                                init_view(galloc, node);
+                                init_view(galloc, node, false);
                                return;
                            }
-                        }
-                        else {
+                        } else {
                            AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
                            node->view_src = parent;
                            p_hn->n_views += 1;
-                            init_view(galloc, node);
+                            init_view(galloc, node, false);
                            return;
                        }
                    }
@ -537,7 +538,7 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
            hash_get(galloc, view_src)->n_views += 1;
            if (node->buffer == NULL && node->data != NULL) {
                // view of a pre-allocated tensor, didn't call init_view() yet
-                init_view(galloc, node);
+                init_view(galloc, node, true);
            }
        }

@ -548,7 +549,7 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
            }
            hash_get(galloc, parent)->n_children += 1;
            if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
-                init_view(galloc, parent);
+                init_view(galloc, parent, true);
            }
        }
   }
@ -663,7 +664,7 @@ size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, st
    return max_size;
 }

-void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_alloct) {
+void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) {
    const size_t hash_size = hash_set.size;

    GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
@ -686,7 +687,7 @@ void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * grap
    // reset hash values
    memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);

-    galloc->hash_allocs = hash_node_alloct;
+    galloc->hash_allocs = hash_node_talloc;

    ggml_tallocr_alloc_graph_impl(galloc, graph);

--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@ -17,7 +17,12 @@ extern "C" {

 #define GGML_CUDA_MAX_DEVICES       16

+// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
 GGML_API void   ggml_init_cublas(void);
+
+// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
+GGML_API bool   ggml_cublas_loaded(void);
+
 GGML_API void * ggml_cuda_host_malloc(size_t size);
 GGML_API void   ggml_cuda_host_free(void * ptr);

--- a/ggml-metal.h
+++ b/ggml-metal.h
@ -52,6 +52,11 @@ void ggml_metal_free(struct ggml_metal_context * ctx);
 void * ggml_metal_host_malloc(size_t n);
 void   ggml_metal_host_free  (void * data);

+// helper to check if the device supports a specific family
+// ideally, the user code should be doing these checks
+// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
+bool ggml_metal_supports_family(struct ggml_metal_context * ctx, int family);
+
 // set the number of command buffers to use
 void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);

@ -100,6 +105,8 @@ GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);

 GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);

+GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
+
 #ifdef __cplusplus
 }
 #endif
--- a/ggml-metal.m
+++ b/ggml-metal.m
@ -346,9 +346,9 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
    }

    GGML_METAL_LOG_INFO("%s: hasUnifiedMemory              = %s\n",       __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
-    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+    GGML_METAL_LOG_INFO("%s: recommendedMaxWorkingSetSize  = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1e6);
    if (ctx->device.maxTransferRate != 0) {
-        GGML_METAL_LOG_INFO("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
+        GGML_METAL_LOG_INFO("%s: maxTransferRate               = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1e6);
    } else {
        GGML_METAL_LOG_INFO("%s: maxTransferRate               = built-in GPU\n", __func__);
    }
@ -459,6 +459,10 @@ void ggml_metal_host_free(void * data) {
    free(data);
 }

+bool ggml_metal_supports_family(struct ggml_metal_context * ctx, int family) {
+    return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
+}
+
 void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
    ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
 }
@ -541,11 +545,11 @@ bool ggml_metal_add_buffer(
            ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];

            if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
+                GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1e6);
                return false;
            }

-            GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
+            GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1e6);

            ++ctx->n_buffers;
        } else {
@ -565,11 +569,11 @@ bool ggml_metal_add_buffer(
                ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];

                if (ctx->buffers[ctx->n_buffers].metal == nil) {
-                    GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
+                    GGML_METAL_LOG_ERROR("%s: error: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1e6);
                    return false;
                }

-                GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
+                GGML_METAL_LOG_INFO("%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1e6, i);
                if (i + size_step < size) {
                    GGML_METAL_LOG_INFO("\n");
                }
@ -580,8 +584,8 @@ bool ggml_metal_add_buffer(

 #if TARGET_OS_OSX
        GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
-                ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
-                ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+                ctx->device.currentAllocatedSize / 1e6,
+                ctx->device.recommendedMaxWorkingSetSize / 1e6);

        if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
            GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
@ -589,7 +593,7 @@ bool ggml_metal_add_buffer(
            GGML_METAL_LOG_INFO("\n");
        }
 #else
-        GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1024.0 / 1024.0);
+        GGML_METAL_LOG_INFO(", (%8.2f)\n", ctx->device.currentAllocatedSize / 1e6);
 #endif
    }

@ -1072,7 +1076,7 @@ void ggml_metal_graph_compute(
                            GGML_ASSERT(ne00 == ne10);
                            GGML_ASSERT(ne03 == ne13);

-                            const uint gqa = ne12/ne02;
+                            const unsigned int gqa = ne12/ne02;

                            // find the break-even point where the matrix-matrix kernel becomes more efficient compared
                            // to the matrix-vector kernel
@ -1751,3 +1755,9 @@ void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {

    ggml_metal_set_n_cb(ctx, n_cb);
 }
+
+bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family) {
+    struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+
+    return ggml_metal_supports_family(ctx, family);
+}
--- a/ggml-quants.c
+++ b/ggml-quants.c
@ -1368,7 +1368,12 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f
    float max = x[0];
    float sum_w = weights[0];
    float sum_x = sum_w * x[0];
+#ifdef HAVE_BUGGY_APPLE_LINKER
+    // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
+    for (volatile int i = 1; i < n; ++i) {
+#else
    for (int i = 1; i < n; ++i) {
+#endif
        if (x[i] < min) min = x[i];
        if (x[i] > max) max = x[i];
        float w = weights[i];
--- a/ggml.c
+++ b/ggml.c
@ -5024,8 +5024,13 @@ struct ggml_tensor * ggml_rope_back(
        int                   n_dims,
        int                   mode,
        int                   n_ctx,
+        int                   n_orig_ctx,
        float                 freq_base,
        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow,
        float                 xpos_base,
        bool                  xpos_down) {
    GGML_ASSERT(ggml_is_vector(b));
@ -5042,11 +5047,15 @@ struct ggml_tensor * ggml_rope_back(

    struct ggml_tensor * result = ggml_dup_tensor(ctx, a);

-    int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx };
-    memcpy(params + 4, &freq_base,  sizeof(float));
-    memcpy(params + 5, &freq_scale, sizeof(float));
-    memcpy(params + 6, &xpos_base,  sizeof(float));
-    memcpy(params + 7, &xpos_down,  sizeof(bool));
+    int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
+    memcpy(params +  5, &freq_base,    sizeof(float));
+    memcpy(params +  6, &freq_scale,   sizeof(float));
+    memcpy(params +  7, &ext_factor,   sizeof(float));
+    memcpy(params +  8, &attn_factor,  sizeof(float));
+    memcpy(params +  9, &beta_fast,    sizeof(float));
+    memcpy(params + 10, &beta_slow,    sizeof(float));
+    memcpy(params + 11, &xpos_base,    sizeof(float));
+    memcpy(params + 12, &xpos_down,    sizeof(bool));
    ggml_set_op_params(result, params, sizeof(params));

    result->op   = GGML_OP_ROPE_BACK;
@ -9376,7 +9385,6 @@ static bool ggml_compute_forward_mul_mat_use_blas(
 }
 #endif

-
 static void ggml_compute_forward_mul_mat(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
@ -10946,7 +10954,8 @@ static void ggml_compute_forward_rope_f32(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
+        struct ggml_tensor * dst,
+        const bool forward) {
    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }
@ -11005,6 +11014,11 @@ static void ggml_compute_forward_rope_f32(
    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;

+    // backward process uses inverse rotation by cos and sin.
+    // cos and sin build a rotation matrix, where the inverse is the transpose.
+    // this essentially just switches the sign of sin.
+    const float sin_sign = forward ? 1.0f : -1.0f;
+
    const int32_t * pos = (const int32_t *) src1->data;

    for (int64_t i3 = 0; i3 < ne3; i3++) {
@ -11021,9 +11035,9 @@ static void ggml_compute_forward_rope_f32(
                    float block_theta = MAX(p - (n_ctx - 2), 0);
                    for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
+                        const float sin_theta = sinf(theta_base) * sin_sign;
                        const float cos_block_theta = cosf(block_theta);
-                        const float sin_block_theta = sinf(block_theta);
+                        const float sin_block_theta = sinf(block_theta) * sin_sign;

                        theta_base *= theta_scale;
                        block_theta *= theta_scale;
@ -11047,6 +11061,7 @@ static void ggml_compute_forward_rope_f32(
                        rope_yarn(
                            theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
                        );
+                        sin_theta *= sin_sign;

                        // zeta scaling for xPos only:
                        float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
@ -11077,6 +11092,7 @@ static void ggml_compute_forward_rope_f32(
                                theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
                                &cos_theta, &sin_theta
                            );
+                            sin_theta *= sin_sign;

                            theta_base *= theta_scale;

@ -11102,7 +11118,8 @@ static void ggml_compute_forward_rope_f16(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
+        struct ggml_tensor * dst,
+        const bool forward) {
    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
        return;
    }
@ -11154,6 +11171,11 @@ static void ggml_compute_forward_rope_f16(
    const bool is_neox = mode & 2;
    const bool is_glm  = mode & 4;

+    // backward process uses inverse rotation by cos and sin.
+    // cos and sin build a rotation matrix, where the inverse is the transpose.
+    // this essentially just switches the sign of sin.
+    const float sin_sign = forward ? 1.0f : -1.0f;
+
    const int32_t * pos = (const int32_t *) src1->data;

    for (int64_t i3 = 0; i3 < ne3; i3++) {
@ -11170,9 +11192,9 @@ static void ggml_compute_forward_rope_f16(
                    float block_theta = MAX(p - (n_ctx - 2), 0);
                    for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
+                        const float sin_theta = sinf(theta_base) * sin_sign;
                        const float cos_block_theta = cosf(block_theta);
-                        const float sin_block_theta = sinf(block_theta);
+                        const float sin_block_theta = sinf(block_theta) * sin_sign;

                        theta_base *= theta_scale;
                        block_theta *= theta_scale;
@ -11196,6 +11218,7 @@ static void ggml_compute_forward_rope_f16(
                        rope_yarn(
                            theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
                        );
+                        sin_theta *= sin_sign;

                        theta_base *= theta_scale;

@ -11222,6 +11245,7 @@ static void ggml_compute_forward_rope_f16(
                                theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
                                &cos_theta, &sin_theta
                            );
+                            sin_theta *= sin_sign;

                            theta_base *= theta_scale;

@ -11251,11 +11275,11 @@ static void ggml_compute_forward_rope(
    switch (src0->type) {
        case GGML_TYPE_F16:
            {
-                ggml_compute_forward_rope_f16(params, src0, src1, dst);
+                ggml_compute_forward_rope_f16(params, src0, src1, dst, true);
            } break;
        case GGML_TYPE_F32:
            {
-                ggml_compute_forward_rope_f32(params, src0, src1, dst);
+                ggml_compute_forward_rope_f32(params, src0, src1, dst, true);
            } break;
        default:
            {
@ -11266,216 +11290,6 @@ static void ggml_compute_forward_rope(

 // ggml_compute_forward_rope_back

-static void ggml_compute_forward_rope_back_f32(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // y = rope(x, src1)
-    // dx = rope_back(dy, src1)
-    // src0 is dy, src1 contains options
-
-    float freq_base;
-    float freq_scale;
-
-    // these two only relevant for xPos RoPE:
-    float xpos_base;
-    bool xpos_down;
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_dims = ((int32_t *) dst->op_params)[1];
-    const int mode   = ((int32_t *) dst->op_params)[2];
-    const int n_ctx  = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
-    memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
-    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
-    memcpy(&xpos_base,  (int32_t *) dst->op_params + 6, sizeof(float));
-    memcpy(&xpos_down,  (int32_t *) dst->op_params + 7, sizeof(bool));
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
-
-    assert(nb0 == sizeof(float));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(dst);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    // row index used to determine which thread to use
-    int ir = 0;
-
-    const float theta_scale = powf(freq_base, -2.0f/n_dims);
-
-    const bool is_neox = mode & 2;
-
-    const int32_t * pos = (const int32_t *) src1->data;
-
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                if (ir++ < ir0) continue;
-                if (ir   > ir1) break;
-
-                float theta_base = freq_scale * (float)p;
-
-                if (!is_neox) {
-                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
-
-                        // zeta scaling for xPos only:
-                        float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
-                        if (xpos_down) zeta = 1.0f / zeta;
-
-                        theta_base *= theta_scale;
-
-                        const float * const dy  = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                              float *       dx  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                        const float dy0 = dy[0];
-                        const float dy1 = dy[1];
-
-                        dx[0] =   dy0*cos_theta*zeta + dy1*sin_theta*zeta;
-                        dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
-                    }
-                } else {
-                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
-                            const float cos_theta = cosf(theta_base);
-                            const float sin_theta = sinf(theta_base);
-
-                            theta_base *= theta_scale;
-
-                            const int64_t i0 = ib*n_dims + ic/2;
-
-                            const float * const dy  = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                                  float *       dx  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                            const float dy0 = dy[0];
-                            const float dy1 = dy[n_dims/2];
-
-                            dx[0]        =   dy0*cos_theta + dy1*sin_theta;
-                            dx[n_dims/2] = - dy0*sin_theta + dy1*cos_theta;
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
-static void ggml_compute_forward_rope_back_f16(
-        const struct ggml_compute_params * params,
-        const struct ggml_tensor * src0,
-        const struct ggml_tensor * src1,
-        struct ggml_tensor * dst) {
-
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return;
-    }
-
-    // y = rope(x, src1)
-    // dx = rope_back(dy, src1)
-    // src0 is dy, src1 contains options
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_dims = ((int32_t *) dst->op_params)[1];
-    const int mode   = ((int32_t *) dst->op_params)[2];
-
-    GGML_TENSOR_UNARY_OP_LOCALS
-
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
-
-    assert(nb0 == sizeof(ggml_fp16_t));
-
-    const int ith = params->ith;
-    const int nth = params->nth;
-
-    const int nr = ggml_nrows(dst);
-
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-
-    // row index used to determine which thread to use
-    int ir = 0;
-
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
-
-    const bool is_neox = mode & 2;
-
-    const int32_t * pos = (const int32_t *) src1->data;
-
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                if (ir++ < ir0) continue;
-                if (ir   > ir1) break;
-
-                float theta_base = (float)p;
-
-                if (!is_neox) {
-                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
-
-                        theta_base *= theta_scale;
-
-                        const ggml_fp16_t * const dy  = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                              ggml_fp16_t *       dx  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                        const float dy0 = GGML_FP16_TO_FP32(dy[0]);
-                        const float dy1 = GGML_FP16_TO_FP32(dy[1]);
-
-                        dx[0] = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
-                        dx[1] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
-                    }
-                } else {
-                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
-                            const float cos_theta = cosf(theta_base);
-                            const float sin_theta = sinf(theta_base);
-
-                            theta_base *= theta_scale;
-
-                            const int64_t i0 = ib*n_dims + ic/2;
-
-                            const ggml_fp16_t * const dy  = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                                  ggml_fp16_t *       dx  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-
-                            const float dy0 = GGML_FP16_TO_FP32(dy[0]);
-                            const float dy1 = GGML_FP16_TO_FP32(dy[n_dims/2]);
-
-                            dx[0]        = GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
-                            dx[n_dims/2] = GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
 static void ggml_compute_forward_rope_back(
        const struct ggml_compute_params * params,
        const struct ggml_tensor * src0,
@ -11484,11 +11298,11 @@ static void ggml_compute_forward_rope_back(
    switch (src0->type) {
        case GGML_TYPE_F16:
            {
-                ggml_compute_forward_rope_back_f16(params, src0, src1, dst);
+                ggml_compute_forward_rope_f16(params, src0, src1, dst, false);
            } break;
        case GGML_TYPE_F32:
            {
-                ggml_compute_forward_rope_back_f32(params, src0, src1, dst);
+                ggml_compute_forward_rope_f32(params, src0, src1, dst, false);
            } break;
        default:
            {
@ -14926,14 +14740,17 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                    const int n_dims     = ((int32_t *) tensor->op_params)[1];
                    const int mode       = ((int32_t *) tensor->op_params)[2];
                    const int n_ctx      = ((int32_t *) tensor->op_params)[3];
-                    float freq_base;
-                    float freq_scale;
-                    float xpos_base;
-                    bool  xpos_down;
-                    memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
-                    memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
-                    memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
-                    memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));
+                    const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
+                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
+
+                    memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
+                    memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
+                    memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
+                    memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
+                    memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
+                    memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
+                    memcpy(&xpos_base,   (int32_t *) tensor->op_params + 11, sizeof(float));
+                    memcpy(&xpos_down,   (int32_t *) tensor->op_params + 12, sizeof(bool));

                    src0->grad = ggml_add_or_set(ctx,
                            src0->grad,
@ -14943,8 +14760,13 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                n_dims,
                                mode,
                                n_ctx,
+                                n_orig_ctx,
                                freq_base,
                                freq_scale,
+                                ext_factor,
+                                attn_factor,
+                                beta_fast,
+                                beta_slow,
                                xpos_base,
                                xpos_down),
                            zero_table);
@ -14957,14 +14779,17 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                    const int n_dims     = ((int32_t *) tensor->op_params)[1];
                    const int mode       = ((int32_t *) tensor->op_params)[2];
                    const int n_ctx      = ((int32_t *) tensor->op_params)[3];
-                    float freq_base;
-                    float freq_scale;
-                    float xpos_base;
-                    bool  xpos_down;
-                    memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
-                    memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
-                    memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
-                    memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));
+                    const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
+                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
+
+                    memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
+                    memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
+                    memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
+                    memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
+                    memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
+                    memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
+                    memcpy(&xpos_base,   (int32_t *) tensor->op_params + 11, sizeof(float));
+                    memcpy(&xpos_down,   (int32_t *) tensor->op_params + 12, sizeof(bool));

                    src0->grad = ggml_add_or_set(ctx,
                            src0->grad,
@ -14973,14 +14798,14 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                src1,
                                n_dims,
                                mode,
-                                0,
                                n_ctx,
+                                n_orig_ctx,
                                freq_base,
                                freq_scale,
-                                0.0f,
-                                1.0f,
-                                0.0f,
-                                0.0f,
+                                ext_factor,
+                                attn_factor,
+                                beta_fast,
+                                beta_slow,
                                xpos_base,
                                xpos_down,
                                false),
@ -18248,7 +18073,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
    {
        ctx->kv = malloc(ctx->header.n_kv * sizeof(struct gguf_kv));

-        for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
            struct gguf_kv * kv = &ctx->kv[i];

            //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@ -18295,7 +18120,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
                            case GGUF_TYPE_STRING:
                                {
                                    kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct gguf_str));
-                                    for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+                                    for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                                        ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset);
                                    }
                                } break;
@ -18323,7 +18148,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
    {
        ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct gguf_tensor_info));

-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
            struct gguf_tensor_info * info = &ctx->infos[i];

            for (int j = 0; j < GGML_MAX_DIMS; ++j) {
@ -18370,7 +18195,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
    // compute the total size of the data section, taking into account the alignment
    {
        ctx->size = 0;
-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
            struct gguf_tensor_info * info = &ctx->infos[i];

            const int64_t ne =
@ -18439,7 +18264,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
        ggml_set_no_alloc(ctx_data, true);

        // create the tensors
-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
            const int64_t ne[GGML_MAX_DIMS] = {
                ctx->infos[i].ne[0],
                ctx->infos[i].ne[1],
--- a/ggml.h
+++ b/ggml.h
@ -1371,8 +1371,13 @@ extern "C" {
            int                   n_dims,
            int                   mode,
            int                   n_ctx,
+            int                   n_orig_ctx,
            float                 freq_base,
            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow,
            float                 xpos_base,
            bool                  xpos_down);

--- a/models/README.md
+++ b/models/README.md
@ -39,19 +39,19 @@ https://huggingface.co/ggerganov/whisper.cpp/tree/main

 ## Available models

-| Model     | Disk   | Mem     | SHA                                        |
-| ---       | ---    | ---     | ---                                        |
-| tiny      |  75 MB | ~390 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
-| tiny.en   |  75 MB | ~390 MB | `c78c86eb1a8faa21b369bcd33207cc90d64ae9df` |
-| base      | 142 MB | ~500 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
-| base.en   | 142 MB | ~500 MB | `137c40403d78fd54d454da0f9bd998f78703390c` |
-| small     | 466 MB | ~1.0 GB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
-| small.en  | 466 MB | ~1.0 GB | `db8a495a91d927739e50b3fc1cc4c6b8f6c2d022` |
-| medium    | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
-| medium.en | 1.5 GB | ~2.6 GB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` |
-| large-v1  | 2.9 GB | ~4.7 GB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
-| large-v2  | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
-| large     | 2.9 GB | ~4.7 GB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
+| Model     | Disk    | SHA                                        |
+| ---       | ---     | ---                                        |
+| tiny      |  75 MiB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
+| tiny.en   |  75 MiB | `c78c86eb1a8faa21b369bcd33207cc90d64ae9df` |
+| base      | 142 MiB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
+| base.en   | 142 MiB | `137c40403d78fd54d454da0f9bd998f78703390c` |
+| small     | 466 MiB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
+| small.en  | 466 MiB | `db8a495a91d927739e50b3fc1cc4c6b8f6c2d022` |
+| medium    | 1.5 GiB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
+| medium.en | 1.5 GiB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` |
+| large-v1  | 2.9 GiB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
+| large-v2  | 2.9 GiB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
+| large-v3  | 2.9 GiB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |

 ## Model files for testing purposes

@ -76,3 +76,27 @@ git clone https://huggingface.co/openai/whisper-medium
 # convert the model to ggml
 python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-medium/ ./whisper .
 ```
+
+## Distilled models
+
+Initial support for https://huggingface.co/distil-whisper is available.
+
+Currently, the chunk-based transcription strategy is not implemented, so there can be sub-optimal quality when using the distilled models with `whisper.cpp`.
+
+```bash
+# clone OpenAI whisper and whisper.cpp
+git clone https://github.com/openai/whisper
+git clone https://github.com/ggerganov/whisper.cpp
+
+# get the models
+cd whisper.cpp/models
+git clone https://huggingface.co/distil-whisper/distil-medium.en
+git clone https://huggingface.co/distil-whisper/distil-large-v2
+
+# convert to ggml
+python3 ./convert-h5-to-ggml.py ./distil-medium.en/ ../../whisper .
+mv ggml-model.bin ggml-medium.en-distil.bin
+
+python3 ./convert-h5-to-ggml.py ./distil-large-v2/ ../../whisper .
+mv ggml-model.bin ggml-large-v2-distil.bin
+```
--- a/models/convert-h5-to-coreml.py
+++ b/models/convert-h5-to-coreml.py
@ -78,14 +78,14 @@ def convert_hf_whisper(hf_model_name_or_path: str, whisper_state_path: str):
 # Ported from models/convert-whisper-to-coreml.py
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
+    parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2, large-v3)", required=True)
    parser.add_argument("--model-path", type=str, help="path to the model (e.g. if published on HuggingFace: Oblivion208/whisper-tiny-cantonese)", required=True)
    parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
    parser.add_argument("--quantize",     type=bool, help="quantize weights to F16", default=False)
    parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
    args = parser.parse_args()

-    if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
+    if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large-v1", "large-v2", "large-v3"]:
        raise ValueError("Invalid model name")

    pt_target_path = f"models/hf-{args.model_name}.pt"
--- a/models/convert-whisper-to-coreml.py
+++ b/models/convert-whisper-to-coreml.py
@ -296,13 +296,13 @@ def convert_decoder(hparams, model, quantize=False):

 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
+    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2, large-v3)", required=True)
    parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
    parser.add_argument("--quantize",     type=bool, help="quantize weights to F16", default=False)
    parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
    args = parser.parse_args()

-    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "small.en-tdrz", "medium", "medium.en", "large", "large-v1", "large-v2"]:
+    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "small.en-tdrz", "medium", "medium.en", "large-v1", "large-v2", "large-v3"]:
        raise ValueError("Invalid model name")

    whisper = load_model(args.model).cpu()
--- a/models/convert-whisper-to-openvino.py
+++ b/models/convert-whisper-to-openvino.py
@ -38,10 +38,10 @@ def convert_encoder(hparams, encoder, mname):

 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
+    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large-v1, large-v2, large-v3)", required=True)
    args = parser.parse_args()

-    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
+    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large-v1", "large-v2", "large-v3"]:
        raise ValueError("Invalid model name")

    whisper = load_model(args.model).cpu()
--- a/models/download-coreml-model.sh
+++ b/models/download-coreml-model.sh
@ -19,7 +19,7 @@ function get_script_path() {
 models_path="$(get_script_path)"

 # Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )

 # list available models
 function list_models {
--- a/models/download-ggml-model.cmd
+++ b/models/download-ggml-model.cmd
@ -8,7 +8,7 @@ popd
 set argc=0
 for %%x in (%*) do set /A argc+=1

-set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large
+set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3

 if %argc% neq 1 (
  echo.
--- a/models/download-ggml-model.sh
+++ b/models/download-ggml-model.sh
@ -42,7 +42,7 @@ models=(
    "medium.en-q5_0"
    "large-v1"
    "large-v2"
-    "large"
+    "large-v3"
    "large-q5_0"
 )

--- a/tests/run-tests.sh
+++ b/tests/run-tests.sh
@ -19,7 +19,7 @@
 cd `dirname $0`

 # Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )

 # list available models
 function list_models {
--- a/whisper.cpp
+++ b/whisper.cpp
--- a/whisper.h
+++ b/whisper.h
@ -50,7 +50,9 @@ extern "C" {
    //
    //     ...
    //
-    //     struct whisper_context * ctx = whisper_init_from_file("/path/to/ggml-base.en.bin");
+    //     whisper_context_params cparams = whisper_context_default_params();
+    //
+    //     struct whisper_context * ctx = whisper_init_from_file_with_params("/path/to/ggml-base.en.bin", cparams);
    //
    //     if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
    //         fprintf(stderr, "failed to process audio\n");
@ -78,7 +80,9 @@ extern "C" {
    struct whisper_state;
    struct whisper_full_params;

-    typedef int whisper_token;
+    typedef int32_t whisper_pos;
+    typedef int32_t whisper_token;
+    typedef int32_t whisper_seq_id;

    struct whisper_context_params {
        bool  use_gpu;
@ -311,6 +315,9 @@ extern "C" {
    // Return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found
    WHISPER_API const char * whisper_lang_str(int id);

+    // Return the short string of the specified language name (e.g. 2 -> "german"), returns nullptr if not found
+    WHISPER_API const char * whisper_lang_str_full(int id);
+
    // Use mel data at offset_ms to try and auto-detect the spoken language
    // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first
    // Returns the top language id or negative on failure
Author	SHA1	Message	Date
Georgi Gerganov	c8b3bc6a0d	cuda : use CUBLAS_COMPTE_F32 insted of CUBLAS_COMPUTE_F16	2023-11-27 11:57:07 +02:00
bobqianic	f52e74d4dc	CI : Rectify the Clang-Related workflow issues (#1551 ) * fix bugs in workflow * fix missing clang in workflow * Update build.yml	2023-11-27 11:35:37 +02:00
Ismatulla Mansurov	23c21e92eb	server : automatically convert audio on the server (#1539 ) * server : automatically convert audio on the server * server : remove rebundant comments * server : automatic conversion refactor * server : update server readme * server : remove unnecessary comments and tabs * server : put back remove calling * server : apply suggestions from code review Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * server : check ffmpeg before the server lunch * server : fix indentation * Apply suggestions from code review Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * server : fix function typo calling * server : fix function typo calling * server : add warning in readme --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-11-27 11:28:34 +02:00
Georgi Gerganov	447d49530c	whisper : remove trailing whitespaces	2023-11-24 13:13:21 +02:00
Georgi Gerganov	9d6ebd877c	release : v1.5.1	2023-11-24 12:41:55 +02:00
Georgi Gerganov	0ba365f958	metal : add backend function to check device family support (#1547 )	2023-11-24 12:37:08 +02:00
Georgi Gerganov	010c8ec3ab	cuda : sync some minor stuff from llama.cpp (#1548 )	2023-11-24 12:36:21 +02:00
Georgi Gerganov	ffdb5c4735	whisper : fix typo	2023-11-24 09:45:10 +02:00
ecneladis	a5881d619c	server : add --print-realtime param (#1541 ) * server : add --print-realtime param * Fix duplicate realtime output	2023-11-24 09:35:02 +02:00
bradmit	34f70b3a56	whisper : add whisper_lang_str_full (#1546 ) * Update whisper.h add whisper_lang_fullstr to retrieve the full language name * Update whisper.cpp add whisper_lang_fullstr to return the full language name * fullstr -> str_full --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-11-24 09:33:13 +02:00
Okabintaro	8328d1900f	fix(server): typo in temperature parameter (#1545 ) Also fixed another typo in comments.	2023-11-23 20:59:36 +02:00
sandrohanea	d2bd5f0bdc	metal : fix build (#1544 )	2023-11-23 20:20:53 +02:00
Georgi Gerganov	34209a37a2	readme : add server example	2023-11-23 17:20:33 +02:00
Gleicon Moraes	180e062eda	go : fixed Makefile for MacOS ARM 64 (#1530 ) * Fixed Makefile for MacOS ARM 64 based on https://github.com/ggerganov/whisper.cpp/issues/1344 + proper ggml-metal env var setting * conditional to fix broken non-macos compilation * spaces -> tab * make : fix whitespaces --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-11-22 18:08:11 +02:00
Felix	5c7be85fdc	Change temp file name for server application (#1535 ) Avoid issue of removing file if it exists in the current working directory	2023-11-22 09:23:36 +01:00
Georgi Gerganov	146169ec38	bench : pass memcpy threads from cli	2023-11-21 22:27:22 +02:00
Georgi Gerganov	9befab5ab9	bench : multi-thread memcpy (#1534 )	2023-11-21 22:07:30 +02:00
Felix	9ac88f2b57	Close file after writing in server application (#1533 ) Fix of mistake leaving file open while reading it again as wav	2023-11-21 20:36:10 +01:00
Georgi Gerganov	46f5b6cb08	server : add video to readme	2023-11-21 17:30:43 +02:00
Felix	eff3570f78	server : add a REST Whisper server example with OAI-like API (#1380 ) * Add first draft of server * Added json support and base funcs for server.cpp * Add more user input via api-request also some clean up * Add reqest params and load post function Also some general clean up * Remove unused function * Add readme * Add exception handlers * Update examples/server/server.cpp * make : add server target * Add magic curl syntax Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-11-20 21:40:24 +02:00
M. A. Ali	fa19bc4195	whisper : update example in whisper.h (#1529 ) update the example in the header, previous examples deprecated.	2023-11-20 20:52:27 +02:00
Georgi Gerganov	a01b2e0971	sdl : fix audio callback (#1523 )	2023-11-20 13:16:38 +02:00
Georgi Gerganov	8159a9ab99	whisper : reuse whisper_decode_with_state (#1521 )	2023-11-20 13:16:11 +02:00
Tamotsu Takahashi	7516d9c16d	ci : redistribute CUDA DLLs (#1522 ) see https://docs.nvidia.com/cuda/eula/index.html#attachment-a	2023-11-19 12:43:22 +02:00
sandrohanea	46cc26d1b9	whisper : fix with_state methods to use the correct state (#1519 ) Co-authored-by: Sandro Hanea <sandrohanea@microsoft.com>	2023-11-19 11:25:30 +02:00
Georgi Gerganov	f784f9fa12	whisper : fix overriding the audio context	2023-11-19 10:32:32 +02:00
Georgi Gerganov	ca23f8ee6d	cuda : assert ggml_add sources to be contiguous	2023-11-19 10:32:08 +02:00
Georgi Gerganov	e2f0eba2d4	ios : sync submodule	2023-11-17 10:42:04 +02:00
Georgi Gerganov	d4353e48f7	sync : ggml (ggml-alloc + linker + gguf fixes) (#1501 )	2023-11-17 10:00:07 +02:00
Georgi Gerganov	bebf0da983	quantize : add support for K-quant types	2023-11-16 16:18:24 +02:00
Georgi Gerganov	848e54f3ad	bench : fix memcpy bench size	2023-11-16 10:59:32 +02:00
Sam Pullara	7883d1cae4	talk-llama : improve quote and backtick handling (#1364 ) * ISSUE-1329: replace " with ' so it doesn't try to execute code in backticks. * Typo * Update to keep possessives in the output Closes the ' then puts a ' in quotes then reopens the ' to escape the ' characters.	2023-11-16 10:34:05 +02:00
Georgi Gerganov	ccc85b4ff8	talk-llama : enable GPU by default	2023-11-15 21:33:00 +02:00
Georgi Gerganov	c7606b47df	models : add info about distilled models	2023-11-15 21:10:13 +02:00
Georgi Gerganov	d38af151a1	release : v1.5.0	2023-11-15 21:02:52 +02:00
Georgi Gerganov	94267df08e	bench-all : add distil models	2023-11-15 20:49:12 +02:00
Georgi Gerganov	8713c67133	js : latest whisper.js	2023-11-15 20:10:16 +02:00
Georgi Gerganov	57a60639bb	bench-all : indentations	2023-11-15 20:01:15 +02:00
Georgi Gerganov	bfbaa4dce5	whisper : make large version explicit + fix data size units (#1493 )	2023-11-15 19:42:25 +02:00
Georgi Gerganov	1d79e78402	java : fix test (#1492 )	2023-11-15 17:42:53 +02:00
Georgi Gerganov	b6c5f49b78	whisper : add batched decoding (#1486 ) * whisper : add whisper_batch * whisper : move kv_self to whisper_state * whisper : full batched decoding support * whisper : fix memory leak in whisper_batch * whisper : fix mem leak again + remove oboslete function * whisper : clear kv cache when using whisper_decode API * whisper : speed-up sampling * whisper : fix decoders initializer * bench : add batch size 5 bench * whisper : add comment about the KV cache size * whisper : add check for max number of decoders * whisper : avoid starting sampling threads with bs=1 * whisper : enable beam-search by default * cuda : sync llama.cpp fixes	2023-11-15 16:12:52 +02:00