whisper : alternative way to handle the external encoders

whisper : fix usage of extenral encoders (e.g. CoreML)
talk-llama : sync llama.cpp
2025-07-01 23:10:47 +02:00 · 2024-02-12 16:32:26 +02:00 · 2024-02-12 15:21:21 +02:00 · 2024-02-12 10:39:58 +02:00 · 2024-02-12 09:32:15 +02:00 · 2024-02-12 09:31:12 +02:00
89 changed files with 44126 additions and 8731 deletions
--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@ -0,0 +1,38 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=12.3.1
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the CUDA runtime image
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
+WORKDIR /app
+
+# Unless otherwise specified, we make a fat build.
+ARG CUDA_DOCKER_ARCH=all
+# Set nvcc architecture
+ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
+# Enable cuBLAS
+ENV WHISPER_CUBLAS=1
+
+RUN apt-get update && \
+    apt-get install -y build-essential \
+    && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
+
+# Ref: https://stackoverflow.com/a/53464012
+ENV CUDA_MAIN_VERSION=12.3
+ENV LD_LIBRARY_PATH /usr/local/cuda-${CUDA_MAIN_VERSION}/compat:$LD_LIBRARY_PATH
+
+COPY .. .
+RUN make
+
+FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
+WORKDIR /app
+
+RUN apt-get update && \
+  apt-get install -y curl ffmpeg \
+  && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
+
+COPY --from=build /app /app
+ENTRYPOINT [ "bash", "-c" ]
--- a/.devops/main.Dockerfile
+++ b/.devops/main.Dockerfile
@ -0,0 +1,19 @@
+FROM ubuntu:22.04 AS build
+WORKDIR /app
+
+RUN apt-get update && \
+  apt-get install -y build-essential \
+  && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
+
+COPY .. .
+RUN make
+
+FROM ubuntu:22.04 AS runtime
+WORKDIR /app
+
+RUN apt-get update && \
+  apt-get install -y curl ffmpeg \
+  && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
+
+COPY --from=build /app /app
+ENTRYPOINT [ "bash", "-c" ]
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -117,7 +117,6 @@ jobs:
            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
            set -e
            apt update
-            apt install -y clang
            apt install -y clang build-essential cmake libsdl2-dev
            cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
            make
@ -167,7 +166,7 @@ jobs:
            s2arc: x64
            jnaPath: win32-x86-64
          - sdl2: ON
-            s2ver: 2.26.0
+            s2ver: 2.28.5

    steps:
      - name: Clone
@ -224,11 +223,14 @@ jobs:
          - arch: Win32
            obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x86.zip
            s2arc: x86
+            clblast: OFF
          - arch: x64
            obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x64.zip
            s2arc: x64
+            clblast: ON
+            clver: 1.6.1
          - sdl2: ON
-            s2ver: 2.26.0
+            s2ver: 2.28.5

    steps:
      - name: Clone
@ -253,6 +255,18 @@ jobs:
          7z x sdl2.zip
          echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV

+      - name: Install OpenCL
+        if: matrix.clblast == 'ON'
+        run: vcpkg.exe --triplet=${{ matrix.arch }}-windows install opencl
+
+      - name: Fetch CLBlast and set CLBlast_DIR
+        if: matrix.clblast == 'ON'
+        run: |
+          C:/msys64/usr/bin/wget.exe -qO clblast.zip https://github.com/CNugteren/CLBlast/releases/download/${{ matrix.clver }}/CLBlast-${{ matrix.clver }}-windows-x64.zip
+          7z x clblast.zip
+          7z x CLBlast-${{ matrix.clver }}-windows-x64.7z
+          echo "CLBlast_DIR=$env:GITHUB_WORKSPACE/CLBlast-${{ matrix.clver }}-windows-x64/lib/cmake/CLBlast" >> $env:GITHUB_ENV
+
      - name: Configure
        run: >
          cmake -S . -B ./build -A ${{ matrix.arch }}
@ -260,6 +274,7 @@ jobs:
          -DWHISPER_OPENBLAS=${{ matrix.blas }}
          -DCMAKE_LIBRARY_PATH="$env:OPENBLAS_PATH/lib"
          -DWHISPER_SDL2=${{ matrix.sdl2 }}
+          -DWHISPER_CLBLAST=${{ matrix.clblast }}

      - name: Build
        run: |
@ -274,11 +289,15 @@ jobs:
        if: matrix.sdl2 == 'ON'
        run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}

+      - name: Copy clblast.dll
+        if: matrix.clblast == 'ON'
+        run: copy "$env:CLBlast_DIR/../../clblast.dll" build/bin/${{ matrix.build }}
+
      - name: Upload binaries
        if: matrix.blas == 'ON' && matrix.sdl2 == 'ON'
        uses: actions/upload-artifact@v1
        with:
-          name: whisper-blas-bin-${{ matrix.arch }}
+          name: whisper-blas${{ matrix.clblast == 'ON' && '-clblast' || ''}}-bin-${{ matrix.arch }}
          path: build/bin/${{ matrix.build }}

  windows-cublas:
@ -295,7 +314,7 @@ jobs:
          - arch: x64
            s2arc: x64
          - sdl2: ON
-            s2ver: 2.26.0
+            s2ver: 2.28.5

    steps:
      - name: Clone
@ -321,7 +340,8 @@ jobs:
        run: >
          cmake -S . -B ./build -A ${{ matrix.arch }}
          -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-          -DWHISPER_CUBLAS=1
+          -DWHISPER_CUBLAS=${{ matrix.cublas }}
+          -DWHISPER_SDL2=${{ matrix.sdl2 }}

      - name: Build ${{ matrix.cuda-toolkit }}
        run: |
@ -396,6 +416,14 @@ jobs:
    steps:
      - name: Clone
        uses: actions/checkout@v3
+        with:
+          path: whisper
+
+      - name: Clone
+        uses: actions/checkout@v3
+        with:
+          repository: ggerganov/ggml
+          path: ggml

      - name: Install Java
        uses: actions/setup-java@v3
@ -408,9 +436,15 @@ jobs:

      - name: Build
        run: |
-          cd examples/whisper.android
+          cd whisper/examples/whisper.android
          ./gradlew assembleRelease --no-daemon

+      - name: Build with external ggml
+        run: |
+          export PATH_TO_GGML=$PWD/ggml
+          cd whisper/examples/whisper.android
+          ./gradlew assembleRelease --no-daemon -PGGML_HOME=$PATH_TO_GGML
+
  android_java:
    runs-on: ubuntu-latest

--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@ -0,0 +1,57 @@
+name: Publish Docker image
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+
+jobs:
+  push_to_registry:
+    name: Push Docker image to Docker Hub
+    if: github.event.pull_request.draft == false
+
+    runs-on: ubuntu-latest
+    env:
+      COMMIT_SHA: ${{ github.sha }}
+    strategy:
+      matrix:
+        config:
+          - { tag: "main", dockerfile: ".devops/main.Dockerfile", platform: "linux/amd64,linux/arm64" }
+          - { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }
+
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v3
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build and push Docker image (versioned)
+        if: github.event_name == 'push'
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          push: true
+          platforms: ${{ matrix.config.platforms }}
+          tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+          file: ${{ matrix.config.dockerfile }}
+
+      - name: Build and push Docker image (tagged)
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          push: ${{ github.event_name == 'push' }}
+          platforms: ${{ matrix.config.platforms }}
+          tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}"
+          file: ${{ matrix.config.dockerfile }}
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,6 +1,6 @@
 cmake_minimum_required (VERSION 3.5)

-project(whisper.cpp VERSION 1.5.2)
+project(whisper.cpp VERSION 1.5.4)

 # Add path to modules
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@ -68,6 +68,7 @@ if (APPLE)
    option(WHISPER_METAL_NDEBUG          "whisper: disable Metal debugging"      OFF)
    option(WHISPER_COREML                "whisper: enable Core ML framework"     OFF)
    option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback"    OFF)
+    option(WHISPER_METAL_EMBED_LIBRARY   "whisper: embed Metal library"          OFF)
 else()
    option(WHISPER_BLAS                  "whisper: use BLAS libraries"  OFF)
    option(WHISPER_BLAS_VENDOR           "whisper: BLAS library vendor" Generic)
@ -147,6 +148,30 @@ if (APPLE)

        # copy ggml-metal.metal to bin directory
        configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
+
+        if (WHISPER_METAL_EMBED_LIBRARY)
+            enable_language(ASM)
+            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_METAL_EMBED_LIBRARY)
+
+            set(METALLIB_SOURCE "${CMAKE_SOURCE_DIR}/ggml-metal.metal")
+
+            file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
+            set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s")
+
+            add_custom_command(
+                OUTPUT ${EMBED_METALLIB_ASSEMBLY}
+                COMMAND echo ".section __DATA,__ggml_metallib" > ${EMBED_METALLIB_ASSEMBLY}
+                COMMAND echo ".globl _ggml_metallib_start" >> ${EMBED_METALLIB_ASSEMBLY}
+                COMMAND echo "_ggml_metallib_start:" >> ${EMBED_METALLIB_ASSEMBLY}
+                COMMAND echo ".incbin \\\"${METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY}
+                COMMAND echo ".globl _ggml_metallib_end" >> ${EMBED_METALLIB_ASSEMBLY}
+                COMMAND echo "_ggml_metallib_end:" >> ${EMBED_METALLIB_ASSEMBLY}
+                DEPENDS ${METALLIB_SOURCE}
+                COMMENT "Generate assembly for embedded Metal library"
+            )
+
+            set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY})
+        endif()
    endif()

    if (WHISPER_COREML)
@ -218,11 +243,17 @@ if (WHISPER_CUBLAS)
        add_compile_definitions(GGML_USE_CUBLAS)

        if (WHISPER_STATIC)
-            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+            if (WIN32)
+                # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
+                set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
+            else ()
+                set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+            endif()
        else()
            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
        endif()

+        set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cuda_driver)
    else()
        message(FATAL_ERROR "cuBLAS not found")
    endif()
@ -309,7 +340,8 @@ if (WHISPER_ALL_WARNINGS)
 endif()

 if (NOT MSVC)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla")
+    # TODO: temporary disabled until we figure out ggml-metal.m
+    #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla")
    #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-math-errno -ffinite-math-only -funsafe-math-optimizations")
 endif()

@ -338,8 +370,8 @@ else()
        endif()
    else()
        if (EMSCRIPTEN)
-            set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -pthread")
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
+            set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -pthread -s TOTAL_STACK=5242880")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -s TOTAL_STACK=5242880")
        else()
            if(NOT WHISPER_NO_AVX)
                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
@ -498,6 +530,7 @@ else()
 endif()

 if (BUILD_SHARED_LIBS)
+    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_link_libraries(${TARGET} PUBLIC
        ${CMAKE_DL_LIBS}
        )
@ -521,7 +554,13 @@ endif()

 if (GGML_SOURCES_CUDA)
    message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
-    set_property(TARGET whisper PROPERTY CUDA_ARCHITECTURES OFF)
+    # Only configure gmml CUDA architectures is not globally set
+    if (NOT DEFINED GGML_CUDA_ARCHITECTURES)
+        # Not overriden by user, so set defaults
+        set(GGML_CUDA_ARCHITECTURES 52 61 70)
+    endif()
+    message(STATUS "GGML Configuring CUDA architectures ${GGML_CUDA_ARCHITECTURES}")
+    set_property(TARGET whisper PROPERTY CUDA_ARCHITECTURES ${GGML_CUDA_ARCHITECTURES})
    set_property(TARGET whisper PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
 endif()

--- a/44
+++ b/44
@ -42,6 +42,12 @@ CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
 CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
 LDFLAGS  =

+ifdef MACOSX_DEPLOYMENT_TARGET
+	CFLAGS   += -mmacosx-version-min=$(MACOSX_DEPLOYMENT_TARGET)
+	CXXFLAGS += -mmacosx-version-min=$(MACOSX_DEPLOYMENT_TARGET)
+	LDFLAGS  += -mmacosx-version-min=$(MACOSX_DEPLOYMENT_TARGET)
+endif
+
 # clock_gettime came in POSIX.1b (1993)
 # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
 # posix_memalign came in POSIX.1-2001 / SUSv3
@ -99,6 +105,16 @@ ifeq ($(filter $(UNAME_S),Linux Darwin DragonFly FreeBSD NetBSD OpenBSD Haiku),$
 	CXXFLAGS += -pthread
 endif

+# detect Windows
+ifneq ($(findstring _NT,$(UNAME_S)),)
+	_WIN32 := 1
+endif
+
+# Windows Sockets 2 (Winsock) for network-capable apps
+ifeq ($(_WIN32),1)
+	LWINSOCK2 := -lws2_32
+endif
+
 # Architecture specific
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue
@ -107,7 +123,7 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 		CPUINFO_CMD := sysctl machdep.cpu.features machdep.cpu.leaf7_features
 	else ifeq ($(UNAME_S),Linux)
 		CPUINFO_CMD := cat /proc/cpuinfo
-	else ifneq (,$(filter MINGW32_NT% MINGW64_NT%,$(UNAME_S)))
+	else ifneq (,$(filter MINGW32_NT% MINGW64_NT% MSYS_NT%,$(UNAME_S)))
 		CPUINFO_CMD := cat /proc/cpuinfo
 	else ifneq (,$(filter DragonFly FreeBSD,$(UNAME_S)))
 		CPUINFO_CMD := grep Features /var/run/dmesg.boot
@ -199,14 +215,14 @@ endif

 ifdef WHISPER_CUBLAS
 	ifeq ($(shell expr $(NVCC_VERSION) \>= 11.6), 1)
-		CUDA_ARCH_FLAG=native
+		CUDA_ARCH_FLAG ?= native
 	else
-		CUDA_ARCH_FLAG=all
+		CUDA_ARCH_FLAG ?= all
 	endif

 	CFLAGS      += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	CXXFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
-	LDFLAGS     += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib
+	LDFLAGS     += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib
 	WHISPER_OBJ += ggml-cuda.o
 	NVCC        = nvcc
 	NVCCFLAGS   = --forward-unknown-to-host-compiler -arch=$(CUDA_ARCH_FLAG)
@ -329,6 +345,24 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
 	$(CC) $(CFLAGS) -c $< -o $@

 WHISPER_OBJ += ggml-metal.o
+
+ifdef WHISPER_METAL_EMBED_LIBRARY
+CFLAGS += -DGGML_METAL_EMBED_LIBRARY
+
+ggml-metal-embed.o: ggml-metal.metal
+	@echo "Embedding Metal library"
+	$(eval TEMP_ASSEMBLY=$(shell mktemp))
+	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
+	@echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
+	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
+	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
+	@$(AS) $(TEMP_ASSEMBLY) -o $@
+	@rm -f ${TEMP_ASSEMBLY}
+
+WHISPER_OBJ += ggml-metal-embed.o
+endif
 endif

 libwhisper.a: $(WHISPER_OBJ)
@ -360,7 +394,7 @@ quantize: examples/quantize/quantize.cpp $(WHISPER_OBJ) $(SRC_COMMON)
 	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o quantize $(LDFLAGS)

 server: examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o server $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o server $(LDFLAGS) $(LWINSOCK2)

 stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
--- a/Package.swift
+++ b/Package.swift
@ -13,9 +13,13 @@ let package = Package(
    products: [
        .library(name: "whisper", targets: ["whisper"]),
    ],
+    dependencies: [
+        .package(url: "https://github.com/ggerganov/ggml.git", .branch("release"))
+    ],
    targets: [
        .target(
            name: "whisper",
+            dependencies: ["ggml"],
            path: ".",
            exclude: [
               "bindings",
@ -32,14 +36,8 @@ let package = Package(
               "Makefile"
            ],
            sources: [
-                "ggml.c",
                "whisper.cpp",
-                "ggml-alloc.c",
-                "ggml-backend.c",
-                "ggml-quants.c",
-                "ggml-metal.m"
            ],
-            resources: [.process("ggml-metal.metal")],
            publicHeadersPath: "spm-headers",
            cSettings: [
                .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
--- a/README.md
+++ b/README.md
@ -6,7 +6,7 @@
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)

-Stable: [v1.5.2](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.5.2) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+Stable: [v1.5.4](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.5.4) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)

 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

@ -33,9 +33,10 @@ Supported platforms:
 - [x] [WebAssembly](examples/whisper.wasm)
 - [x] Windows ([MSVC](https://github.com/ggerganov/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggerganov/whisper.cpp/issues/168)]
 - [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/discussions/166)
+- [x] [docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)

 The entire high-level implementation of the model is contained in [whisper.h](whisper.h) and [whisper.cpp](whisper.cpp).
-The rest of the code is part of the [ggml](https://github.com/ggerganov/ggml) machine learning library.
+The rest of the code is part of the [`ggml`](https://github.com/ggerganov/ggml) machine learning library.

 Having such a lightweight implementation of the model allows to easily integrate it in different platforms and applications.
 As an example, here is a video of running the model on an iPhone 13 device - fully offline, on-device: [whisper.objc](examples/whisper.objc)
@ -60,22 +61,22 @@ Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)
 - Sample real-time audio transcription from the microphone is demonstrated in [stream.cpp](examples/stream)
 - Various other examples are available in the [examples](examples) folder

-The tensor operators are optimized heavily for Apple silicon CPUs. Depending on the computation size, Arm Neon SIMD
-intrinsics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
-the Accelerate framework utilizes the special-purpose AMX coprocessor available in modern Apple products.
+The tensor operators are optimized heavily for Apple silicon CPUs. Depending on the computation size, Arm Neon SIMD intrinsics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since the Accelerate framework utilizes the special-purpose AMX coprocessor available in modern Apple products.

 ## Quick start

-First clone the repository.
+First clone the repository:

-Then, download one of the Whisper models converted in [ggml format](models). For example:
+```bash
+git clone https://github.com/ggerganov/whisper.cpp.git
+```
+
+Then, download one of the Whisper [models](models/README.md) converted in [`ggml` format](#ggml-format). For example:

 ```bash
 bash ./models/download-ggml-model.sh base.en
 ```

-If you wish to convert the Whisper models to ggml format yourself, instructions are in [models/README.md](models/README.md).
-
 Now build the [main](examples/main) example and transcribe an audio file like this:

 ```bash
@ -90,7 +91,7 @@ make

 For a quick demo, simply run `make base.en`:

-```java
+```text
 $ make base.en

 cc  -I.              -O3 -std=c11   -pthread -DGGML_USE_ACCELERATE   -c ggml.c -o ggml.o
@ -206,7 +207,7 @@ For detailed usage instructions, run: `./main -h`
 Note that the [main](examples/main) example currently runs only with 16-bit WAV files, so make sure to convert your input before running the tool.
 For example, you can use `ffmpeg` like this:

-```java
+```bash
 ffmpeg -i input.mp3 -ar 16000 -ac 1 -c:a pcm_s16le output.wav
 ```

@ -238,9 +239,9 @@ make large-v3

 ## Memory usage

-| Model  | Disk    | Mem      |
-| ---    | ---     | ---      |
-| tiny   |  75 MiB | ~273 MB |
+| Model  | Disk    | Mem     |
+| ------ | ------- | ------- |
+| tiny   | 75 MiB  | ~273 MB |
 | base   | 142 MiB | ~388 MB |
 | small  | 466 MiB | ~852 MB |
 | medium | 1.5 GiB | ~2.1 GB |
@ -277,7 +278,7 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in

  - To ensure `coremltools` operates correctly, please confirm that [Xcode](https://developer.apple.com/xcode/) is installed and execute `xcode-select --install` to install the command-line tools.
  - Python 3.10 is recommended.
-  - [OPTIONAL] It is recommended to utilize a Python version management system, such as [Miniconda](https://docs.conda.io/en/latest/miniconda.html)  for this step:
+  - [OPTIONAL] It is recommended to utilize a Python version management system, such as [Miniconda](https://docs.conda.io/en/latest/miniconda.html) for this step:
    - To create an environment, use: `conda create -n py310-whisper python=3.10 -y`
    - To activate the environment, use: `conda activate py310-whisper`

@ -303,8 +304,8 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in

 - Run the examples as usual. For example:

-  ```bash
-  ./main -m models/ggml-base.en.bin -f samples/jfk.wav
+  ```text
+  $ ./main -m models/ggml-base.en.bin -f samples/jfk.wav

  ...

@ -332,7 +333,8 @@ This can result in significant speedup in encoder performance. Here are the inst
 - First, setup python virtual env. and install python dependencies. Python 3.10 is recommended.

  Windows:
-  ```
+
+  ```powershell
  cd models
  python -m venv openvino_conv_env
  openvino_conv_env\Scripts\activate
@ -341,7 +343,8 @@ This can result in significant speedup in encoder performance. Here are the inst
  ```

  Linux and macOS:
-  ```
+
+  ```bash
  cd models
  python3 -m venv openvino_conv_env
  source openvino_conv_env/bin/activate
@ -355,7 +358,7 @@ This can result in significant speedup in encoder performance. Here are the inst
  python convert-whisper-to-openvino.py --model base.en
  ```

-  This will produce ggml-base.en-encoder-openvino.xml/.bin IR model files. It's recommended to relocate these to the same folder as ggml models, as that
+  This will produce ggml-base.en-encoder-openvino.xml/.bin IR model files. It's recommended to relocate these to the same folder as `ggml` models, as that
  is the default location that the OpenVINO extension will search at runtime.

 - Build `whisper.cpp` with OpenVINO support:
@ -365,24 +368,28 @@ This can result in significant speedup in encoder performance. Here are the inst
  After downloading & extracting package onto your development system, set up required environment by sourcing setupvars script. For example:

  Linux:
+
  ```bash
  source /path/to/l_openvino_toolkit_ubuntu22_2023.0.0.10926.b4452d56304_x86_64/setupvars.sh
  ```

  Windows (cmd):
-  ```
+
+  ```powershell
  C:\Path\To\w_openvino_toolkit_windows_2023.0.0.10926.b4452d56304_x86_64\setupvars.bat
  ```

  And then build the project using cmake:
+
  ```bash
  cmake -B build -DWHISPER_OPENVINO=1
  cmake --build build -j --config Release
  ```

 - Run the examples as usual. For example:
-  ```bash
-  ./main -m models/ggml-base.en.bin -f samples/jfk.wav
+
+  ```text
+  $ ./main -m models/ggml-base.en.bin -f samples/jfk.wav

  ...

@ -433,7 +440,6 @@ cmake -B build -DWHISPER_CLBLAST=ON
 cmake --build build -j --config Release
 ```

-
 Run all the examples as usual.

 ## BLAS CPU support via OpenBLAS
@ -448,6 +454,38 @@ make clean
 WHISPER_OPENBLAS=1 make -j
 ```

+## Docker
+
+### Prerequisites
+
+- Docker must be installed and running on your system.
+- Create a folder to store big models & intermediate files (ex. /whisper/models)
+
+### Images
+
+We have two Docker images available for this project:
+
+1. `ghcr.io/ggerganov/whisper.cpp:main`: This image includes the main executable file as well as `curl` and `ffmpeg`. (platforms: `linux/amd64`, `linux/arm64`)
+2. `ghcr.io/ggerganov/whisper.cpp:main-cuda`: Same as `main` but compiled with CUDA support. (platforms: `linux/amd64`)
+
+### Usage
+
+```shell
+# download model and persist it in a local folder
+docker run -it --rm \
+  -v path/to/models:/models \
+  whisper.cpp:main "./models/download-ggml-model.sh base /models"
+# transcribe an audio file
+docker run -it --rm \
+  -v path/to/models:/models \
+  -v path/to/audios:/audios \
+  whisper.cpp:main "./main -m /models/ggml-base.bin -f /audios/jfk.wav"
+# transcribe an audio file in samples folder
+docker run -it --rm \
+  -v path/to/models:/models \
+  whisper.cpp:main "./main -m /models/ggml-base.bin -f ./samples/jfk.wav"
+```
+
 ## Limitations

 - Inference only
@ -460,7 +498,7 @@ in about half a minute on a MacBook M1 Pro, using `medium.en` model:
 <details>
  <summary>Expand to see the result</summary>

-```java
+```text
 $ ./main -m models/ggml-medium.en.bin -f samples/gb1.wav -t 8

 whisper_init_from_file: loading model from 'models/ggml-medium.en.bin'
@ -532,6 +570,7 @@ whisper_print_timings:   encode time = 18665.10 ms /     9 runs ( 2073.90 ms per
 whisper_print_timings:   decode time = 13090.93 ms /   549 runs (   23.85 ms per run)
 whisper_print_timings:    total time = 32733.52 ms
 ```
+
 </details>

 ## Real-time audio input example
@ -540,7 +579,7 @@ This is a naive example of performing real-time inference on audio from your mic
 The [stream](examples/stream) tool samples the audio every half a second and runs the transcription continuously.
 More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).

-```java
+```bash
 make stream
 ./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
 ```
@ -552,7 +591,7 @@ https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a
 Adding the `--print-colors` argument will print the transcribed text using an experimental color coding strategy
 to highlight words with high or low confidence:

-```java
+```bash
 ./main -m models/ggml-base.en.bin -f samples/gb0.wav --print-colors
 ```

@ -562,8 +601,8 @@ to highlight words with high or low confidence:

 For example, to limit the line length to a maximum of 16 characters, simply add `-ml 16`:

-```java
-./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 16
+```text
+$ ./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 16

 whisper_model_load: loading model from './models/ggml-base.en.bin'
 ...
@ -586,8 +625,8 @@ main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 pr

 The `--max-len` argument can be used to obtain word-level timestamps. Simply use `-ml 1`:

-```java
-./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 1
+```text
+$ ./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 1

 whisper_model_load: loading model from './models/ggml-base.en.bin'
 ...
@ -657,7 +696,7 @@ This requires to have `ffmpeg` installed.

 Here are a few *"typical"* examples:

-```java
+```bash
 ./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -owts
 source ./samples/jfk.wav.wts
 ffplay ./samples/jfk.wav.mp4
@ -667,7 +706,7 @@ https://user-images.githubusercontent.com/1991296/199337465-dbee4b5e-9aeb-48a3-b

 ---

-```java
+```bash
 ./main -m ./models/ggml-base.en.bin -f ./samples/mm0.wav -owts
 source ./samples/mm0.wav.wts
 ffplay ./samples/mm0.wav.mp4
@ -677,7 +716,7 @@ https://user-images.githubusercontent.com/1991296/199337504-cc8fd233-0cb7-4920-9

 ---

-```java
+```bash
 ./main -m ./models/ggml-base.en.bin -f ./samples/gb0.wav -owts
 source ./samples/gb0.wav.wts
 ffplay ./samples/gb0.wav.mp4
@ -691,7 +730,7 @@ https://user-images.githubusercontent.com/1991296/199337538-b7b0c7a3-2753-4a88-a

 Use the [extra/bench-wts.sh](https://github.com/ggerganov/whisper.cpp/blob/master/extra/bench-wts.sh) script to generate a video in the following format:

-```java
+```bash
 ./extra/bench-wts.sh samples/jfk.wav
 ffplay ./samples/jfk.wav.all.mp4
 ```
@ -720,8 +759,7 @@ It is written in python with the intention of being easy to modify and extend fo

 It outputs a csv file with the results of the benchmarking.

-
-## ggml format
+## `ggml` format

 The original models are converted to a custom binary format. This allows to pack everything needed into a single file:

@ -736,51 +774,50 @@ or manually from here:
 - https://huggingface.co/ggerganov/whisper.cpp
 - https://ggml.ggerganov.com

-For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or the README
-in [models](models).
+For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or [models/README.md](models/README.md).

 ## [Bindings](https://github.com/ggerganov/whisper.cpp/discussions/categories/bindings)

- [X] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggerganov/whisper.cpp/discussions/310)
- [X] JavaScript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
+- [x] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggerganov/whisper.cpp/discussions/310)
+- [x] JavaScript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
  - React Native (iOS / Android): [whisper.rn](https://github.com/mybigday/whisper.rn)
- [X] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
- [X] Java:
+- [x] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
+- [x] Java:
  - [GiviMAD/whisper-jni](https://github.com/GiviMAD/whisper-jni)
- [X] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggerganov/whisper.cpp/discussions/507)
- [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
+- [x] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggerganov/whisper.cpp/discussions/507)
+- [x] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
  - [exPHAT/SwiftWhisper](https://github.com/exPHAT/SwiftWhisper)
- [X] .NET: | [#422](https://github.com/ggerganov/whisper.cpp/discussions/422)
+- [x] .NET: | [#422](https://github.com/ggerganov/whisper.cpp/discussions/422)
  - [sandrohanea/whisper.net](https://github.com/sandrohanea/whisper.net)
  - [NickDarvey/whisper](https://github.com/NickDarvey/whisper)
- [X] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
+- [x] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
  - [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
  - [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
- [X] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
- [X] Unity: [macoron/whisper.unity](https://github.com/Macoron/whisper.unity)
+- [x] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
+- [x] Unity: [macoron/whisper.unity](https://github.com/Macoron/whisper.unity)

 ## Examples

 There are various examples of using the library for different projects in the [examples](examples) folder.
 Some of the examples are even ported to run in the browser using WebAssembly. Check them out!

-| Example | Web | Description |
-| ---     | --- | ---         |
-| [main](examples/main) | [whisper.wasm](examples/whisper.wasm) | Tool for translating and transcribing audio using Whisper |
-| [bench](examples/bench) | [bench.wasm](examples/bench.wasm) | Benchmark the performance of Whisper on your machine |
-| [stream](examples/stream) | [stream.wasm](examples/stream.wasm) | Real-time transcription of raw microphone capture |
-| [command](examples/command) | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic |
-| [wchess](examples/wchess) | [wchess.wasm](examples/wchess) | Voice-controlled chess |
-| [talk](examples/talk) | [talk.wasm](examples/talk.wasm) | Talk with a GPT-2 bot |
-| [talk-llama](examples/talk-llama) | | Talk with a LLaMA bot |
-| [whisper.objc](examples/whisper.objc) | | iOS mobile application using whisper.cpp |
-| [whisper.swiftui](examples/whisper.swiftui) | | SwiftUI iOS / macOS application using whisper.cpp |
-| [whisper.android](examples/whisper.android) | | Android mobile application using whisper.cpp |
-| [whisper.nvim](examples/whisper.nvim) | | Speech-to-text plugin for Neovim |
-| [generate-karaoke.sh](examples/generate-karaoke.sh) | | Helper script to easily [generate a karaoke video](https://youtu.be/uj7hVta4blM) of raw audio capture |
-| [livestream.sh](examples/livestream.sh) | | [Livestream audio transcription](https://github.com/ggerganov/whisper.cpp/issues/185) |
-| [yt-wsp.sh](examples/yt-wsp.sh) | | Download + transcribe and/or translate any VOD [(original)](https://gist.github.com/DaniruKun/96f763ec1a037cc92fe1a059b643b818) |
-| [server](examples/server) | | HTTP transcription server with OAI-like API |
+| Example                                             | Web                                   | Description                                                                                                                     |
+| --------------------------------------------------- | ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- |
+| [main](examples/main)                               | [whisper.wasm](examples/whisper.wasm) | Tool for translating and transcribing audio using Whisper                                                                       |
+| [bench](examples/bench)                             | [bench.wasm](examples/bench.wasm)     | Benchmark the performance of Whisper on your machine                                                                            |
+| [stream](examples/stream)                           | [stream.wasm](examples/stream.wasm)   | Real-time transcription of raw microphone capture                                                                               |
+| [command](examples/command)                         | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic                                                         |
+| [wchess](examples/wchess)                           | [wchess.wasm](examples/wchess)        | Voice-controlled chess                                                                                                          |
+| [talk](examples/talk)                               | [talk.wasm](examples/talk.wasm)       | Talk with a GPT-2 bot                                                                                                           |
+| [talk-llama](examples/talk-llama)                   |                                       | Talk with a LLaMA bot                                                                                                           |
+| [whisper.objc](examples/whisper.objc)               |                                       | iOS mobile application using whisper.cpp                                                                                        |
+| [whisper.swiftui](examples/whisper.swiftui)         |                                       | SwiftUI iOS / macOS application using whisper.cpp                                                                               |
+| [whisper.android](examples/whisper.android)         |                                       | Android mobile application using whisper.cpp                                                                                    |
+| [whisper.nvim](examples/whisper.nvim)               |                                       | Speech-to-text plugin for Neovim                                                                                                |
+| [generate-karaoke.sh](examples/generate-karaoke.sh) |                                       | Helper script to easily [generate a karaoke video](https://youtu.be/uj7hVta4blM) of raw audio capture                           |
+| [livestream.sh](examples/livestream.sh)             |                                       | [Livestream audio transcription](https://github.com/ggerganov/whisper.cpp/issues/185)                                           |
+| [yt-wsp.sh](examples/yt-wsp.sh)                     |                                       | Download + transcribe and/or translate any VOD [(original)](https://gist.github.com/DaniruKun/96f763ec1a037cc92fe1a059b643b818) |
+| [server](examples/server)                           |                                       | HTTP transcription server with OAI-like API                                                                                     |

 ## [Discussions](https://github.com/ggerganov/whisper.cpp/discussions)

--- a/bindings/go/params.go
+++ b/bindings/go/params.go
@ -123,6 +123,11 @@ func (p *Params) SetAudioCtx(n int) {
 	p.audio_ctx = C.int(n)
 }

+// Set initial prompt
+func (p *Params) SetInitialPrompt(prompt string) {
+	p.initial_prompt = C.CString(prompt)
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // PRIVATE METHODS

@ -147,6 +152,7 @@ func (p *Params) String() string {
 	str += fmt.Sprintf(" offset_ms=%d", p.offset_ms)
 	str += fmt.Sprintf(" duration_ms=%d", p.duration_ms)
 	str += fmt.Sprintf(" audio_ctx=%d", p.audio_ctx)
+	str += fmt.Sprintf(" initial_prompt=%s", C.GoString(p.initial_prompt))
 	if p.translate {
 		str += " translate"
 	}
--- a/bindings/go/pkg/whisper/context.go
+++ b/bindings/go/pkg/whisper/context.go
@ -130,6 +130,11 @@ func (context *context) SetAudioCtx(n uint) {
 	context.params.SetAudioCtx(int(n))
 }

+// Set initial prompt
+func (context *context) SetInitialPrompt(prompt string) {
+	context.params.SetInitialPrompt(prompt)
+}
+
 // ResetTimings resets the mode timings. Should be called before processing
 func (context *context) ResetTimings() {
 	context.model.ctx.Whisper_reset_timings()
--- a/bindings/go/pkg/whisper/interface.go
+++ b/bindings/go/pkg/whisper/interface.go
@ -38,17 +38,18 @@ type Context interface {
 	IsMultilingual() bool     // Return true if the model is multilingual.
 	Language() string         // Get language

-	SetOffset(time.Duration)      // Set offset
-	SetDuration(time.Duration)    // Set duration
-	SetThreads(uint)              // Set number of threads to use
-	SetSpeedup(bool)              // Set speedup flag
-	SetSplitOnWord(bool)          // Set split on word flag
-	SetTokenThreshold(float32)    // Set timestamp token probability threshold
-	SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
-	SetMaxSegmentLength(uint)     // Set max segment length in characters
-	SetTokenTimestamps(bool)      // Set token timestamps flag
-	SetMaxTokensPerSegment(uint)  // Set max tokens per segment (0 = no limit)
-	SetAudioCtx(uint)             // Set audio encoder context
+	SetOffset(time.Duration)        // Set offset
+	SetDuration(time.Duration)      // Set duration
+	SetThreads(uint)                // Set number of threads to use
+	SetSpeedup(bool)                // Set speedup flag
+	SetSplitOnWord(bool)            // Set split on word flag
+	SetTokenThreshold(float32)      // Set timestamp token probability threshold
+	SetTokenSumThreshold(float32)   // Set timestamp token sum probability threshold
+	SetMaxSegmentLength(uint)       // Set max segment length in characters
+	SetTokenTimestamps(bool)        // Set token timestamps flag
+	SetMaxTokensPerSegment(uint)    // Set max tokens per segment (0 = no limit)
+	SetAudioCtx(uint)               // Set audio encoder context
+	SetInitialPrompt(prompt string) // Set initial prompt

 	// Process mono audio data and return any errors.
 	// If defined, newly generated segments are passed to the
--- a/bindings/ios
+++ b/bindings/ios
--- a/bindings/javascript/README.md
+++ b/bindings/javascript/README.md
@ -41,7 +41,7 @@ make publish-npm

 ## Sample run

-```java
+```text
 $ node --experimental-wasm-threads --experimental-wasm-simd ../tests/test-whisper.js

 whisper_model_load: loading model from 'whisper.bin'
@ -63,7 +63,7 @@ whisper_model_load: ggml ctx size =  140.60 MB
 whisper_model_load: memory size   =   22.83 MB
 whisper_model_load: model size    =  140.54 MB

-system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 1 | BLAS = 0 | 
+system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 1 | BLAS = 0 |

 operator(): processing 176000 samples, 11.0 sec, 8 threads, 1 processors, lang = en, task = transcribe ...

--- a/bindings/javascript/package.json
+++ b/bindings/javascript/package.json
@ -1,6 +1,6 @@
 {
  "name": "whisper.cpp",
-  "version": "1.5.2",
+  "version": "1.5.4",
  "description": "Whisper speech recognition",
  "main": "whisper.js",
  "scripts": {
--- a/bindings/ruby/ext/ggml-backend-impl.h
+++ b/bindings/ruby/ext/ggml-backend-impl.h
@ -70,7 +70,7 @@ extern "C" {
        void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

        // compute graph without a plan
-        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

        // check if the backend supports an operation
        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
--- a/bindings/ruby/ext/ggml-backend.c
+++ b/bindings/ruby/ext/ggml-backend.c
@ -156,8 +156,8 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_
    backend->iface.graph_plan_compute(backend, plan);
 }

-void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    backend->iface.graph_compute(backend, cgraph);
+bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    return backend->iface.graph_compute(backend, cgraph);
 }

 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
--- a/bindings/ruby/ext/ggml-backend.h
+++ b/bindings/ruby/ext/ggml-backend.h
@ -52,7 +52,7 @@ extern "C" {

    GGML_API void ggml_backend_graph_plan_free   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
    GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API void ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API bool ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
    GGML_API bool ggml_backend_supports_op       (ggml_backend_t backend, const struct ggml_tensor * op);

    // tensor copy between different backends
--- a/coreml/whisper-encoder.mm
+++ b/coreml/whisper-encoder.mm
@ -24,9 +24,9 @@ struct whisper_coreml_context * whisper_coreml_init(const char * path_model) {

    // select which device to run the Core ML model on
    MLModelConfiguration *config = [[MLModelConfiguration alloc] init];
-    config.computeUnits = MLComputeUnitsCPUAndGPU;
+    // config.computeUnits = MLComputeUnitsCPUAndGPU;
    //config.computeUnits = MLComputeUnitsCPUAndNeuralEngine;
-    //config.computeUnits = MLComputeUnitsAll;
+    config.computeUnits = MLComputeUnitsAll;

    const void * data = CFBridgingRetain([[whisper_encoder_impl alloc] initWithContentsOfURL:url_model configuration:config error:nil]);

--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -14,6 +14,10 @@ if (WHISPER_SDL2)
    message(STATUS "SDL2_LIBRARIES = ${SDL2_LIBRARIES}")
 endif()

+if (WHISPER_CLBLAST)
+    find_package(CLBlast REQUIRED)
+endif()
+
 # common

 set(TARGET common)
--- a/examples/addon.node/addon.cpp
+++ b/examples/addon.node/addon.cpp
@ -154,7 +154,7 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {

    // whisper init

-    struct whisper_context_params cparams;
+    struct whisper_context_params cparams = whisper_context_default_params();
    cparams.use_gpu = params.use_gpu;
    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);

--- a/examples/bench/bench.cpp
+++ b/examples/bench/bench.cpp
@ -58,7 +58,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
 int whisper_bench_full(const whisper_params & params) {
    // whisper init

-    struct whisper_context_params cparams;
+    struct whisper_context_params cparams = whisper_context_default_params();
    cparams.use_gpu = params.use_gpu;

    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
--- a/examples/command/command.cpp
+++ b/examples/command/command.cpp
@ -693,7 +693,7 @@ int main(int argc, char ** argv) {

    // whisper init

-    struct whisper_context_params cparams;
+    struct whisper_context_params cparams = whisper_context_default_params();
    cparams.use_gpu = params.use_gpu;

    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
--- a/examples/common-ggml.cpp
+++ b/examples/common-ggml.cpp
@ -62,6 +62,9 @@ bool ggml_common_quantize_0(
        case GGML_FTYPE_ALL_F32:
        case GGML_FTYPE_MOSTLY_F16:
        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
+        case GGML_FTYPE_MOSTLY_IQ2_XXS:
+        case GGML_FTYPE_MOSTLY_IQ2_XS:
+        case GGML_FTYPE_MOSTLY_IQ3_XXS:
                {
                    fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
                    return false;
@ -182,7 +185,7 @@ bool ggml_common_quantize_0(
                case GGML_TYPE_Q5_K:
                case GGML_TYPE_Q6_K:
                    {
-                        cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements, hist_cur.data());
+                        cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], hist_cur.data(), nullptr);
                    } break;
                case GGML_TYPE_F32:
                case GGML_TYPE_F16:
@ -191,6 +194,9 @@ bool ggml_common_quantize_0(
                case GGML_TYPE_I32:
                case GGML_TYPE_Q8_1:
                case GGML_TYPE_Q8_K:
+                case GGML_TYPE_IQ2_XXS:
+                case GGML_TYPE_IQ2_XS:
+                case GGML_TYPE_IQ3_XXS:
                case GGML_TYPE_COUNT:
                    {
                        fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -615,6 +615,21 @@ gpt_vocab::id gpt_sample_top_k_top_p_repeat(

 }

+bool is_wav_buffer(const std::string buf) {
+    // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
+    // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
+    if (buf.size() < 12 || buf.substr(0, 4) != "RIFF" || buf.substr(8, 4) != "WAVE") {
+        return false;
+    }
+
+    uint32_t chunk_size = *reinterpret_cast<const uint32_t*>(buf.data() + 4);
+    if (chunk_size + 8 != buf.size()) {
+        return false;
+    }
+
+    return true;
+}
+
 bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
    drwav wav;
    std::vector<uint8_t> wav_data; // used for pipe input from stdin
@ -639,6 +654,12 @@ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector

        fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
    }
+    else if (is_wav_buffer(fname)) {
+        if (drwav_init_memory(&wav, fname.c_str(), fname.size(), nullptr) == false) {
+            fprintf(stderr, "error: failed to open WAV file from fname buffer\n");
+            return false;
+        }
+    }
    else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
        fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
        return false;
--- a/examples/common.h
+++ b/examples/common.h
@ -135,7 +135,11 @@ gpt_vocab::id gpt_sample_top_k_top_p_repeat(
 // Audio utils
 //

+// Check if a buffer is a WAV audio file
+bool is_wav_buffer(const std::string buf);
+
 // Read WAV audio file and store the PCM data into pcmf32
+// fname can be a buffer of WAV data instead of a filename
 // The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
 // If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
 bool read_wav(
--- a/examples/lsp/lsp.cpp
+++ b/examples/lsp/lsp.cpp
@ -435,7 +435,7 @@ int main(int argc, char ** argv) {
    }

    // whisper init
-    struct whisper_context_params cparams;
+    struct whisper_context_params cparams = whisper_context_default_params();
    cparams.use_gpu = params.use_gpu;
    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
    // init audio
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -64,6 +64,7 @@ struct whisper_params {
    int32_t max_len      =  0;
    int32_t best_of      = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of;
    int32_t beam_size    = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size;
+    int32_t audio_ctx   = 0;

    float word_thold    =  0.01f;
    float entropy_thold =  2.40f;
@ -85,6 +86,7 @@ struct whisper_params {
    bool output_jsn      = false;
    bool output_jsn_full = false;
    bool output_lrc      = false;
+    bool no_prints       = false;
    bool print_special   = false;
    bool print_colors    = false;
    bool print_progress  = false;
@ -135,6 +137,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-ml"   || arg == "--max-len")         { params.max_len         = std::stoi(argv[++i]); }
        else if (arg == "-bo"   || arg == "--best-of")         { params.best_of         = std::stoi(argv[++i]); }
        else if (arg == "-bs"   || arg == "--beam-size")       { params.beam_size       = std::stoi(argv[++i]); }
+        else if (arg == "-ac"   || arg == "--audio-context")   { params.audio_ctx       = std::stoi(argv[++i]); }
        else if (arg == "-wt"   || arg == "--word-thold")      { params.word_thold      = std::stof(argv[++i]); }
        else if (arg == "-et"   || arg == "--entropy-thold")   { params.entropy_thold   = std::stof(argv[++i]); }
        else if (arg == "-lpt"  || arg == "--logprob-thold")   { params.logprob_thold   = std::stof(argv[++i]); }
@ -155,6 +158,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-oj"   || arg == "--output-json")     { params.output_jsn      = true; }
        else if (arg == "-ojf"  || arg == "--output-json-full"){ params.output_jsn_full = params.output_jsn = true; }
        else if (arg == "-of"   || arg == "--output-file")     { params.fname_out.emplace_back(argv[++i]); }
+        else if (arg == "-np"   || arg == "--no-prints")       { params.no_prints       = true; }
        else if (arg == "-ps"   || arg == "--print-special")   { params.print_special   = true; }
        else if (arg == "-pc"   || arg == "--print-colors")    { params.print_colors    = true; }
        else if (arg == "-pp"   || arg == "--print-progress")  { params.print_progress  = true; }
@ -193,6 +197,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -sow,      --split-on-word     [%-7s] split on word rather than on token\n",             params.split_on_word ? "true" : "false");
    fprintf(stderr, "  -bo N,     --best-of N         [%-7d] number of best candidates to keep\n",              params.best_of);
    fprintf(stderr, "  -bs N,     --beam-size N       [%-7d] beam size for beam search\n",                      params.beam_size);
+    fprintf(stderr, "  -ac N,     --audio-ctx N       [%-7d] audio context size (0 - all)\n",                   params.audio_ctx);
    fprintf(stderr, "  -wt N,     --word-thold N      [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
    fprintf(stderr, "  -et N,     --entropy-thold N   [%-7.2f] entropy threshold for decoder fail\n",           params.entropy_thold);
    fprintf(stderr, "  -lpt N,    --logprob-thold N   [%-7.2f] log probability threshold for decoder fail\n",   params.logprob_thold);
@ -212,6 +217,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -oj,       --output-json       [%-7s] output result in a JSON file\n",                   params.output_jsn ? "true" : "false");
    fprintf(stderr, "  -ojf,      --output-json-full  [%-7s] include more information in the JSON file\n",      params.output_jsn_full ? "true" : "false");
    fprintf(stderr, "  -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n",      "");
+    fprintf(stderr, "  -np,       --no-prints         [%-7s] do not print anything other than the results\n",   params.no_prints ? "true" : "false");
    fprintf(stderr, "  -ps,       --print-special     [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
    fprintf(stderr, "  -pc,       --print-colors      [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
    fprintf(stderr, "  -pp,       --print-progress    [%-7s] print progress\n",                                 params.print_progress ? "true" : "false");
@ -852,6 +858,9 @@ bool output_lrc(struct whisper_context * ctx, const char * fname, const whisper_
    return true;
 }

+
+void cb_log_disable(enum ggml_log_level , const char * , void * ) { }
+
 int main(int argc, char ** argv) {
    whisper_params params;

@ -878,9 +887,13 @@ int main(int argc, char ** argv) {
        exit(0);
    }

+    if (params.no_prints) {
+        whisper_log_set(cb_log_disable, NULL);
+    }
+
    // whisper init

-    struct whisper_context_params cparams;
+    struct whisper_context_params cparams = whisper_context_default_params();
    cparams.use_gpu = params.use_gpu;

    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
@ -905,26 +918,25 @@ int main(int argc, char ** argv) {
            continue;
        }

-        // print system information
-        {
+        if (!whisper_is_multilingual(ctx)) {
+            if (params.language != "en" || params.translate) {
+                params.language = "en";
+                params.translate = false;
+                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
+            }
+        }
+        if (params.detect_language) {
+            params.language = "auto";
+        }
+
+        if (!params.no_prints) {
+            // print system information
            fprintf(stderr, "\n");
            fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
                    params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
-        }

-        // print some info about the processing
-        {
+            // print some info about the processing
            fprintf(stderr, "\n");
-            if (!whisper_is_multilingual(ctx)) {
-                if (params.language != "en" || params.translate) {
-                    params.language = "en";
-                    params.translate = false;
-                    fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
-                }
-            }
-            if (params.detect_language) {
-                params.language = "auto";
-            }
            fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, %d beams + best of %d, lang = %s, task = %s, %stimestamps = %d ...\n",
                    __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
                    params.n_threads, params.n_processors, params.beam_size, params.best_of,
@ -958,6 +970,7 @@ int main(int argc, char ** argv) {
            wparams.thold_pt         = params.word_thold;
            wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
            wparams.split_on_word    = params.split_on_word;
+            wparams.audio_ctx        = params.audio_ctx;

            wparams.speed_up         = params.speed_up;
            wparams.debug_mode       = params.debug_mode;
@ -973,6 +986,8 @@ int main(int argc, char ** argv) {
            wparams.entropy_thold    = params.entropy_thold;
            wparams.logprob_thold    = params.logprob_thold;

+            wparams.no_timestamps    = params.no_timestamps;
+
            whisper_print_user_data user_data = { &params, &pcmf32s, 0 };

            // this callback is called on each new segment
--- a/examples/python/test_whisper_processor.py
+++ b/examples/python/test_whisper_processor.py
@ -0,0 +1,7 @@
+import whisper_processor
+
+try:
+    result = whisper_processor.process_audio("./audio/wake_word_detected16k.wav", "base.en")
+    print(result)
+except Exception as e:
+    print(f"Error: {e}")
--- a/examples/python/whisper_processor.py
+++ b/examples/python/whisper_processor.py
@ -0,0 +1,54 @@
+import subprocess
+import sys
+import os
+
+def process_audio(wav_file, model_name="base.en"):
+    """
+    Processes an audio file using a specified model and returns the processed string.
+
+    :param wav_file: Path to the WAV file
+    :param model_name: Name of the model to use
+    :return: Processed string output from the audio processing
+    :raises: Exception if an error occurs during processing
+    """
+
+    model = f"./models/ggml-{model_name}.bin"
+
+    # Check if the file exists
+    if not os.path.exists(model):
+        raise FileNotFoundError(f"Model file not found: {model} \n\nDownload a model with this command:\n\n> bash ./models/download-ggml-model.sh {model_name}\n\n")
+
+    if not os.path.exists(wav_file):
+        raise FileNotFoundError(f"WAV file not found: {wav_file}")
+
+    full_command = f"./main -m {model} -f {wav_file} -np -nt"
+
+    # Execute the command
+    process = subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    # Get the output and error (if any)
+    output, error = process.communicate()
+
+    if error:
+        raise Exception(f"Error processing audio: {error.decode('utf-8')}")
+
+    # Process and return the output string
+    decoded_str = output.decode('utf-8').strip()
+    processed_str = decoded_str.replace('[BLANK_AUDIO]', '').strip()
+
+    return processed_str
+
+def main():
+    if len(sys.argv) >= 2:
+        wav_file = sys.argv[1]
+        model_name = sys.argv[2] if len(sys.argv) == 3 else "base.en"
+        try:
+            result = process_audio(wav_file, model_name)
+            print(result)
+        except Exception as e:
+            print(f"Error: {e}")
+    else:
+        print("Usage: python whisper_processor.py <wav_file> [<model_name>]")
+
+if __name__ == "__main__":
+    main()
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@ -5,8 +5,6 @@ include(DefaultTargetOptions)

 target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})

-# Check if the compiler is MinGW
-if(MINGW)
-    # Link the necessary libraries for SSL and Winsock
-    target_link_libraries(${TARGET} PRIVATE -lcrypt32 -lssl -lcrypto -lws2_32)
+if (WIN32)
+    target_link_libraries(${TARGET} PRIVATE ws2_32)
 endif()
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -46,7 +46,7 @@ options:
  --convert,                     [false  ] Convert audio to WAV, requires ffmpeg on the server
 ```

-> [!WARNING]  
+> [!WARNING]
 > **Do not run the server example with administrative privileges and ensure it's operated in a sandbox environment, especially since it involves risky operations like accepting user file uploads and using ffmpeg for format conversions. Always validate and sanitize inputs to guard against potential security threats.**

 ## request examples
@ -56,8 +56,9 @@ options:
 curl 127.0.0.1:8080/inference \
 -H "Content-Type: multipart/form-data" \
 -F file="@<file-path>" \
-F temperature="0.2" \
-F response-format="json"
+-F temperature="0.0" \
+-F temperature_inc="0.2" \
+-F response_format="json"
 ```

 **/load**
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -18,7 +18,7 @@
 #endif

 using namespace httplib;
-using json = nlohmann::json;
+using json = nlohmann::ordered_json;

 namespace {

@ -40,30 +40,33 @@ struct server_params
 {
    std::string hostname = "127.0.0.1";
    std::string public_path = "examples/server/public";
+    std::string request_path = "";

    int32_t port          = 8080;
    int32_t read_timeout  = 600;
    int32_t write_timeout = 600;
-    
+
    bool ffmpeg_converter = false;
 };

 struct whisper_params {
-    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_processors =  1;
-    int32_t offset_t_ms  =  0;
-    int32_t offset_n     =  0;
-    int32_t duration_ms  =  0;
-    int32_t progress_step =  5;
-    int32_t max_context  = -1;
-    int32_t max_len      =  0;
-    int32_t best_of      =  2;
-    int32_t beam_size    = -1;
+    int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_processors  = 1;
+    int32_t offset_t_ms   = 0;
+    int32_t offset_n      = 0;
+    int32_t duration_ms   = 0;
+    int32_t progress_step = 5;
+    int32_t max_context   = -1;
+    int32_t max_len       = 0;
+    int32_t best_of       = 2;
+    int32_t beam_size     = -1;
+    int32_t audio_ctx     = 0;

-    float word_thold    =  0.01f;
-    float entropy_thold =  2.40f;
-    float logprob_thold = -1.00f;
-    float userdef_temp  =  0.20f;
+    float word_thold      =  0.01f;
+    float entropy_thold   =  2.40f;
+    float logprob_thold   = -1.00f;
+    float temperature     =  0.00f;
+    float temperature_inc =  0.20f;

    bool speed_up        = false;
    bool debug_mode      = false;
@ -120,8 +123,7 @@ bool is_file_exist(const char *fileName)
    return infile.good();
 }

-void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params,
-                         const server_params& sparams) {
+void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params, const server_params& sparams) {
    fprintf(stderr, "\n");
    fprintf(stderr, "usage: %s [options] \n", argv[0]);
    fprintf(stderr, "\n");
@ -137,6 +139,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -sow,      --split-on-word     [%-7s] split on word rather than on token\n",             params.split_on_word ? "true" : "false");
    fprintf(stderr, "  -bo N,     --best-of N         [%-7d] number of best candidates to keep\n",              params.best_of);
    fprintf(stderr, "  -bs N,     --beam-size N       [%-7d] beam size for beam search\n",                      params.beam_size);
+    fprintf(stderr, "  -ac N,     --audio-ctx N       [%-7d] audio context size (0 - all)\n",                   params.audio_ctx);
    fprintf(stderr, "  -wt N,     --word-thold N      [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
    fprintf(stderr, "  -et N,     --entropy-thold N   [%-7.2f] entropy threshold for decoder fail\n",           params.entropy_thold);
    fprintf(stderr, "  -lpt N,    --logprob-thold N   [%-7.2f] log probability threshold for decoder fail\n",   params.logprob_thold);
@ -160,6 +163,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  --host HOST,                   [%-7s] Hostname/ip-adress for the server\n", sparams.hostname.c_str());
    fprintf(stderr, "  --port PORT,                   [%-7d] Port number for the server\n", sparams.port);
    fprintf(stderr, "  --public PATH,                 [%-7s] Path to the public folder\n", sparams.public_path.c_str());
+    fprintf(stderr, "  --request-path PATH,           [%-7s] Request path for all requests\n", sparams.request_path.c_str());
    fprintf(stderr, "  --convert,                     [%-7s] Convert audio to WAV, requires ffmpeg on the server", sparams.ffmpeg_converter ? "true" : "false");
    fprintf(stderr, "\n");
 }
@ -181,6 +185,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
        else if (arg == "-ml"   || arg == "--max-len")         { params.max_len         = std::stoi(argv[++i]); }
        else if (arg == "-bo"   || arg == "--best-of")         { params.best_of         = std::stoi(argv[++i]); }
        else if (arg == "-bs"   || arg == "--beam-size")       { params.beam_size       = std::stoi(argv[++i]); }
+        else if (arg == "-ac"   || arg == "--audio-context")   { params.audio_ctx       = std::stoi(argv[++i]); }
        else if (arg == "-wt"   || arg == "--word-thold")      { params.word_thold      = std::stof(argv[++i]); }
        else if (arg == "-et"   || arg == "--entropy-thold")   { params.entropy_thold   = std::stof(argv[++i]); }
        else if (arg == "-lpt"  || arg == "--logprob-thold")   { params.logprob_thold   = std::stof(argv[++i]); }
@ -207,6 +212,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
        else if (                  arg == "--port")            { sparams.port        = std::stoi(argv[++i]); }
        else if (                  arg == "--host")            { sparams.hostname    = argv[++i]; }
        else if (                  arg == "--public")          { sparams.public_path = argv[++i]; }
+        else if (                  arg == "--request-path")    { sparams.request_path = argv[++i]; }
        else if (                  arg == "--convert")         { sparams.ffmpeg_converter     = true; }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@ -393,36 +399,106 @@ std::string output_str(struct whisper_context * ctx, const whisper_params & para
    return result.str();
 }

+bool parse_str_to_bool(const std::string & s) {
+    if (s == "true" || s == "1" || s == "yes" || s == "y") {
+        return true;
+    }
+    return false;
+}
+
 void get_req_parameters(const Request & req, whisper_params & params)
 {
-    // user model configu.has_fileion
-    if (req.has_file("offset-t"))
+    if (req.has_file("offset_t"))
    {
-        params.offset_t_ms = std::stoi(req.get_file_value("offset-t").content);
+        params.offset_t_ms = std::stoi(req.get_file_value("offset_t").content);
    }
-    if (req.has_file("offset-n"))
+    if (req.has_file("offset_n"))
    {
-        params.offset_n = std::stoi(req.get_file_value("offset-n").content);
+        params.offset_n = std::stoi(req.get_file_value("offset_n").content);
    }
    if (req.has_file("duration"))
    {
        params.duration_ms = std::stoi(req.get_file_value("duration").content);
    }
-    if (req.has_file("max-context"))
+    if (req.has_file("max_context"))
    {
-        params.max_context = std::stoi(req.get_file_value("max-context").content);
+        params.max_context = std::stoi(req.get_file_value("max_context").content);
+    }
+    if (req.has_file("max_len"))
+    {
+        params.max_len = std::stoi(req.get_file_value("max_len").content);
+    }
+    if (req.has_file("best_of"))
+    {
+        params.best_of = std::stoi(req.get_file_value("best_of").content);
+    }
+    if (req.has_file("beam_size"))
+    {
+        params.beam_size = std::stoi(req.get_file_value("beam_size").content);
+    }
+    if (req.has_file("audio_ctx"))
+    {
+        params.audio_ctx = std::stof(req.get_file_value("audio_ctx").content);
+    }
+    if (req.has_file("word_thold"))
+    {
+        params.word_thold = std::stof(req.get_file_value("word_thold").content);
+    }
+    if (req.has_file("entropy_thold"))
+    {
+        params.entropy_thold = std::stof(req.get_file_value("entropy_thold").content);
+    }
+    if (req.has_file("logprob_thold"))
+    {
+        params.logprob_thold = std::stof(req.get_file_value("logprob_thold").content);
+    }
+    if (req.has_file("debug_mode"))
+    {
+        params.debug_mode = parse_str_to_bool(req.get_file_value("debug_mode").content);
+    }
+    if (req.has_file("translate"))
+    {
+        params.translate = parse_str_to_bool(req.get_file_value("translate").content);
+    }
+    if (req.has_file("diarize"))
+    {
+        params.diarize = parse_str_to_bool(req.get_file_value("diarize").content);
+    }
+    if (req.has_file("tinydiarize"))
+    {
+        params.tinydiarize = parse_str_to_bool(req.get_file_value("tinydiarize").content);
+    }
+    if (req.has_file("split_on_word"))
+    {
+        params.split_on_word = parse_str_to_bool(req.get_file_value("split_on_word").content);
+    }
+    if (req.has_file("no_timestamps"))
+    {
+        params.no_timestamps = parse_str_to_bool(req.get_file_value("no_timestamps").content);
+    }
+    if (req.has_file("language"))
+    {
+        params.language = req.get_file_value("language").content;
+    }
+    if (req.has_file("detect_language"))
+    {
+        params.detect_language = parse_str_to_bool(req.get_file_value("detect_language").content);
    }
    if (req.has_file("prompt"))
    {
        params.prompt = req.get_file_value("prompt").content;
    }
-    if (req.has_file("response-format"))
+    if (req.has_file("response_format"))
    {
-        params.response_format = req.get_file_value("response-format").content;
+        params.response_format = req.get_file_value("response_format").content;
    }
    if (req.has_file("temperature"))
    {
-        params.userdef_temp = std::stof(req.get_file_value("temperature").content);
+        params.temperature = std::stof(req.get_file_value("temperature").content);
+    }
+    if (req.has_file("temperature_inc"))
+    {
+        params.temperature_inc = std::stof(req.get_file_value("temperature_inc").content);
    }
 }

@ -455,7 +531,7 @@ int main(int argc, char ** argv) {
        check_ffmpeg_availibility();
    }
    // whisper init
-    struct whisper_context_params cparams;
+    struct whisper_context_params cparams = whisper_context_default_params();
    cparams.use_gpu = params.use_gpu;

    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
@ -471,19 +547,94 @@ int main(int argc, char ** argv) {
    Server svr;
    svr.set_default_headers({{"Server", "whisper.cpp"},
                             {"Access-Control-Allow-Origin", "*"},
-                             {"Access-Control-Allow-Headers", "content-type"}});
+                             {"Access-Control-Allow-Headers", "content-type, authorization"}});

-    std::string const default_content = "<html>hello</html>";
+    std::string const default_content = R"(
+    <html>
+    <head>
+        <title>Whisper.cpp Server</title>
+        <meta charset="utf-8">
+        <meta name="viewport" content="width=device-width">
+        <style>
+        body {
+            font-family: sans-serif;
+        }
+        form {
+            display: flex;
+            flex-direction: column;
+            align-items: flex-start;
+        }
+        label {
+            margin-bottom: 0.5rem;
+        }
+        input, select {
+            margin-bottom: 1rem;
+        }
+        button {
+            margin-top: 1rem;
+        }
+        </style>
+    </head>
+    <body>
+        <h1>Whisper.cpp Server</h1>
+
+        <h2>/inference</h2>
+        <pre>
+    curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/inference \
+    -H "Content-Type: multipart/form-data" \
+    -F file="@&lt;file-path&gt;" \
+    -F temperature="0.0" \
+    -F temperature_inc="0.2" \
+    -F response_format="json"
+        </pre>
+
+        <h2>/load</h2>
+        <pre>
+    curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/load \
+    -H "Content-Type: multipart/form-data" \
+    -F model="&lt;path-to-model-file&gt;"
+        </pre>
+
+        <div>
+            <h2>Try it out</h2>
+            <form action="/inference" method="POST" enctype="multipart/form-data">
+                <label for="file">Choose an audio file:</label>
+                <input type="file" id="file" name="file" accept="audio/*" required><br>
+
+                <label for="temperature">Temperature:</label>
+                <input type="number" id="temperature" name="temperature" value="0.0" step="0.01" placeholder="e.g., 0.0"><br>
+
+                <label for="response_format">Response Format:</label>
+                <select id="response_format" name="response_format">
+                    <option value="verbose_json">Verbose JSON</option>
+                    <option value="json">JSON</option>
+                    <option value="text">Text</option>
+                    <option value="srt">SRT</option>
+                    <option value="vtt">VTT</option>
+                </select><br>
+
+                <button type="submit">Submit</button>
+            </form>
+        </div>
+    </body>
+    </html>
+    )";
+
+    // store default params so we can reset after each inference request
+    whisper_params default_params = params;

    // this is only called if no index.html is found in the public --path
-    svr.Get("/", [&default_content](const Request &, Response &res){
+    svr.Get(sparams.request_path + "/", [&default_content](const Request &, Response &res){
        res.set_content(default_content, "text/html");
        return false;
    });

-    svr.Post("/inference", [&](const Request &req, Response &res){
+    svr.Options(sparams.request_path + "/inference", [&](const Request &req, Response &res){
+    });
+
+    svr.Post(sparams.request_path + "/inference", [&](const Request &req, Response &res){
        // acquire whisper model mutex lock
-        whisper_mutex.lock();
+        std::lock_guard<std::mutex> lock(whisper_mutex);

        // first check user requested fields of the request
        if (!req.has_file("file"))
@ -491,7 +642,6 @@ int main(int argc, char ** argv) {
            fprintf(stderr, "error: no 'file' field in the request\n");
            const std::string error_resp = "{\"error\":\"no 'file' field in the request\"}";
            res.set_content(error_resp, "application/json");
-            whisper_mutex.unlock();
            return;
        }
        auto audio_file = req.get_file_value("file");
@ -506,35 +656,42 @@ int main(int argc, char ** argv) {
        std::vector<float> pcmf32;               // mono-channel F32 PCM
        std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM

-        // write to temporary file
-        const std::string temp_filename = "whisper_server_temp_file.wav";
-        std::ofstream temp_file{temp_filename, std::ios::binary};
-        temp_file << audio_file.content;
-        temp_file.close();
-
-        // if file is not wav, convert to wav
-        
        if (sparams.ffmpeg_converter) {
+            // if file is not wav, convert to wav
+            // write to temporary file
+            const std::string temp_filename = "whisper_server_temp_file.wav";
+            std::ofstream temp_file{temp_filename, std::ios::binary};
+            temp_file << audio_file.content;
+            temp_file.close();
+
            std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
            const bool is_converted = convert_to_wav(temp_filename, error_resp);
            if (!is_converted) {
                res.set_content(error_resp, "application/json");
-                whisper_mutex.unlock();
+                return;
+            }
+
+            // read wav content into pcmf32
+            if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize))
+            {
+                fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
+                const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
+                res.set_content(error_resp, "application/json");
+                std::remove(temp_filename.c_str());
+                return;
+            }
+            // remove temp file
+            std::remove(temp_filename.c_str());
+        } else {
+            if (!::read_wav(audio_file.content, pcmf32, pcmf32s, params.diarize))
+            {
+                fprintf(stderr, "error: failed to read WAV file\n");
+                const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
+                res.set_content(error_resp, "application/json");
                return;
            }
        }

-        // read wav content into pcmf32
-        if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize)) {
-            fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
-            const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
-            res.set_content(error_resp, "application/json");
-            std::remove(temp_filename.c_str());
-            whisper_mutex.unlock();
-            return;
-        }
-        // remove temp file
-        std::remove(temp_filename.c_str());

        printf("Successfully loaded %s\n", filename.c_str());

@ -591,6 +748,7 @@ int main(int argc, char ** argv) {
            wparams.thold_pt         = params.word_thold;
            wparams.max_len          = params.max_len == 0 ? 60 : params.max_len;
            wparams.split_on_word    = params.split_on_word;
+            wparams.audio_ctx        = params.audio_ctx;

            wparams.speed_up         = params.speed_up;
            wparams.debug_mode       = params.debug_mode;
@ -602,10 +760,14 @@ int main(int argc, char ** argv) {
            wparams.greedy.best_of        = params.best_of;
            wparams.beam_search.beam_size = params.beam_size;

-            wparams.temperature_inc  = params.userdef_temp;
+            wparams.temperature      = params.temperature;
+            wparams.temperature_inc  = params.temperature_inc;
            wparams.entropy_thold    = params.entropy_thold;
            wparams.logprob_thold    = params.logprob_thold;

+            wparams.no_timestamps    = params.no_timestamps;
+            wparams.token_timestamps = !params.no_timestamps && params.response_format == vjson_format;
+
            whisper_print_user_data user_data = { &params, &pcmf32s, 0 };

            // this callback is called on each new segment
@ -648,7 +810,6 @@ int main(int argc, char ** argv) {
                fprintf(stderr, "%s: failed to process audio\n", argv[0]);
                const std::string error_resp = "{\"error\":\"failed to process audio\"}";
                res.set_content(error_resp, "application/json");
-                whisper_mutex.unlock();
                return;
            }
        }
@ -702,6 +863,59 @@ int main(int argc, char ** argv) {
                ss << speaker << text << "\n\n";
            }
            res.set_content(ss.str(), "text/vtt");
+        } else if (params.response_format == vjson_format) {
+            /* try to match openai/whisper's Python format */
+            std::string results = output_str(ctx, params, pcmf32s);
+            json jres = json{
+                {"task", params.translate ? "translate" : "transcribe"},
+                {"language", whisper_lang_str_full(whisper_full_lang_id(ctx))},
+                {"duration", float(pcmf32.size())/WHISPER_SAMPLE_RATE},
+                {"text", results},
+                {"segments", json::array()}
+            };
+            const int n_segments = whisper_full_n_segments(ctx);
+            for (int i = 0; i < n_segments; ++i)
+            {
+                json segment = json{
+                    {"id", i},
+                    {"text", whisper_full_get_segment_text(ctx, i)},
+                };
+
+                if (!params.no_timestamps) {
+                    segment["start"] = whisper_full_get_segment_t0(ctx, i) * 0.01;
+                    segment["end"] = whisper_full_get_segment_t1(ctx, i) * 0.01;
+                }
+
+                float total_logprob = 0;
+                const int n_tokens = whisper_full_n_tokens(ctx, i);
+                for (int j = 0; j < n_tokens; ++j) {
+                    whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
+                    if (token.id >= whisper_token_eot(ctx)) {
+                        continue;
+                    }
+
+                    segment["tokens"].push_back(token.id);
+                    json word = json{{"word", whisper_full_get_token_text(ctx, i, j)}};
+                    if (!params.no_timestamps) {
+                        word["start"] = token.t0 * 0.01;
+                        word["end"] = token.t1 * 0.01;
+                    }
+                    word["probability"] = token.p;
+                    total_logprob += token.plog;
+                    segment["words"].push_back(word);
+                }
+
+                segment["temperature"] = params.temperature;
+                segment["avg_logprob"] = total_logprob / n_tokens;
+
+                // TODO compression_ratio and no_speech_prob are not implemented yet
+                // segment["compression_ratio"] = 0;
+                // segment["no_speech_prob"] = 0;
+
+                jres["segments"].push_back(segment);
+            }
+            res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
+                            "application/json");
        }
        // TODO add more output formats
        else
@ -714,17 +928,16 @@ int main(int argc, char ** argv) {
                            "application/json");
        }

-        // return whisper model mutex lock
-        whisper_mutex.unlock();
+        // reset params to thier defaults
+        params = default_params;
    });
-    svr.Post("/load", [&](const Request &req, Response &res){
-        whisper_mutex.lock();
+    svr.Post(sparams.request_path + "/load", [&](const Request &req, Response &res){
+        std::lock_guard<std::mutex> lock(whisper_mutex);
        if (!req.has_file("model"))
        {
            fprintf(stderr, "error: no 'model' field in the request\n");
            const std::string error_resp = "{\"error\":\"no 'model' field in the request\"}";
            res.set_content(error_resp, "application/json");
-            whisper_mutex.unlock();
            return;
        }
        std::string model = req.get_file_value("model").content;
@ -733,7 +946,6 @@ int main(int argc, char ** argv) {
            fprintf(stderr, "error: 'model': %s not found!\n", model.c_str());
            const std::string error_resp = "{\"error\":\"model not found!\"}";
            res.set_content(error_resp, "application/json");
-            whisper_mutex.unlock();
            return;
        }

@ -756,7 +968,6 @@ int main(int argc, char ** argv) {
        res.set_content(success, "application/text");

        // check if the model is in the file system
-        whisper_mutex.unlock();
    });

    svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
@ -773,11 +984,11 @@ int main(int argc, char ** argv) {
        res.status = 500;
    });

-    svr.set_error_handler([](const Request &, Response &res) {
+    svr.set_error_handler([](const Request &req, Response &res) {
        if (res.status == 400) {
            res.set_content("Invalid request", "text/plain");
        } else if (res.status != 500) {
-            res.set_content("File Not Found", "text/plain");
+            res.set_content("File Not Found (" + req.path + ")", "text/plain");
            res.status = 404;
        }
    });
--- a/examples/stream/README.md
+++ b/examples/stream/README.md
@ -4,7 +4,7 @@ This is a naive example of performing real-time inference on audio from your mic
 The `stream` tool samples the audio every half a second and runs the transcription continously.
 More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).

-```java
+```bash
 ./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
 ```

@ -14,7 +14,7 @@ https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a

 Setting the `--step` argument to `0` enables the sliding window mode:

-```java
+```bash
 ./stream -m ./models/ggml-small.en.bin -t 6 --step 0 --length 30000 -vth 0.6
 ```

@ -39,8 +39,8 @@ brew install sdl2
 make stream
 ```

-Ensure you are at the root of the repo when running `make stream`.  Not within the `examples/stream` dir
-as the libraries needed like `common-sdl.h` are located within `examples`.  Attempting to compile within
+Ensure you are at the root of the repo when running `make stream`. Not within the `examples/stream` dir
+as the libraries needed like `common-sdl.h` are located within `examples`. Attempting to compile within
 `examples/steam` means your compiler cannot find them and it gives an error it cannot find the file.

 ```bash
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@ -166,7 +166,7 @@ int main(int argc, char ** argv) {
        exit(0);
    }

-    struct whisper_context_params cparams;
+    struct whisper_context_params cparams = whisper_context_default_params();
    cparams.use_gpu = params.use_gpu;

    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
--- a/examples/talk-llama/CMakeLists.txt
+++ b/examples/talk-llama/CMakeLists.txt
@ -1,30 +1,18 @@
 if (WHISPER_SDL2)
    # talk-llama
    set(TARGET talk-llama)
-    #add_executable(${TARGET} talk-llama.cpp llama.cpp)
-    #target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
-    #target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
+    add_executable(${TARGET} talk-llama.cpp llama.cpp)
+    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})

-    # TODO: this is temporary
-    #       need to export ggml symbols for MSVC, but too lazy ..
-    add_executable(${TARGET}
-        talk-llama.cpp
-        llama.cpp
-        ../common.cpp
-        ../common-sdl.cpp
-        ../../ggml.c
-        ../../ggml-alloc.c
-        ../../ggml-backend.c
-        ../../ggml-quants.c
-        ../../whisper.cpp)
+    if (WHISPER_CLBLAST)
+        set(CLBLAST_LIBNAME clblast)
+    endif ()
+    target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CLBLAST_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})

    if(WIN32)
-    # It requires Windows 8.1 or later for PrefetchVirtualMemory
-    target_compile_definitions(${TARGET} PRIVATE -D_WIN32_WINNT=0x0602)
+        # It requires Windows 8.1 or later for PrefetchVirtualMemory
+        target_compile_definitions(${TARGET} PRIVATE -D_WIN32_WINNT=0x0602)
    endif()

-    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
-    target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
-
    include(DefaultTargetOptions)
 endif ()
--- a/examples/talk-llama/llama.cpp
+++ b/examples/talk-llama/llama.cpp
--- a/examples/talk-llama/llama.h
+++ b/examples/talk-llama/llama.h
@ -2,12 +2,8 @@
 #define LLAMA_H

 #include "ggml.h"
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
-#else
-#define LLAMA_MAX_DEVICES 1
-#endif // GGML_USE_CUBLAS
+#include "ggml-backend.h"
+
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
@ -39,15 +35,11 @@

 #define LLAMA_MAX_RNG_STATE (64*1024)

+#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'

 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 2
-
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
-// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
-#define LLAMA_SUPPORTS_GPU_OFFLOAD
-#endif
+#define LLAMA_SESSION_VERSION 4

 #ifdef __cplusplus
 extern "C" {
@ -69,6 +61,7 @@ extern "C" {
    enum llama_vocab_type {
        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
+        LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
    };

    enum llama_token_type {
@ -102,6 +95,11 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q6_K          = 18, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_XXS       = 19, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_XS        = 20, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K_S        = 21, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q3_K_XS       = 22, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ3_XXS       = 23, // except 1d tensors

        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
    };
@ -114,6 +112,12 @@ extern "C" {
        LLAMA_ROPE_SCALING_MAX_VALUE   = LLAMA_ROPE_SCALING_YARN,
    };

+    enum llama_split_mode {
+        LLAMA_SPLIT_NONE    = 0, // single GPU
+        LLAMA_SPLIT_LAYER   = 1, // split layers and KV across GPUs
+        LLAMA_SPLIT_ROW     = 2, // split rows across GPUs
+    };
+
    typedef struct llama_token_data {
        llama_token id; // token id
        float logit;    // log-odds of the token
@ -126,7 +130,7 @@ extern "C" {
        bool sorted;
    } llama_token_data_array;

-    typedef void (*llama_progress_callback)(float progress, void *ctx);
+    typedef bool (*llama_progress_callback)(float progress, void *ctx);

    // Input data for llama_decode
    // A llama_batch object can contain input about one or many sequences
@ -158,16 +162,46 @@ extern "C" {
        llama_seq_id all_seq_id; // used if seq_id == NULL
    } llama_batch;

+    enum llama_model_kv_override_type {
+        LLAMA_KV_OVERRIDE_INT,
+        LLAMA_KV_OVERRIDE_FLOAT,
+        LLAMA_KV_OVERRIDE_BOOL,
+    };
+
+    struct llama_model_kv_override {
+        char key[128];
+        enum llama_model_kv_override_type tag;
+        union {
+            int64_t int_value;
+            double float_value;
+            bool bool_value;
+        };
+    };
+
    struct llama_model_params {
        int32_t n_gpu_layers; // number of layers to store in VRAM
-        int32_t main_gpu;     // the GPU that is used for scratch and small tensors
-        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+        enum llama_split_mode split_mode; // how to split the model across multiple GPUs

-        // called with a progress value between 0 and 1, pass NULL to disable
+        // main_gpu interpretation depends on split_mode:
+        // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
+        // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
+        // LLAMA_SPLIT_LAYER: ignored
+        int32_t main_gpu;
+
+        // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
+        const float * tensor_split;
+
+        // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+        // If the provided progress_callback returns true, model loading continues.
+        // If it returns false, model loading is immediately aborted.
        llama_progress_callback progress_callback;
+
        // context pointer passed to the progress callback
        void * progress_callback_user_data;

+        // override key-value pairs of the model meta data
+        const struct llama_model_kv_override * kv_overrides;
+
        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool vocab_only; // only load the vocabulary, no weights
        bool use_mmap;   // use mmap if possible
@ -180,32 +214,39 @@ extern "C" {
        uint32_t n_batch;           // prompt processing maximum batch size
        uint32_t n_threads;         // number of threads to use for generation
        uint32_t n_threads_batch;   // number of threads to use for batch processing
-        int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+        int32_t  rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`

        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
        float    rope_freq_base;   // RoPE base frequency, 0 = from model
        float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
-        float    yarn_ext_factor;  // YaRN extrapolation mix factor, NaN = from model
+        float    yarn_ext_factor;  // YaRN extrapolation mix factor, negative = from model
        float    yarn_attn_factor; // YaRN magnitude scaling factor
        float    yarn_beta_fast;   // YaRN low correction dim
        float    yarn_beta_slow;   // YaRN high correction dim
        uint32_t yarn_orig_ctx;    // YaRN original context size

+        ggml_backend_sched_eval_callback cb_eval;
+        void * cb_eval_user_data;
+
+        enum ggml_type type_k; // data type for K cache
+        enum ggml_type type_v; // data type for V cache
+
        // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-        bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
-        bool logits_all; // the llama_eval() call computes all logits, not just the last one
-        bool embedding;  // embedding mode only
+        bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
+        bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+        bool embedding;   // embedding mode only
+        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
    };

    // model quantization parameters
    typedef struct llama_model_quantize_params {
-        int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        int32_t nthread;             // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
        enum llama_ftype ftype;      // quantize to this llama_ftype
        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
        bool quantize_output_tensor; // quantize output.weight
        bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
        bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
+        void * imatrix;              // pointer to importance matrix data
    } llama_model_quantize_params;

    // grammar types
@ -284,25 +325,48 @@ extern "C" {

    LLAMA_API int64_t llama_time_us(void);

-    LLAMA_API int  llama_max_devices    (void);
-    LLAMA_API bool llama_mmap_supported (void);
-    LLAMA_API bool llama_mlock_supported(void);
+    LLAMA_API size_t llama_max_devices(void);
+
+    LLAMA_API bool llama_supports_mmap       (void);
+    LLAMA_API bool llama_supports_mlock      (void);
+    LLAMA_API bool llama_supports_gpu_offload(void);
+
+    LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
+    LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");

    LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);

-    LLAMA_API int llama_n_ctx      (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);

    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);

-    LLAMA_API int llama_n_vocab    (const struct llama_model * model);
-    LLAMA_API int llama_n_ctx_train(const struct llama_model * model);
-    LLAMA_API int llama_n_embd     (const struct llama_model * model);
+    LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
+    LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
+    LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);

    // Get the model's RoPE frequency scaling factor
    LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);

+    // Functions to access the model's GGUF metadata scalar values
+    // - The functions return the length of the string on success, or -1 on failure
+    // - The output string is always null-terminated and cleared on failure
+    // - GGUF array values are not supported by these functions
+
+    // Get metadata value as a string by key name
+    LLAMA_API int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
+
+    // Get the number of metadata key/value pairs
+    LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model);
+
+    // Get metadata key name by index
+    LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
+
+    // Get metadata value as a string by index
+    LLAMA_API int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
+
    // Get a string describing the model type
-    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+    LLAMA_API int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);

    // Returns the total size of all the tensors in the model in bytes
    LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
@ -314,7 +378,7 @@ extern "C" {
    LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);

    // Returns 0 on success
-    LLAMA_API int llama_model_quantize(
+    LLAMA_API uint32_t llama_model_quantize(
            const char * fname_inp,
            const char * fname_out,
            const llama_model_quantize_params * params);
@ -325,28 +389,79 @@ extern "C" {
    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
    // will be applied on top of the previous one
    // Returns 0 on success
-    LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
+    LLAMA_API DEPRECATED(int32_t llama_apply_lora_from_file(
            struct llama_context * ctx,
                      const char * path_lora,
                           float   scale,
                      const char * path_base_model,
-                             int   n_threads),
+                         int32_t   n_threads),
            "use llama_model_apply_lora_from_file instead");

-    LLAMA_API int llama_model_apply_lora_from_file(
+    LLAMA_API int32_t llama_model_apply_lora_from_file(
            const struct llama_model * model,
                      const char * path_lora,
                           float   scale,
                      const char * path_base_model,
-                             int   n_threads);
+                         int32_t   n_threads);

    //
    // KV cache
    //

-    // Returns the number of tokens in the KV cache
-    LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
-            "avoid using this, it will be removed in the future, instead - count the tokens in user code");
+    // Information associated with an individual cell in the KV cache view.
+    struct llama_kv_cache_view_cell {
+        // The position for this cell. Takes KV cache shifts into account.
+        // May be negative if the cell is not populated.
+        llama_pos pos;
+    };
+
+    // An updateable view of the KV cache.
+    struct llama_kv_cache_view {
+        // Number of KV cache cells. This will be the same as the context size.
+        int32_t n_cells;
+
+        // Maximum number of sequences that can exist in a cell. It's not an error
+        // if there are more sequences in a cell than this value, however they will
+        // not be visible in the view cells_sequences.
+        int32_t n_max_seq;
+
+        // Number of tokens in the cache. For example, if there are two populated
+        // cells, the first with 1 sequence id in it and the second with 2 sequence
+        // ids then you'll have 3 tokens.
+        int32_t token_count;
+
+        // Number of populated cache cells.
+        int32_t used_cells;
+
+        // Maximum contiguous empty slots in the cache.
+        int32_t max_contiguous;
+
+        // Index to the start of the max_contiguous slot range. Can be negative
+        // when cache is full.
+        int32_t max_contiguous_idx;
+
+        // Information for an individual cell.
+        struct llama_kv_cache_view_cell * cells;
+
+        // The sequences for each cell. There will be n_max_seq items per cell.
+        llama_seq_id * cells_sequences;
+    };
+
+    // Create an empty KV cache view. (use only for debugging purposes)
+    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+
+    // Free a KV cache view. (use only for debugging purposes)
+    LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
+
+    // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+    LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+
+    // Returns the number of tokens in the KV cache (slow, use only for debug)
+    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+    LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
+
+    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
+    LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);

    // Clear the KV cache
    LLAMA_API void llama_kv_cache_clear(
@ -389,6 +504,17 @@ extern "C" {
                       llama_pos   p1,
                       llama_pos   delta);

+    // Integer division of the positions by factor of `d > 1`
+    // If the KV cache is RoPEd, the KV data is updated accordingly
+    // p0 < 0 : [0,  p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_kv_cache_seq_div(
+            struct llama_context * ctx,
+                    llama_seq_id   seq_id,
+                       llama_pos   p0,
+                       llama_pos   p1,
+                             int   d);
+
    //
    // State / sessions
    //
@ -437,7 +563,7 @@ extern "C" {
            struct llama_context * ctx,
                     llama_token * tokens,
                         int32_t   n_tokens,
-                             int   n_past),
+                         int32_t   n_past),
            "use llama_decode() instead");

    // Same as llama_eval, but use float matrix input directly.
@ -446,7 +572,7 @@ extern "C" {
            struct llama_context * ctx,
                           float * embd,
                         int32_t   n_tokens,
-                             int   n_past),
+                         int32_t   n_past),
            "use llama_decode() instead");

    // Return batch for single sequence of tokens starting at pos_0
@ -478,7 +604,7 @@ extern "C" {
    //   0 - success
    //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
    // < 0 - error
-    LLAMA_API int llama_decode(
+    LLAMA_API int32_t llama_decode(
            struct llama_context * ctx,
              struct llama_batch   batch);

@ -517,6 +643,12 @@ extern "C" {
    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line

+    // Returns -1 if unknown, 1 for true or 0 for false.
+    LLAMA_API int32_t         llama_add_bos_token(const struct llama_model * model);
+
+    // Returns -1 if unknown, 1 for true or 0 for false.
+    LLAMA_API int32_t         llama_add_eos_token(const struct llama_model * model);
+
    // codellama infill tokens
    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
    LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
@ -533,12 +665,12 @@ extern "C" {
    /// @return Returns a negative number on failure - the number of tokens that would have been returned
    /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
    ///                Does not insert a leading space.
-    LLAMA_API int llama_tokenize(
+    LLAMA_API int32_t llama_tokenize(
        const struct llama_model * model,
                      const char * text,
-                             int   text_len,
+                         int32_t   text_len,
                     llama_token * tokens,
-                             int   n_max_tokens,
+                         int32_t   n_max_tokens,
                            bool   add_bos,
                            bool   special);

@ -546,11 +678,11 @@ extern "C" {
    // Uses the vocabulary in the provided context.
    // Does not write null terminator to the buffer.
    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
-    LLAMA_API int llama_token_to_piece(
+    LLAMA_API int32_t llama_token_to_piece(
              const struct llama_model * model,
                           llama_token   token,
                                  char * buf,
-                                  int    length);
+                               int32_t   length);

    //
    // Grammar
@ -584,14 +716,21 @@ extern "C" {
                           float   penalty_present);

    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
-    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
-    /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
-    /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-    LLAMA_API void llama_sample_classifier_free_guidance(
+    /// @param logits Logits extracted from the original generation context.
+    /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+    /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+    LLAMA_API void llama_sample_apply_guidance(
+              struct llama_context * ctx,
+                             float * logits,
+                             float * logits_guidance,
+                             float   scale);
+
+    LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
              struct llama_context * ctx,
            llama_token_data_array * candidates,
              struct llama_context * guidance_ctx,
-                             float   scale);
+                             float   scale),
+              "use llama_sample_apply_guidance() instead");

    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
    LLAMA_API void llama_sample_softmax(
@ -602,7 +741,7 @@ extern "C" {
    LLAMA_API void llama_sample_top_k(
            struct llama_context * ctx,
          llama_token_data_array * candidates,
-                             int   k,
+                         int32_t   k,
                          size_t   min_keep);

    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
@ -633,6 +772,14 @@ extern "C" {
                           float   p,
                          size_t   min_keep);

+    /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
+    LLAMA_API void llama_sample_entropy(
+            struct llama_context * ctx,
+          llama_token_data_array * candidates_p,
+                           float   min_temp,
+                           float   max_temp,
+                           float   exponent_val);
+
    LLAMA_API void llama_sample_temp(
            struct llama_context * ctx,
          llama_token_data_array * candidates,
@ -661,7 +808,7 @@ extern "C" {
          llama_token_data_array * candidates,
                           float   tau,
                           float   eta,
-                             int   m,
+                         int32_t   m,
                           float * mu);

    /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
@ -734,8 +881,8 @@ extern "C" {
        llama_beam_search_callback_fn_t   callback,
                                   void * callback_data,
                                 size_t   n_beams,
-                                    int   n_past,
-                                    int   n_predict);
+                                int32_t   n_past,
+                                int32_t   n_predict);

    // Performance information
    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
--- a/examples/talk-llama/speak
+++ b/examples/talk-llama/speak
@ -9,6 +9,14 @@
 #
 #espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 "$2"

+# piper
+#
+# https://github.com/rhasspy/piper
+#
+# Tested with Linux:
+#
+#echo "$2" | piper --model ~/en_US-lessac-medium.onnx --output-raw | aplay -q -r 22050 -f S16_LE -t raw -
+
 # for Mac
 say "$2"

--- a/examples/talk-llama/talk-llama.cpp
+++ b/examples/talk-llama/talk-llama.cpp
@ -14,6 +14,7 @@
 #include <thread>
 #include <vector>
 #include <regex>
+#include <sstream>

 std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
    auto * model = llama_get_model(ctx);
@ -67,6 +68,9 @@ struct whisper_params {
    bool use_gpu        = true;

    std::string person      = "Georgi";
+    std::string bot_name    = "LLaMA";
+    std::string wake_cmd    = "";
+    std::string heard_ok    = "";
    std::string language    = "en";
    std::string model_wsp   = "models/ggml-base.en.bin";
    std::string model_llama = "models/ggml-llama-7B.bin";
@ -101,7 +105,10 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-vp"  || arg == "--verbose-prompt") { params.verbose_prompt = true; }
        else if (arg == "-ng"  || arg == "--no-gpu")         { params.use_gpu        = false; }
        else if (arg == "-p"   || arg == "--person")         { params.person         = argv[++i]; }
-        else if (arg == "--session")                         { params.path_session   = argv[++i];}
+        else if (arg == "-bn"   || arg == "--bot-name")      { params.bot_name       = argv[++i]; }
+        else if (arg == "--session")                         { params.path_session   = argv[++i]; }
+        else if (arg == "-w"   || arg == "--wake-command")   { params.wake_cmd       = argv[++i]; }
+        else if (arg == "-ho"  || arg == "--heard-ok")       { params.heard_ok       = argv[++i]; }
        else if (arg == "-l"   || arg == "--language")       { params.language       = argv[++i]; }
        else if (arg == "-mw"  || arg == "--model-whisper")  { params.model_wsp      = argv[++i]; }
        else if (arg == "-ml"  || arg == "--model-llama")    { params.model_llama    = argv[++i]; }
@ -146,6 +153,9 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -vp,      --verbose-prompt [%-7s] print prompt at start\n",                       params.verbose_prompt ? "true" : "false");
    fprintf(stderr, "  -ng,      --no-gpu         [%-7s] disable GPU\n",                                 params.use_gpu ? "false" : "true");
    fprintf(stderr, "  -p NAME,  --person NAME    [%-7s] person name (for prompt selection)\n",          params.person.c_str());
+    fprintf(stderr, "  -bn NAME, --bot-name NAME  [%-7s] bot name (to display)\n",                       params.bot_name.c_str());
+    fprintf(stderr, "  -w TEXT,  --wake-command T [%-7s] wake-up command to listen for\n",               params.wake_cmd.c_str());
+    fprintf(stderr, "  -ho TEXT, --heard-ok TEXT  [%-7s] said by TTS before generating reply\n",         params.heard_ok.c_str());
    fprintf(stderr, "  -l LANG,  --language LANG  [%-7s] spoken language\n",                             params.language.c_str());
    fprintf(stderr, "  -mw FILE, --model-whisper  [%-7s] whisper model file\n",                          params.model_wsp.c_str());
    fprintf(stderr, "  -ml FILE, --model-llama    [%-7s] llama model file\n",                            params.model_llama.c_str());
@ -224,6 +234,18 @@ std::string transcribe(
    return result;
 }

+std::vector<std::string> get_words(const std::string &txt) {
+    std::vector<std::string> words;
+
+    std::istringstream iss(txt);
+    std::string word;
+    while (iss >> word) {
+        words.push_back(word);
+    }
+
+    return words;
+}
+
 const std::string k_prompt_whisper = R"(A conversation with a person called {1}.)";

 const std::string k_prompt_llama = R"(Text transcript of a never ending dialog, where {0} interacts with an AI assistant named {1}.
@ -259,7 +281,7 @@ int main(int argc, char ** argv) {

    // whisper init

-    struct whisper_context_params cparams;
+    struct whisper_context_params cparams = whisper_context_default_params();
    cparams.use_gpu = params.use_gpu;

    struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams);
@ -282,7 +304,6 @@ int main(int argc, char ** argv) {
    // tune these to your liking
    lcparams.n_ctx      = 2048;
    lcparams.seed       = 1;
-    lcparams.f16_kv     = true;
    lcparams.n_threads  = params.n_threads;

    struct llama_context * ctx_llama = llama_new_context_with_model(model_llama, lcparams);
@ -324,12 +345,11 @@ int main(int argc, char ** argv) {
    float prob0 = 0.0f;

    const std::string chat_symb = ":";
-    const std::string bot_name  = "LLaMA";

    std::vector<float> pcmf32_cur;
    std::vector<float> pcmf32_prompt;

-    const std::string prompt_whisper = ::replace(k_prompt_whisper, "{1}", bot_name);
+    const std::string prompt_whisper = ::replace(k_prompt_whisper, "{1}", params.bot_name);

    // construct the initial prompt for LLaMA inference
    std::string prompt_llama = params.prompt.empty() ? k_prompt_llama : params.prompt;
@ -338,7 +358,7 @@ int main(int argc, char ** argv) {
    prompt_llama.insert(0, 1, ' ');

    prompt_llama = ::replace(prompt_llama, "{0}", params.person);
-    prompt_llama = ::replace(prompt_llama, "{1}", bot_name);
+    prompt_llama = ::replace(prompt_llama, "{1}", params.bot_name);

    {
        // get time string
@ -440,6 +460,16 @@ int main(int argc, char ** argv) {
    bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < (embd_inp.size() * 3 / 4);

    printf("%s : done! start speaking in the microphone\n", __func__);
+
+    // show wake command if enabled
+    const std::string wake_cmd = params.wake_cmd;
+    const int wake_cmd_length = get_words(wake_cmd).size();
+    const bool use_wake_cmd = wake_cmd_length > 0;
+
+    if (use_wake_cmd) {
+        printf("%s : the wake-up command is: '%s%s%s'\n", __func__, "\033[1m", wake_cmd.c_str(), "\033[0m");
+    }
+
    printf("\n");
    printf("%s%s", params.person.c_str(), chat_symb.c_str());
    fflush(stdout);
@ -485,10 +515,41 @@ int main(int argc, char ** argv) {

                audio.get(params.voice_ms, pcmf32_cur);

-                std::string text_heard;
+                std::string all_heard;

                if (!force_speak) {
-                    text_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prompt_whisper, prob0, t_ms));
+                    all_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prompt_whisper, prob0, t_ms));
+                }
+
+                const auto words = get_words(all_heard);
+
+                std::string wake_cmd_heard;
+                std::string text_heard;
+
+                for (int i = 0; i < (int) words.size(); ++i) {
+                    if (i < wake_cmd_length) {
+                        wake_cmd_heard += words[i] + " ";
+                    } else {
+                        text_heard += words[i] + " ";
+                    }
+                }
+
+                // check if audio starts with the wake-up command if enabled
+                if (use_wake_cmd) {
+                    const float sim = similarity(wake_cmd_heard, wake_cmd);
+
+                    if ((sim < 0.7f) || (text_heard.empty())) {
+                        audio.clear();
+                        continue;
+                    }
+                }
+
+                // optionally give audio feedback that the current text is being processed
+                if (!params.heard_ok.empty()) {
+                    int ret = system((params.speak + " " + std::to_string(voice_id) + " '" + params.heard_ok + "'").c_str());
+                    if (ret != 0) {
+                        fprintf(stderr, "%s: failed to speak\n", __func__);
+                    }
                }

                // remove text between brackets using regex
@ -525,7 +586,7 @@ int main(int argc, char ** argv) {
                force_speak = false;

                text_heard.insert(0, 1, ' ');
-                text_heard += "\n" + bot_name + chat_symb;
+                text_heard += "\n" + params.bot_name + chat_symb;
                fprintf(stdout, "%s%s%s", "\033[1m", text_heard.c_str(), "\033[0m");
                fflush(stdout);

@ -658,6 +719,7 @@ int main(int argc, char ** argv) {
                            text_to_speak += llama_token_to_piece(ctx_llama, id);

                            printf("%s", llama_token_to_piece(ctx_llama, id).c_str());
+                            fflush(stdout);
                        }
                    }

--- a/examples/talk-llama/unicode.h
+++ b/examples/talk-llama/unicode.h
@ -2,8 +2,9 @@

 #include <cassert>
 #include <stdexcept>
-#include <vector>
+#include <string>
 #include <unordered_map>
+#include <vector>

 static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
 {0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},
--- a/examples/talk.wasm/gpt-2.cpp
+++ b/examples/talk.wasm/gpt-2.cpp
@ -155,33 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
        const int n_ctx   = hparams.n_ctx;
        const int n_vocab = hparams.n_vocab;

-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
+        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
+        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b

-        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // wte
-        ctx_size +=   n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
-        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // lm_head
+        ctx_size += n_vocab*ggml_row_size(wtype, n_embd);         // wte
+        ctx_size +=   n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
+        ctx_size += n_vocab*ggml_row_size(wtype, n_embd);         // lm_head

-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b

-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b

-        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype));         // c_attn_attn_w
-        ctx_size += n_layer*(       3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         3*n_embd*n_embd)); // c_attn_attn_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd));        // c_attn_attn_b

-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype));           // c_attn_proj_w
-        ctx_size += n_layer*(       n_embd*ggml_type_sizef(GGML_TYPE_F32));   // c_attn_proj_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         n_embd*n_embd)); // c_attn_proj_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));        // c_attn_proj_b

-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_fc_w
-        ctx_size += n_layer*(       4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_fc_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_fc_b

-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w
-        ctx_size += n_layer*(         n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_proj_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32,   n_embd));        // c_mlp_proj_b

-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
+        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
+        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v

        ctx_size += (6 + 12*n_layer)*256; // object overhead

@ -524,8 +524,7 @@ bool gpt2_eval(
            struct ggml_tensor * KQ_scaled =
                ggml_scale(ctx0,
                        KQ,
-                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
-                        );
+                        1.0f/sqrt(float(n_embd)/n_head));

            // KQ_masked = mask_past(KQ_scaled)
            // [n_past + N, N, 12]
--- a/examples/talk/gpt-2.cpp
+++ b/examples/talk/gpt-2.cpp
@ -155,33 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
        const int n_ctx   = hparams.n_ctx;
        const int n_vocab = hparams.n_vocab;

-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
+        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
+        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b

-        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // wte
-        ctx_size +=   n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
-        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // lm_head
+        ctx_size += n_vocab*ggml_row_size(wtype, n_embd);         // wte
+        ctx_size +=   n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
+        ctx_size += n_vocab*ggml_row_size(wtype, n_embd);         // lm_head

-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b

-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b

-        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype));         // c_attn_attn_w
-        ctx_size += n_layer*(       3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         3*n_embd*n_embd)); // c_attn_attn_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd));        // c_attn_attn_b

-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype));           // c_attn_proj_w
-        ctx_size += n_layer*(       n_embd*ggml_type_sizef(GGML_TYPE_F32));   // c_attn_proj_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         n_embd*n_embd)); // c_attn_proj_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));        // c_attn_proj_b

-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_fc_w
-        ctx_size += n_layer*(       4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_fc_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_fc_b

-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w
-        ctx_size += n_layer*(         n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_proj_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32,   n_embd));        // c_mlp_proj_b

-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
+        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
+        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v

        ctx_size += (6 + 12*n_layer)*256; // object overhead

@ -525,8 +525,7 @@ bool gpt2_eval(
            struct ggml_tensor * KQ_scaled =
                ggml_scale(ctx0,
                        KQ,
-                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
-                        );
+                        1.0f/sqrt(float(n_embd)/n_head));

            // KQ_masked = mask_past(KQ_scaled)
            // [n_past + N, N, 12]
--- a/examples/talk/talk.cpp
+++ b/examples/talk/talk.cpp
@ -184,7 +184,7 @@ int main(int argc, char ** argv) {
    }

    // whisper init
-    struct whisper_context_params cparams;
+    struct whisper_context_params cparams = whisper_context_default_params();
    cparams.use_gpu = params.use_gpu;

    struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams);
--- a/examples/wchess/README.md
+++ b/examples/wchess/README.md
@ -33,8 +33,13 @@ White's turn

 ## TODO

- Improve web-browser audio capture - sometimes it does not record the voice properly
- Add support for more languages by making the generated grammar string multi-lingual
 - Fix bugs in the chess moves logic
+- Improve web-browser audio capture - sometimes it does not record the voice properly
+- Add support for more languages by making the generated grammar string multilingual
+- Explore ways to improve the dynamic grammar to be narrower

 PRs welcome!
+
+## Thanks
+
+- [chessboardjs](https://chessboardjs.com) for the neat chessboard JS library used in this demo
--- a/examples/wchess/wchess.cmd/wchess.cmd.cpp
+++ b/examples/wchess/wchess.cmd/wchess.cmd.cpp
@ -182,7 +182,7 @@ int main(int argc, char ** argv) {

    // whisper init

-    struct whisper_context_params cparams;
+    struct whisper_context_params cparams = whisper_context_default_params();
    cparams.use_gpu = params.use_gpu;

    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
--- a/examples/whisper.android/README.md
+++ b/examples/whisper.android/README.md
@ -12,3 +12,47 @@ To use:
 (PS: Do not move this android project folder individually to other folders, because this android project folder depends on the files of the whole project.)

 <img width="300" alt="image" src="https://user-images.githubusercontent.com/1670775/221613663-a17bf770-27ef-45ab-9a46-a5f99ba65d2a.jpg">
+
+## CLBlast
+
+> [!NOTE]
+> - OpenCL does not have the same level of support as CUDA or Metal.
+> - Turning on CLBlast may degrade OpenCL performance if your device isn't already tuned. See [tuning.md](https://github.com/CNugteren/CLBlast/blob/162783a414969464ce3aa5adf5c2554afa5ee93e/doc/tuning.md#already-tuned-for-devices) for a list of devices that are already tuned and what to do if yours is missing.
+
+Build CLBlast.
+
+```
+# In path/to/CLBlast (we assume OpenCL-Headers relative location)
+$ANDROID_SDK_PATH/cmake/3.22.1/bin/cmake .. \
+    -DCMAKE_SYSTEM_NAME=Android \
+    -DCMAKE_SYSTEM_VERSION=33 \
+    -DCMAKE_ANDROID_ARCH_ABI=arm64-v8a \
+    -DCMAKE_ANDROID_NDK=$ANDROID_NDK_PATH \
+    -DCMAKE_ANDROID_STL_TYPE=c++_static \
+    -DOPENCL_ROOT=$(readlink -f ../../OpenCL-Headers) \
+    -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=BOTH \
+    -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+# Build libclblast.so
+make -j4
+```
+
+Pull `libGLES_mali.so` to `libOpenCL.so`.
+
+```bash
+# In path/to/whisper.android
+mkdir lib/src/main/jniLibs/arm64-v8a
+adb pull /system/vendor/lib64/egl/libGLES_mali.so lib/src/main/jniLibs/arm64-v8a/libOpenCL.so
+```
+
+In gradle.properties, set `GGML_HOME` to the location of GGML, as well as
+required options for turning on CLBlast.
+
+```
+GGML_HOME=/path/to/ggml
+GGML_CLBLAST=ON
+CLBLAST_HOME=/path/to/CLBlast
+OPENCL_LIB=/path/to/libOpenCL.so
+OPENCL_ROOT=/path/to/OpenCL-Headers
+```
+
--- a/examples/whisper.android/lib/build.gradle
+++ b/examples/whisper.android/lib/build.gradle
@ -16,6 +16,28 @@ android {
        ndk {
            abiFilters 'arm64-v8a', 'armeabi-v7a', 'x86', 'x86_64'
        }
+        externalNativeBuild {
+            cmake {
+                // When set, builds whisper.android against the version located
+                // at GGML_HOME instead of the copy bundled with whisper.cpp.
+                if (
+                    project.hasProperty('GGML_HOME') &&
+                    project.findProperty('GGML_CLBLAST') == 'ON'
+                ) {
+                    // Turning on CLBlast requires GGML_HOME
+                    arguments "-DGGML_HOME=${project.property('GGML_HOME')}",
+                         "-DGGML_CLBLAST=ON",
+                         "-DOPENCL_LIB=${project.property('OPENCL_LIB')}",
+                         "-DCLBLAST_HOME=${project.property('CLBLAST_HOME')}",
+                         "-DOPENCL_ROOT=${project.property('OPENCL_ROOT')}",
+                         "-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH",
+                         "-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=BOTH"
+                } else if (project.hasProperty('GGML_HOME')) {
+                    arguments "-DGGML_HOME=${project.property('GGML_HOME')}"
+                }
+
+            }
+        }
    }

    buildTypes {
--- a/examples/whisper.android/lib/src/main/jni/whisper/CMakeLists.txt
+++ b/examples/whisper.android/lib/src/main/jni/whisper/CMakeLists.txt
@ -3,17 +3,28 @@ cmake_minimum_required(VERSION 3.10)
 project(whisper.cpp)

 set(CMAKE_CXX_STANDARD 11)
-set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../../)
+set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../..)
+
+# Path to external GGML, otherwise uses the copy in whisper.cpp.
+option(GGML_HOME       "whisper: Path to external GGML source" OFF)

 set(
        SOURCE_FILES
+        ${WHISPER_LIB_DIR}/whisper.cpp
+        ${CMAKE_SOURCE_DIR}/jni.c
+)
+
+if (NOT GGML_HOME)
+    set(
+        SOURCE_FILES
+        ${SOURCE_FILES}
        ${WHISPER_LIB_DIR}/ggml.c
        ${WHISPER_LIB_DIR}/ggml-alloc.c
        ${WHISPER_LIB_DIR}/ggml-backend.c
        ${WHISPER_LIB_DIR}/ggml-quants.c
-        ${WHISPER_LIB_DIR}/whisper.cpp
-        ${CMAKE_SOURCE_DIR}/jni.c
-)
+
+    )
+endif()

 find_library(LOG_LIB log)

@ -24,12 +35,12 @@ function(build_library target_name)
        ${SOURCE_FILES}
    )

-    target_link_libraries(${target_name} ${LOG_LIB} android)
-
    if (${target_name} STREQUAL "whisper_v8fp16_va")
        target_compile_options(${target_name} PRIVATE -march=armv8.2-a+fp16)
+        set(GGML_COMPILE_OPTIONS                      -march=armv8.2-a+fp16)
    elseif (${target_name} STREQUAL "whisper_vfpv4")
        target_compile_options(${target_name} PRIVATE -mfpu=neon-vfpv4)
+        set(GGML_COMPILE_OPTIONS                      -mfpu=neon-vfpv4)
    endif ()

    if (NOT ${CMAKE_BUILD_TYPE} STREQUAL "Debug")
@ -43,9 +54,20 @@ function(build_library target_name)
        target_link_options(${target_name} PRIVATE -flto)

    endif ()
-endfunction()

-build_library("whisper") # Default target
+    if (GGML_HOME)
+        include(FetchContent)
+        FetchContent_Declare(ggml SOURCE_DIR ${GGML_HOME})
+        FetchContent_MakeAvailable(ggml)
+
+        target_compile_options(ggml PRIVATE ${GGML_COMPILE_OPTIONS})
+        target_link_libraries(${target_name} ${LOG_LIB} android ggml)
+    else()
+        target_link_libraries(${target_name} ${LOG_LIB} android)
+    endif()
+
+
+endfunction()

 if (${ANDROID_ABI} STREQUAL "arm64-v8a")
    build_library("whisper_v8fp16_va")
@ -53,4 +75,6 @@ elseif (${ANDROID_ABI} STREQUAL "armeabi-v7a")
    build_library("whisper_vfpv4")
 endif ()

+build_library("whisper") # Default target
+
 include_directories(${WHISPER_LIB_DIR})
--- a/examples/whisper.android/lib/src/main/jni/whisper/jni.c
+++ b/examples/whisper.android/lib/src/main/jni/whisper/jni.c
@ -228,6 +228,7 @@ Java_com_whispercpp_whisper_WhisperLib_00024Companion_benchMemcpy(JNIEnv *env, j
    UNUSED(thiz);
    const char *bench_ggml_memcpy = whisper_bench_memcpy_str(n_threads);
    jstring string = (*env)->NewStringUTF(env, bench_ggml_memcpy);
+    return string;
 }

 JNIEXPORT jstring JNICALL
@ -236,4 +237,5 @@ Java_com_whispercpp_whisper_WhisperLib_00024Companion_benchGgmlMulMat(JNIEnv *en
    UNUSED(thiz);
    const char *bench_ggml_mul_mat = whisper_bench_ggml_mul_mat_str(n_threads);
    jstring string = (*env)->NewStringUTF(env, bench_ggml_mul_mat);
+    return string;
 }
--- a/examples/whisper.objc/README.md
+++ b/examples/whisper.objc/README.md
@ -11,11 +11,11 @@ https://user-images.githubusercontent.com/1991296/204126266-ce4177c6-6eca-4bd9-b

 ## Usage

-```java
+```bash
 git clone https://github.com/ggerganov/whisper.cpp
 open whisper.cpp/examples/whisper.objc/whisper.objc.xcodeproj/

-// If you don't want to convert a Core ML model, you can skip this step by create dummy model
+# if you don't want to convert a Core ML model, you can skip this step by create dummy model
 mkdir models/ggml-base.en-encoder.mlmodelc
 ```

--- a/examples/whisper.swiftui/.gitignore
+++ b/examples/whisper.swiftui/.gitignore
@ -0,0 +1,2 @@
+xcuserdata
+xcshareddata
--- a/extra/bench.py
+++ b/extra/bench.py
@ -61,7 +61,9 @@ models = [
    "ggml-small.bin",
    "ggml-medium.en.bin",
    "ggml-medium.bin",
-    "ggml-large.bin",
+    "ggml-large-v1.bin",
+    "ggml-large-v2.bin",
+    "ggml-large-v3.bin",
 ]


--- a/extra/sync-ggml-am.sh
+++ b/extra/sync-ggml-am.sh
@ -0,0 +1,178 @@
+#!/bin/bash
+#
+# Synchronize ggml changes to whisper.cpp
+#
+# Usage:
+#
+#   $ cd /path/to/whisper.cpp
+#   $ ./extra/sync-ggml-am.sh -skip hash0,hash1,hash2...
+#
+
+set -e
+
+sd=$(dirname $0)
+cd $sd/../
+
+SRC_WHISPER=$(pwd)
+SRC_GGML=$(cd ../ggml; pwd)
+
+if [ ! -d $SRC_GGML ]; then
+    echo "ggml not found at $SRC_GGML"
+    exit 1
+fi
+
+lc=$(cat $SRC_WHISPER/extra/sync-ggml.last)
+echo "Syncing ggml changes since commit $lc"
+
+to_skip=""
+if [ "$1" == "-skip" ]; then
+    to_skip=$2
+fi
+
+cd $SRC_GGML
+
+git log --oneline $lc..HEAD
+git log --oneline $lc..HEAD --reverse | grep -v "(whisper/[0-9]*)" | cut -d' ' -f1 > $SRC_WHISPER/ggml-commits
+
+if [ ! -s $SRC_WHISPER/ggml-commits ]; then
+    rm -v $SRC_WHISPER/ggml-commits
+    echo "No new commits"
+    exit 0
+fi
+
+if [ -f $SRC_WHISPER/ggml-src.patch ]; then
+    rm -v $SRC_WHISPER/ggml-src.patch
+fi
+
+while read c; do
+    if [ -n "$to_skip" ]; then
+        if [[ $to_skip == *"$c"* ]]; then
+            echo "Skipping $c"
+            continue
+        fi
+    fi
+
+    git format-patch -k $c~1..$c --stdout -- \
+        include/ggml/ggml*.h \
+        src/ggml*.h \
+        src/ggml*.c \
+        src/ggml*.cpp \
+        src/ggml*.m \
+        src/ggml*.metal \
+        src/ggml*.cu \
+        examples/common.h \
+        examples/common.cpp \
+        examples/common-ggml.h \
+        examples/common-ggml.cpp \
+        examples/whisper/whisper.h \
+        examples/whisper/whisper.cpp \
+        examples/whisper/main.cpp \
+        examples/whisper/quantize.cpp \
+        >> $SRC_WHISPER/ggml-src.patch
+done < $SRC_WHISPER/ggml-commits
+
+rm -v $SRC_WHISPER/ggml-commits
+
+# delete files if empty
+if [ ! -s $SRC_WHISPER/ggml-src.patch ]; then
+    rm -v $SRC_WHISPER/ggml-src.patch
+fi
+
+cd $SRC_WHISPER
+
+if [ -f $SRC_WHISPER/ggml-src.patch ]; then
+    # replace PR numbers
+    #
+    # Subject: some text (#1234)
+    # Subject: some text (ggml/1234)
+    cat ggml-src.patch | sed -e 's/^Subject: \(.*\) (#\([0-9]*\))/Subject: \1 (ggml\/\2)/' > ggml-src.patch.tmp
+    mv ggml-src.patch.tmp ggml-src.patch
+
+    cat ggml-src.patch | sed -e 's/^\(.*\) (#\([0-9]*\))$/\1 (ggml\/\2)/' > ggml-src.patch.tmp
+    mv ggml-src.patch.tmp ggml-src.patch
+
+    # replace filenames:
+    #
+    # src/ggml.c                  -> ggml.c
+    # src/ggml-alloc.c            -> ggml-alloc.c
+    # src/ggml-backend-impl.h     -> ggml-backend-impl.h
+    # src/ggml-backend.c          -> ggml-backend.c
+    # src/ggml-cuda.cu            -> ggml-cuda.cu
+    # src/ggml-cuda.h             -> ggml-cuda.h
+    # src/ggml-impl.h             -> ggml-impl.h
+    # src/ggml-kompute.cpp        -> ggml-kompute.cpp
+    # src/ggml-kompute.h          -> ggml-kompute.h
+    # src/ggml-metal.h            -> ggml-metal.h
+    # src/ggml-metal.m            -> ggml-metal.m
+    # src/ggml-mpi.h              -> ggml-mpi.h
+    # src/ggml-mpi.c              -> ggml-mpi.c
+    # src/ggml-opencl.cpp         -> ggml-opencl.cpp
+    # src/ggml-opencl.h           -> ggml-opencl.h
+    # src/ggml-quants.c           -> ggml-quants.c
+    # src/ggml-quants.h           -> ggml-quants.h
+    # src/ggml-sycl.cpp           -> ggml-sycl.cpp
+    # src/ggml-sycl.h             -> ggml-sycl.h
+    # src/ggml-vulkan.cpp         -> ggml-vulkan.cpp
+    # src/ggml-vulkan.h           -> ggml-vulkan.h
+    # include/ggml/ggml.h         -> ggml.h
+    # include/ggml/ggml-alloc.h   -> ggml-alloc.h
+    # include/ggml/ggml-backend.h -> ggml-backend.h
+    #
+    # examples/common.h           -> examples/common.h
+    # examples/common.cpp         -> examples/common.cpp
+    # examples/common-ggml.h      -> examples/common-ggml.h
+    # examples/common-ggml.cpp    -> examples/common-ggml.cpp
+    #
+    # examples/whisper/whisper.h    -> whisper.h
+    # examples/whisper/whisper.cpp  -> whisper.cpp
+    # examples/whisper/main.cpp     -> examples/main/main.cpp
+    # examples/whisper/quantize.cpp -> examples/quantize/quantize.cpp
+
+    cat ggml-src.patch | sed \
+        -e 's/src\/ggml\.c/ggml.c/g' \
+        -e 's/src\/ggml-alloc\.c/ggml-alloc.c/g' \
+        -e 's/src\/ggml-backend-impl\.h/ggml-backend-impl.h/g' \
+        -e 's/src\/ggml-backend\.c/ggml-backend.c/g' \
+        -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \
+        -e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \
+        -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
+        -e 's/src\/ggml-kompute\.cpp/ggml-kompute.cpp/g' \
+        -e 's/src\/ggml-kompute\.h/ggml-kompute.h/g' \
+        -e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
+        -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
+        -e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
+        -e 's/src\/ggml-mpi\.c/ggml-mpi.c/g' \
+        -e 's/src\/ggml-opencl\.cpp/ggml-opencl.cpp/g' \
+        -e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
+        -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
+        -e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
+        -e 's/src\/ggml-sycl\.cpp/ggml-sycl.cpp/g' \
+        -e 's/src\/ggml-sycl\.h/ggml-sycl.h/g' \
+        -e 's/src\/ggml-vulkan\.cpp/ggml-vulkan.cpp/g' \
+        -e 's/src\/ggml-vulkan\.h/ggml-vulkan.h/g' \
+        -e 's/include\/ggml\/ggml\.h/ggml.h/g' \
+        -e 's/include\/ggml\/ggml-alloc\.h/ggml-alloc.h/g' \
+        -e 's/include\/ggml\/ggml-backend\.h/ggml-backend.h/g' \
+        -e 's/examples\/common\.h/examples\/common.h/g' \
+        -e 's/examples\/common\.cpp/examples\/common.cpp/g' \
+        -e 's/examples\/common-ggml\.h/examples\/common-ggml.h/g' \
+        -e 's/examples\/common-ggml\.cpp/examples\/common-ggml.cpp/g' \
+        -e 's/examples\/whisper\/whisper\.h/whisper.h/g' \
+        -e 's/examples\/whisper\/whisper\.cpp/whisper.cpp/g' \
+        -e 's/examples\/whisper\/main\.cpp/examples\/main\/main.cpp/g' \
+        -e 's/examples\/whisper\/quantize\.cpp/examples\/quantize\/quantize.cpp/g' \
+        > ggml-src.patch.tmp
+    mv ggml-src.patch.tmp ggml-src.patch
+
+    git am ggml-src.patch
+
+    rm -v $SRC_WHISPER/ggml-src.patch
+fi
+
+# update last commit
+cd $SRC_GGML
+git log -1 --format=%H > $SRC_WHISPER/extra/sync-ggml.last
+
+echo "Done"
+
+exit 0
--- a/extra/sync-ggml.last
+++ b/extra/sync-ggml.last
@ -0,0 +1 @@
+15438356acd7ad1b182c66272eb9625828f5ae7a
--- a/extra/sync-ggml.sh
+++ b/extra/sync-ggml.sh
@ -7,6 +7,8 @@ cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml-backend-impl.h
 cp -rpv ../ggml/src/ggml-backend.c      ./ggml-backend.c
 cp -rpv ../ggml/src/ggml-cuda.cu        ./ggml-cuda.cu
 cp -rpv ../ggml/src/ggml-cuda.h         ./ggml-cuda.h
+cp -rpv ../ggml/src/ggml-kompute.cpp    ./ggml-kompute.cpp
+cp -rpv ../ggml/src/ggml-kompute.h      ./ggml-kompute.h
 cp -rpv ../ggml/src/ggml-metal.h        ./ggml-metal.h
 cp -rpv ../ggml/src/ggml-metal.m        ./ggml-metal.m
 cp -rpv ../ggml/src/ggml-metal.metal    ./ggml-metal.metal
@ -16,6 +18,10 @@ cp -rpv ../ggml/src/ggml-opencl.cpp     ./ggml-opencl.cpp
 cp -rpv ../ggml/src/ggml-opencl.h       ./ggml-opencl.h
 cp -rpv ../ggml/src/ggml-quants.c       ./ggml-quants.c
 cp -rpv ../ggml/src/ggml-quants.h       ./ggml-quants.h
+cp -rpv ../ggml/src/ggml-sycl.cpp       ./ggml-sycl.cpp
+cp -rpv ../ggml/src/ggml-sycl.h         ./ggml-sycl.h
+cp -rpv ../ggml/src/ggml-vulkan.cpp     ./ggml-vulkan.cpp
+cp -rpv ../ggml/src/ggml-vulkan.h       ./ggml-vulkan.h

 cp -rpv ../ggml/include/ggml/ggml.h         ./ggml.h
 cp -rpv ../ggml/include/ggml/ggml-alloc.h   ./ggml-alloc.h
--- a/extra/sync-llama.sh
+++ b/extra/sync-llama.sh
@ -0,0 +1,5 @@
+#!/bin/bash
+
+cp -rpv ../llama.cpp/llama.h   ./examples/talk-llama/llama.h
+cp -rpv ../llama.cpp/llama.cpp ./examples/talk-llama/llama.cpp
+cp -rpv ../llama.cpp/unicode.h ./examples/talk-llama/unicode.h
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
--- a/ggml-alloc.h
+++ b/ggml-alloc.h
@ -6,86 +6,62 @@
 extern "C" {
 #endif

-struct ggml_backend;
-struct ggml_backend_buffer;
-struct ggml_backend_buffer_type;
-
-//
-// Legacy API
-//
-
-typedef struct ggml_allocr * ggml_allocr_t;
-
-// initialize allocator for use with CPU backend only
-GGML_API ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment);
-GGML_API ggml_allocr_t ggml_allocr_new_measure(size_t alignment);
-
-// initialize allocator for use with ggml-backend
-GGML_API ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
-GGML_API ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
-GGML_API ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend);
-
-GGML_API struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc);
-
-// tell the allocator to parse nodes following the order described in the list
-// you should call this if your graph are optimized to execute out-of-order
-GGML_API void   ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n);
-
-GGML_API void   ggml_allocr_free       (ggml_allocr_t alloc);
-GGML_API bool   ggml_allocr_is_measure (ggml_allocr_t alloc);
-GGML_API void   ggml_allocr_reset      (ggml_allocr_t alloc);
-GGML_API void   ggml_allocr_alloc      (ggml_allocr_t alloc, struct ggml_tensor * tensor);
-GGML_API size_t ggml_allocr_max_size   (ggml_allocr_t alloc);
-
-GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph);
-
-//
-// ggml-backend v2 API
-//
-
-// Separate tensor and graph allocator objects
-// This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
-// The original API is kept as a wrapper around the new API
+typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
+typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+typedef struct ggml_backend * ggml_backend_t;

 // Tensor allocator
 typedef struct ggml_tallocr * ggml_tallocr_t;

-GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
-GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
-GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
-
-GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
-
-GGML_API void   ggml_tallocr_free       (ggml_tallocr_t talloc);
-GGML_API bool   ggml_tallocr_is_measure (ggml_tallocr_t talloc);
-GGML_API void   ggml_tallocr_reset      (ggml_tallocr_t talloc);
-GGML_API void   ggml_tallocr_alloc      (ggml_tallocr_t talloc, struct ggml_tensor * tensor);
-GGML_API size_t ggml_tallocr_max_size   (ggml_tallocr_t talloc);
-
+GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer);
+GGML_API void           ggml_tallocr_free(ggml_tallocr_t talloc);
+GGML_API void           ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);

 // Graph allocator
+/*
+  Example usage:
+    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());
+
+    // optional: create a worst-case graph and reserve the buffers to avoid reallocations
+    ggml_gallocr_reserve(galloc, build_graph(max_batch));
+
+    // allocate the graph
+    struct ggml_cgraph * graph = build_graph(batch);
+    ggml_gallocr_alloc_graph(galloc, graph);
+
+    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
+
+    // evaluate the graph
+    ggml_backend_graph_compute(backend, graph);
+*/
+
+// special tensor flags for use with the graph allocator:
+//   ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
+//   ggml_set_output(): output tensors are never freed and never overwritten
+
 typedef struct ggml_gallocr * ggml_gallocr_t;

-GGML_API ggml_gallocr_t ggml_gallocr_new(void);
-GGML_API void   ggml_gallocr_free(ggml_gallocr_t galloc);
+GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
+GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
+GGML_API void           ggml_gallocr_free(ggml_gallocr_t galloc);

-GGML_API void   ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n);
-GGML_API size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph);
+// pre-allocate buffers from a measure graph - does not allocate or modify the graph
+// call with a worst-case graph to avoid buffer reallocations
+// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
+// returns false if the buffer allocation failed
+GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids);

-// Allocate tensors from the allocators given by the hash table
-GGML_API void   ggml_gallocr_alloc_graph_n(
-                    ggml_gallocr_t galloc,
-                    struct ggml_cgraph * graph,
-                    struct ggml_hash_set hash_set,
-                    ggml_tallocr_t * hash_node_talloc);
+// automatic reallocation if the topology changes when using a single buffer
+// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
+GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);

+GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);

 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft);
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);

 #ifdef  __cplusplus
 }
--- a/ggml-backend-impl.h
+++ b/ggml-backend-impl.h
@ -16,10 +16,15 @@ extern "C" {
    typedef void * ggml_backend_buffer_type_context_t;

    struct ggml_backend_buffer_type_i {
-        ggml_backend_buffer_t (*alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
-        size_t                (*get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t                (*get_alloc_size)  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool                  (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
+        ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
+        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
+        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft); // allocation max size
+        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
+        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        // check if tensor data is in host memory
+        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
+        bool                  (*GGML_CALL is_host)         (ggml_backend_buffer_type_t buft);
    };

    struct ggml_backend_buffer_type {
@ -31,15 +36,15 @@ extern "C" {
    typedef void * ggml_backend_buffer_context_t;

    struct ggml_backend_buffer_i {
-        void     (*free_buffer)(ggml_backend_buffer_t buffer);
-        //void     (*reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
-        void *   (*get_base)   (ggml_backend_buffer_t buffer);
-        void     (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void     (*set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void     (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
-        void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void (*cpy_tensor_to)  (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        const char * (*GGML_CALL get_name)   (ggml_backend_buffer_t buffer);
+        void         (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
+        void *       (*GGML_CALL get_base)   (ggml_backend_buffer_t buffer);
+        void         (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        void         (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void         (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+        bool         (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
+        void         (*GGML_CALL clear)      (ggml_backend_buffer_t buffer, uint8_t value);
+        void         (*GGML_CALL reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
    };

    struct ggml_backend_buffer {
@ -47,14 +52,22 @@ extern "C" {
        ggml_backend_buffer_type_t    buft;
        ggml_backend_buffer_context_t context;
        size_t size;
+        enum ggml_backend_buffer_usage usage;
    };

-    ggml_backend_buffer_t ggml_backend_buffer_init(
+    GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
                   ggml_backend_buffer_type_t      buft,
            struct ggml_backend_buffer_i           iface,
                   ggml_backend_buffer_context_t   context,
                   size_t                          size);

+    // do not use directly, use ggml_backend_tensor_copy instead
+    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
+
+    // buffer that contains a collection of buffers
+    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+    GGML_CALL bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+    GGML_CALL void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);

    //
    // Backend
@ -63,33 +76,31 @@ extern "C" {
    typedef void * ggml_backend_context_t;

    struct ggml_backend_i {
-        const char * (*get_name)(ggml_backend_t backend);
+        const char * (*GGML_CALL get_name)(ggml_backend_t backend);

-        void (*free)(ggml_backend_t backend);
+        void (*GGML_CALL free)(ggml_backend_t backend);

        // buffer allocation
-        ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
+        ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);

-        // (optional) asynchroneous tensor data access
-        void (*set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+        // (optional) asynchronous tensor data access
+        void (*GGML_CALL set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);

-        // (optional) asynchroneous tensor copy
-        void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void (*cpy_tensor_to_async)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-
-        void (*synchronize)     (ggml_backend_t backend);
+        // (optional) complete all pending operations
+        void (*GGML_CALL synchronize)(ggml_backend_t backend);

        // compute graph with a plan
-        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-        void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
+        void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        void                      (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

-        // compute graph without a plan
-        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        // compute graph without a plan (async)
+        bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

        // check if the backend supports an operation
-        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
    };

    struct ggml_backend {
@ -98,14 +109,13 @@ extern "C" {
        ggml_backend_context_t context;
    };

-
    //
    // Backend registry
    //

-    typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
+    typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);

-    void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
+    GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);

 #ifdef  __cplusplus
 }
--- a/ggml-backend.c
+++ b/ggml-backend.c
--- a/ggml-backend.h
+++ b/ggml-backend.h
@ -17,19 +17,33 @@ extern "C" {
    //

    // buffer type
-    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
-    GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
-    GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
-    GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+    GGML_API           const char *          ggml_backend_buft_name            (ggml_backend_buffer_type_t buft);
+    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer    (ggml_backend_buffer_type_t buft, size_t size);
+    GGML_API           size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
+    GGML_API           size_t                ggml_backend_buft_get_max_size    (ggml_backend_buffer_type_t buft);
+    GGML_API GGML_CALL size_t                ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+    GGML_API           bool                  ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+    GGML_API           bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);

    // buffer
-    GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
-    GGML_API void * ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
-    GGML_API size_t ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
-    GGML_API void   ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
-    GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
+    enum ggml_backend_buffer_usage {
+        GGML_BACKEND_BUFFER_USAGE_ANY = 0,
+        GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
+    };
+
+    GGML_API           const char *               ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
+    GGML_API           void                       ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
+    GGML_API           void *                     ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
+    GGML_API           size_t                     ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
+    GGML_API GGML_CALL void                       ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API           size_t                     ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API           size_t                     ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
+    GGML_API           size_t                     ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API           void                       ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
+    GGML_API           bool                       ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
+    GGML_API           void                       ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+    GGML_API           ggml_backend_buffer_type_t ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
+    GGML_API           void                       ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);

    //
    // Backend
@ -42,12 +56,13 @@ extern "C" {
    GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
    GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
    GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);
+    GGML_API size_t                     ggml_backend_get_max_size(ggml_backend_t backend);

    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);

-    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+    GGML_API GGML_CALL void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);

    GGML_API void ggml_backend_synchronize(ggml_backend_t backend);

@ -55,7 +70,7 @@ extern "C" {

    GGML_API void ggml_backend_graph_plan_free   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
    GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API void ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API bool ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
    GGML_API bool ggml_backend_supports_op       (ggml_backend_t backend, const struct ggml_tensor * op);

    // tensor copy between different backends
@ -68,13 +83,18 @@ extern "C" {

    GGML_API ggml_backend_t ggml_backend_cpu_init(void);

-    GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
-    GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
+    GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
+    GGML_API           void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
+    GGML_API           void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);

    // Create a backend buffer from an existing pointer
-    GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);

-    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+    GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+
+#ifdef GGML_USE_CPU_HBM
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+#endif

    //
    // Backend registry
@ -110,11 +130,7 @@ extern "C" {

        // in build_graph:
        build_graph(...) {
-            // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer)
-            alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu);
-            ggml_allocr_alloc(alloc_cpu, tensor);
-
-            // manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
+            // manually assign nodes to a backend (optional, should not be needed in most cases)
            struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
            ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
        }
@ -132,24 +148,35 @@ extern "C" {
    struct ggml_backend_sched;
    typedef struct ggml_backend_sched * ggml_backend_sched_t;

+    // when ask == true, the scheduler wants to know if the user wants to observe this node
+    // this allows the scheduler to batch nodes together in order to evaluate them in a single call
+    //
+    // when ask == false, the scheduler is passing the node tensor to the user for observation
+    // if the user returns false, the scheduler will cancel the graph compute
+    //
+    typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
+
    // Initialize a backend scheduler
-    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends);
-
-    GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
-
+    GGML_API ggml_backend_sched_t  ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
+    GGML_API void                  ggml_backend_sched_free(ggml_backend_sched_t sched);
    // Initialize backend buffers from a measure graph
-    GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+    GGML_API bool                  ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+    // Get the number of splits of the last graph
+    GGML_API int                   ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);

-    GGML_API ggml_tallocr_t        ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
-    GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API size_t                ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);

-    GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+    GGML_API void                  ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+    GGML_API ggml_backend_t        ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);

-    // Allocate a graph on the backend scheduler
-    GGML_API void ggml_backend_sched_graph_compute(
-            ggml_backend_sched_t sched,
-            struct ggml_cgraph * graph);
+    // Allocate and compute graph on the backend scheduler
+    GGML_API bool                  ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);

+    // Reset all assignments and allocators - must be called before changing the node backends
+    GGML_API void                  ggml_backend_sched_reset(ggml_backend_sched_t sched);
+
+    // Set a callback to be called for each resulting node during graph compute
+    GGML_API void                  ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);

    //
    // Utils
@ -166,10 +193,10 @@ extern "C" {
    GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
    GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);

-    typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+    typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

    // Compare the output of two backends
-    GGML_API void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
+    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);

    // Tensor initialization
    GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@ -18,46 +18,34 @@ extern "C" {
 #define GGML_CUDA_MAX_DEVICES       16

 // Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
-GGML_API void   ggml_init_cublas(void);
+GGML_API GGML_CALL void   ggml_init_cublas(void);

 // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
-GGML_API bool   ggml_cublas_loaded(void);
+GGML_API GGML_CALL bool   ggml_cublas_loaded(void);

-GGML_API void * ggml_cuda_host_malloc(size_t size);
-GGML_API void   ggml_cuda_host_free(void * ptr);
+GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
+GGML_API GGML_CALL void   ggml_cuda_host_free(void * ptr);

-GGML_API bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-GGML_API void   ggml_cuda_set_tensor_split(const float * tensor_split);
-GGML_API void   ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
-GGML_API void   ggml_cuda_free_data(struct ggml_tensor * tensor);
+GGML_API GGML_CALL bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API GGML_CALL bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);

-GGML_API void   ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
-GGML_API void   ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
-GGML_API void   ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
-
-GGML_API void   ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
-GGML_API void   ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
-GGML_API void   ggml_cuda_copy_to_device(struct ggml_tensor * tensor);
-
-GGML_API void   ggml_cuda_set_main_device(int main_device);
-GGML_API void   ggml_cuda_set_mul_mat_q(bool mul_mat_q);
-GGML_API void   ggml_cuda_set_scratch_size(size_t scratch_size);
-GGML_API void   ggml_cuda_free_scratch(void);
-GGML_API bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-
-GGML_API int    ggml_cuda_get_device_count(void);
-GGML_API void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);
+GGML_API GGML_CALL int    ggml_cuda_get_device_count(void);
+GGML_API GGML_CALL void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);

 // backend API
-GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
+GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);

-GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
-GGML_API int  ggml_backend_cuda_get_device(ggml_backend_t backend);
+GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);

-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+// split tensor buffer that splits matrices by rows across multiple devices
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
+// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

-// pinned host buffer for use with CPU backend for faster copies between CPU and GPU
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
+GGML_API GGML_CALL int  ggml_backend_cuda_get_device_count(void);
+GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
+GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);

 #ifdef  __cplusplus
 }
--- a/ggml-impl.h
+++ b/ggml-impl.h
@ -5,6 +5,7 @@
 // GGML internal header

 #include <assert.h>
+#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
 #include <stddef.h>
 #include <stdbool.h>
 #include <string.h> // memcpy
@ -18,6 +19,7 @@ extern "C" {
 // fall back to the _Static_assert C11 keyword.
 // if C99 - static_assert is noop
 // ref: https://stackoverflow.com/a/53923785/4039976
+#ifndef __cplusplus
 #ifndef static_assert
 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
 #define static_assert(cond, msg) _Static_assert(cond, msg)
@ -25,6 +27,7 @@ extern "C" {
 #define static_assert(cond, msg) struct global_scope_noop_trick
 #endif
 #endif
+#endif

 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
 #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
@ -227,6 +230,8 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 #define GGML_HASHTABLE_FULL ((size_t)-1)
 #define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)

+struct ggml_hash_set ggml_hash_set_new(size_t size);
+
 bool   ggml_hash_contains      (const struct ggml_hash_set hash_set, struct ggml_tensor * key);

 // returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
--- a/ggml-kompute.h
+++ b/ggml-kompute.h
@ -0,0 +1,46 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ggml_vk_device {
+    int index;
+    int type; // same as VkPhysicalDeviceType
+    size_t heapSize;
+    const char * name;
+    const char * vendor;
+    int subgroupSize;
+    uint64_t bufferAlignment;
+    uint64_t maxAlloc;
+};
+
+struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
+bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
+bool ggml_vk_has_vulkan(void);
+bool ggml_vk_has_device(void);
+struct ggml_vk_device ggml_vk_current_device(void);
+
+//
+// backend API
+//
+
+// forward declaration
+typedef struct ggml_backend * ggml_backend_t;
+
+GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
+
+GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
+
+GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
+
+#ifdef __cplusplus
+}
+#endif
--- a/ggml-metal.h
+++ b/ggml-metal.h
@ -27,7 +27,6 @@

 // max memory buffers that can be mapped to the device
 #define GGML_METAL_MAX_BUFFERS 64
-#define GGML_METAL_MAX_COMMAND_BUFFERS 32

 struct ggml_tensor;
 struct ggml_cgraph;
@ -36,76 +35,31 @@ struct ggml_cgraph;
 extern "C" {
 #endif

-//
-// internal API
-// temporary exposed to user-code
-//
-
-struct ggml_metal_context;
-
-void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
-
-// number of command buffers to use
-struct ggml_metal_context * ggml_metal_init(int n_cb);
-void ggml_metal_free(struct ggml_metal_context * ctx);
-
-void * ggml_metal_host_malloc(size_t n);
-void   ggml_metal_host_free  (void * data);
-
-// set the number of command buffers to use
-void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
-
-// creates a mapping between a host memory buffer and a device memory buffer
-// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
-// - the mapping is used during computation to determine the arguments of the compute kernels
-// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
-// - max_size specifies the maximum size of a tensor and is used to create shared views such
-//   that it is guaranteed that the tensor will fit in at least one of the views
-//
-bool ggml_metal_add_buffer(
-        struct ggml_metal_context * ctx,
-                       const char * name,
-                             void * data,
-                           size_t   size,
-                           size_t   max_size);
-
-// set data from host memory into the device
-void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
-// get data from the device into host memory
-void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
-
-// try to find operations that can be run concurrently in the graph
-// you should run it again if the topology of your graph changes
-void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
-
-// if the graph has been optimized for concurrently dispatch, return length of the concur_list if optimized
-int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
-
-// output the concur_list for ggml_alloc
-int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
-
-// same as ggml_graph_compute but uses Metal
-// creates gf->n_threads command buffers in parallel
-void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
-
 //
 // backend API
 // user-code should use only these functions
 //

+GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+
 GGML_API ggml_backend_t ggml_backend_metal_init(void);

 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);

+GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
+
 GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
-GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);

 // helper to check if the device supports a specific family
 // ideally, the user code should be doing these checks
 // ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
 GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);

+// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
+GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
+
 #ifdef __cplusplus
 }
 #endif
--- a/ggml-metal.m
+++ b/ggml-metal.m
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@ -1,5 +1,6 @@
 #include "ggml.h"
 #include "ggml-opencl.h"
+#include "ggml-backend-impl.h"

 #include <array>
 #include <atomic>
@ -10,7 +11,7 @@
 #include <sstream>
 #include <vector>

-#define CL_TARGET_OPENCL_VERSION 110
+#define CL_TARGET_OPENCL_VERSION 120
 #include <clblast.h>

 #if defined(_MSC_VER)
@ -713,7 +714,6 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
        dst[row] = tmp[0];
    }
 }
-
 );


@ -783,6 +783,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
        dst[row] = tmp[0];
    }
 }
+
 );


@ -798,6 +799,18 @@ __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y
 }
 );

+std::string add_template = MULTILINE_QUOTE(
+__kernel void add_f32(__global float * x, const int x_offset, __global float * y, const int y_offset, __global float * dst, const int dst_offset, const int ky) {
+    const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
+
+    if (i >= get_global_size(0)) {
+        return;
+    }
+
+    dst[dst_offset + i] = x[x_offset + i] + y[y_offset + i%ky];
+}
+);
+
 #define CL_CHECK(err)                                               \
    do {                                                            \
        cl_int err_ = (err);                                        \
@ -877,6 +890,7 @@ static std::string generate_kernels() {
        }
        src << mul_kernel << '\n';
    }
+    src << add_template << '\n';

    return src.str();
 }
@ -892,6 +906,7 @@ static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl,
 static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl;
 static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl;
 static cl_kernel mul_f32_cl;
+static cl_kernel add_f32_cl;
 static bool fp16_support;

 static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
@ -929,6 +944,12 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
 }

 void ggml_cl_init(void) {
+    static bool initialized = false;
+    if (initialized) {
+        return;
+    }
+    initialized = true;
+
    cl_int err;

    struct cl_device;
@ -1093,9 +1114,10 @@ void ggml_cl_init(void) {
    char *ext_buffer = (char *)alloca(ext_str_size + 1);
    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
    ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
+    // Disabled due to faulty outputs
    // Check if ext_buffer contains cl_khr_fp16
-    fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
-    fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
+    fp16_support = false;  // strstr(ext_buffer, "cl_khr_fp16") != NULL;
+    // fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");

    cl_context_properties properties[] = {
        (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
@ -1143,6 +1165,8 @@ void ggml_cl_init(void) {

    // mul kernel
    CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
+
+    CL_CHECK((add_f32_cl = clCreateKernel(program, "add_f32", &err), err));
 }

 static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
@ -1451,6 +1475,70 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src
    ggml_cl_mul_f32(src0, src1, dst);
 }

+static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+    const int nb2  = dst->nb[2];
+    const int nb3  = dst->nb[3];
+    size_t x_size;
+    size_t d_size;
+
+    cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
+    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
+    cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst
+
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            cl_event ev;
+
+            // copy src0 to device
+            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));
+
+            const int64_t i13 = i03%ne13;
+            const int64_t i12 = i02%ne12;
+            const int i1 = i13*ne12*ne11 + i12*ne11;
+
+            cl_int x_offset = 0;
+            cl_int y_offset = i1*ne10;
+            cl_int d_offset = 0;
+
+            size_t global = ne00 * ne01;
+            cl_int ky = ne10 * ne11;
+
+            CL_CHECK(clSetKernelArg(add_f32_cl, 0, sizeof(cl_mem), &d_X));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 1, sizeof(cl_int), &x_offset));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 2, sizeof(cl_mem), &d_Y));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 3, sizeof(cl_int), &y_offset));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 4, sizeof(cl_mem), &d_D));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 5, sizeof(cl_int), &d_offset));
+            CL_CHECK(clSetKernelArg(add_f32_cl, 6, sizeof(cl_int), &ky));
+            CL_CHECK(clEnqueueNDRangeKernel(queue, add_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
+
+            CL_CHECK(clReleaseEvent(ev));
+            CL_CHECK(clFinish(queue));
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
+        }
+    }
+    ggml_cl_pool_free(d_X, x_size);
+    ggml_cl_pool_free(d_D, d_size);
+}
+
+void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    ggml_cl_add_f32(src0, src1, dst);
+}
+
 static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
@ -1483,8 +1571,8 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
    } else {
        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
    }
-    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
-    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
+    cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_D =  dst->backend == GGML_BACKEND_GPU ? (cl_mem)  dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

    size_t x_offset = 0;

@ -1501,7 +1589,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr

                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
                    // copy src1 to device
-                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
+                    if (src1->backend == GGML_BACKEND_CPU) {
+                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
+                    }

                    CL_CHECK(clFinish(queue));

@ -1522,8 +1612,10 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
                    }

                    // copy dst to host
-                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
+                    if (dst->backend == GGML_BACKEND_CPU) {
+                        float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                        CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
+                    }
                }
            }
        }
@ -1532,8 +1624,12 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
    if (src0->backend != GGML_BACKEND_GPU) {
        ggml_cl_pool_free(d_X, x_size);
    }
-    ggml_cl_pool_free(d_Y, y_size);
-    ggml_cl_pool_free(d_D, d_size);
+    if (src1->backend != GGML_BACKEND_GPU) {
+        ggml_cl_pool_free(d_Y, y_size);
+    }
+    if (dst->backend != GGML_BACKEND_GPU) {
+        ggml_cl_pool_free(d_D, d_size);
+    }
 }

 static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
@ -1598,6 +1694,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
                }

+                // FIXME: convert on device
+
                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
                    // convert src1 to fp16
                    // TODO: use multiple threads
@ -1643,11 +1741,13 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
                    }

                    // copy dst to host, then convert to float
-                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
-
-                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-
-                    ggml_fp16_to_fp32_row(tmp, d, d_ne);
+                    if (dst->backend == GGML_BACKEND_CPU) {
+                        CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
+                        float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                        ggml_fp16_to_fp32_row(tmp, d, d_ne);
+                    } else {
+                        // FIXME: convert dst to fp32 on device
+                    }
                }
            }
        }
@ -1801,7 +1901,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
 }


-bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst) {
    const int64_t ne10 = src1->ne[0];

    const int64_t ne0 = dst->ne[0];
@ -1895,3 +1995,302 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
    tensor->extra = dst;
    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 }
+
+// ggml-backend
+
+// buffer
+
+struct ggml_backend_opencl_buffer_context {
+    ~ggml_backend_opencl_buffer_context() {
+        if (buffer) {
+            clReleaseMemObject(buffer);
+        }
+        for (auto * sub_buffer : sub_buffers) {
+            clReleaseMemObject(sub_buffer);
+        }
+    }
+
+    cl_mem buffer;
+    std::vector<cl_mem> sub_buffers;
+};
+
+static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
+
+static const char * ggml_backend_opencl_buffer_get_name(ggml_backend_buffer_t buffer) {
+    return "OpenCL";
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+    delete ctx;
+}
+
+static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
+    return cl_ptr_base;
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+    if (tensor->view_src != NULL && tensor->view_offs == 0) {
+        tensor->extra = tensor->view_src->extra;
+    } else {
+        ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+        cl_buffer_region region = {(size_t)((char *)tensor->data - (char *)cl_ptr_base), ggml_nbytes(tensor)};
+        cl_int err;
+        cl_mem sub_buffer = clCreateSubBuffer(ctx->buffer, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
+        CL_CHECK(err);
+        ctx->sub_buffers.push_back(sub_buffer);
+        tensor->extra = sub_buffer;
+    }
+    tensor->backend = GGML_BACKEND_GPU;
+}
+
+static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+    cl_mem tensor_buffer = (cl_mem) tensor->extra;
+    CL_CHECK(clEnqueueWriteBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
+    CL_CHECK(clFinish(queue));
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    cl_mem tensor_buffer = (cl_mem) tensor->extra;
+    CL_CHECK(clEnqueueReadBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
+    CL_CHECK(clFinish(queue));
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+    CL_CHECK(clEnqueueFillBuffer(queue, ctx->buffer, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL));
+    CL_CHECK(clFinish(queue));
+}
+
+static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) {
+    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
+    for (auto * sub_buffer : ctx->sub_buffers) {
+        clReleaseMemObject(sub_buffer);
+    }
+    ctx->sub_buffers.clear();
+}
+
+static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = {
+    /* .get_name        = */ ggml_backend_opencl_buffer_get_name,
+    /* .free_buffer     = */ ggml_backend_opencl_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_opencl_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_opencl_buffer_init_tensor,
+    /* .set_tensor      = */ ggml_backend_opencl_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_opencl_buffer_get_tensor,
+    /* .cpy_tensor      = */ NULL,
+    /* .clear           = */ ggml_backend_opencl_buffer_clear,
+    /* .reset           = */ ggml_backend_opencl_buffer_reset,
+};
+
+// buffer type
+
+static const char * ggml_backend_opencl_buffer_type_name(ggml_backend_buffer_type_t buffer_type) {
+    return "OpenCL";
+
+    GGML_UNUSED(buffer_type);
+}
+
+static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) {
+    ggml_cl_init();
+
+    cl_int err;
+    cl_mem mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0);
+        return nullptr;
+    }
+
+    ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context{mem, {}};
+
+    return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size);
+}
+
+static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
+    // FIXME: not thread safe, device may not be initialized yet
+    static cl_uint alignment = -1;
+    if (alignment == (cl_uint)-1) {
+        ggml_cl_init();
+        clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
+    }
+    return alignment;
+
+    GGML_UNUSED(buffer_type);
+}
+
+static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
+    static size_t max_size = -1;
+    if (max_size == (size_t)-1) {
+        ggml_cl_init();
+        clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_size, NULL);
+    }
+    return max_size;
+}
+
+static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) {
+    //return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend
+    return ggml_backend_is_cpu(backend);
+
+    GGML_UNUSED(buffer_type);
+}
+
+static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
+    /* .get_name         = */ ggml_backend_opencl_buffer_type_name,
+    /* .alloc_buffer     = */ ggml_backend_opencl_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_opencl_buffer_type_get_alignment,
+    /* .get_max_size     = */ ggml_backend_opencl_buffer_type_get_max_size,
+    /* .get_alloc_size   = */ NULL,
+    /* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
+    /* .is_host          = */ NULL,
+};
+
+
+ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() {
+    static ggml_backend_buffer_type buffer_type = {
+        /* .iface   = */ ggml_backend_opencl_buffer_type_interface,
+        /* .context = */ nullptr,
+    };
+
+    return &buffer_type;
+}
+
+#if 0
+// host buffer type
+
+static const char * ggml_backend_opencl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+    return "CL_Host";
+
+    GGML_UNUSED(buft);
+}
+
+static const char * ggml_backend_opencl_host_buffer_name(ggml_backend_buffer_t buffer) {
+    return "CL_Host";
+
+    GGML_UNUSED(buffer);
+}
+
+static void ggml_backend_opencl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_cl_host_free(buffer->context);
+}
+
+static ggml_backend_buffer_t ggml_backend_opencl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * ptr = ggml_cl_host_malloc(size);
+
+    if (ptr == nullptr) {
+        // fallback to cpu buffer
+        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
+    }
+
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.get_name = ggml_backend_opencl_host_buffer_name;
+    buffer->iface.free_buffer = ggml_backend_opencl_host_buffer_free_buffer;
+
+    return buffer;
+}
+
+ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_opencl_buffer_type_host = {
+        /* .iface    = */ {
+            /* .get_name         = */ ggml_backend_opencl_host_buffer_type_name,
+            /* .alloc_buffer     = */ ggml_backend_opencl_host_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+        },
+        /* .context  = */ nullptr,
+    };
+
+    return &ggml_backend_opencl_buffer_type_host;
+}
+
+// backend
+
+static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
+    return "OpenCL";
+
+    GGML_UNUSED(backend);
+}
+
+static void ggml_backend_opencl_free(ggml_backend_t backend) {
+    GGML_UNUSED(backend);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(ggml_backend_t backend) {
+    return ggml_backend_opencl_buffer_type();
+
+    GGML_UNUSED(backend);
+}
+
+static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
+    for (int i = 0; i < graph->n_nodes; ++i) {
+        ggml_tensor * node = graph->nodes[i];
+        switch (node->op) {
+            case GGML_OP_MUL_MAT:
+                ggml_cl_mul_mat(node->src[0], node->src[1], node, nullptr, 0);
+                break;
+            case GGML_OP_MUL:
+                ggml_cl_mul(node->src[0], node->src[1], node);
+                break;
+            default:
+                GGML_ASSERT(false);
+        }
+    }
+
+    return true;
+
+    GGML_UNUSED(backend);
+}
+
+static bool ggml_backend_opencl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_MUL_MAT:
+            return ggml_cl_can_mul_mat(op->src[0], op->src[1], op);
+        case GGML_OP_MUL:
+            // return ggml_can_repeat_rows(op->src[1], op->src[0]);
+            return true;
+        default:
+            return false;
+    }
+
+    GGML_UNUSED(backend);
+}
+
+static ggml_backend_i opencl_backend_i = {
+    /* .get_name                = */ ggml_backend_opencl_name,
+    /* .free                    = */ ggml_backend_opencl_free,
+    /* .get_default_buffer_type = */ ggml_backend_opencl_get_default_buffer_type,
+    /* .set_tensor_async        = */ NULL,
+    /* .get_tensor_async        = */ NULL,
+    /* .cpy_tensor_from_async   = */ NULL,
+    /* .cpy_tensor_to_async     = */ NULL,
+    /* .synchronize             = */ NULL,
+    /* .graph_plan_create       = */ NULL,
+    /* .graph_plan_free         = */ NULL,
+    /* .graph_plan_compute      = */ NULL,
+    /* .graph_compute           = */ ggml_backend_opencl_graph_compute,
+    /* .supports_op             = */ ggml_backend_opencl_supports_op,
+};
+
+ggml_backend_t ggml_backend_opencl_init() {
+    ggml_backend_t backend = new ggml_backend {
+        /* .interface = */ opencl_backend_i,
+        /* .context   = */ nullptr
+    };
+
+    return backend;
+}
+
+bool ggml_backend_is_opencl(ggml_backend_t backend) {
+    return backend && backend->iface.get_name == ggml_backend_opencl_name;
+}
+#endif
--- a/ggml-opencl.h
+++ b/ggml-opencl.h
@ -1,24 +1,35 @@
 #pragma once

 #include "ggml.h"
+#include "ggml-backend.h"

 #ifdef  __cplusplus
 extern "C" {
 #endif

-void ggml_cl_init(void);
+GGML_API void ggml_cl_init(void);

-void   ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-bool   ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-void   ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
+GGML_API void   ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API void   ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API bool   ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
+GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API void   ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

-void * ggml_cl_host_malloc(size_t size);
-void   ggml_cl_host_free(void * ptr);
+// GGML_API void * ggml_cl_host_malloc(size_t size);
+// GGML_API void   ggml_cl_host_free(void * ptr);

-void ggml_cl_free_data(const struct ggml_tensor* tensor);
+GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);

-void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
+GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
+
+// backend API
+
+// GGML_API ggml_backend_t ggml_backend_opencl_init(void);
+
+// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);
+
+GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
+// GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);

 #ifdef  __cplusplus
 }
--- a/ggml-quants.c
+++ b/ggml-quants.c
--- a/ggml-quants.h
+++ b/ggml-quants.h
@ -70,7 +70,7 @@ static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block s
 // 2-bit quantization
 // weight is represented as x = a * q + b
 // 16 blocks of 16 elements each
-// Effectively 2.5625 bits per weight
+// Effectively 2.625 bits per weight
 typedef struct {
    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
    uint8_t qs[QK_K/4];      // quants
@ -165,60 +165,123 @@ typedef struct {
 } block_q8_K;
 static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");

+// (Almost) "true" 2-bit quantization.
+// Due to the need to use blocks as per ggml design, it ends up using
+// 2.0625 bpw because of the 16-bit scale for each block of 256.
+typedef struct {
+    ggml_fp16_t d;
+    uint16_t qs[QK_K/8];
+} block_iq2_xxs;
+static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
+
+// 2.3125 bpw quants
+typedef struct {
+    ggml_fp16_t d;
+    uint16_t qs[QK_K/8];
+    uint8_t  scales[QK_K/32];
+} block_iq2_xs;
+static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
+
+// (Almost) "true" 3-bit quantization.
+// Due to the need to use blocks as per ggml design, it ends up using
+// 3.0625 bpw because of the 16-bit scale for each block of 256.
+typedef struct {
+    ggml_fp16_t d;
+    uint8_t qs[3*QK_K/8];
+} block_iq3_xxs;
+static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
+
+#ifdef __cplusplus
+extern "C" {
+#endif

 // Quantization
-void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
-void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
-void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
-void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
-void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
-void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);
+void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k);
+void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k);
+void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k);
+void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k);
+void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k);
+void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k);

-void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
-void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
-void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
-void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
-void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
-void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
+void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k);
+void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k);
+void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k);
+void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
+void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
+void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
+void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);

-void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
-void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
-void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
-void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
-void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
-void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);
+void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);

-void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
-void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);

 // Dequantization
-void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
-void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
-void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
-void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
-void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
-//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);
+void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);

-void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
-void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
-void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
-void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
-void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
-void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);
+void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_iq2_xs (const block_iq2_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);

 // Dot product
-void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
+
+//
+// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
+//
+size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q2_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q3_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q4_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q5_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q6_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q4_0   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q4_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q5_0   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+size_t quantize_q5_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+
+void iq2xs_init_impl(int grid_size);
+void iq2xs_free_impl(int grid_size);
+void iq3xs_init_impl(int grid_size);
+void iq3xs_free_impl(int grid_size);
+
+#ifdef __cplusplus
+}
+#endif

-void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
-void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
--- a/ggml-sycl.h
+++ b/ggml-sycl.h
@ -0,0 +1,29 @@
+//
+//  MIT license
+//  Copyright (C) 2024 Intel Corporation
+//  SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#define GGML_SYCL_MAX_DEVICES       16
+#define GGML_SYCL_NAME "SYCL"
+
+GGML_API void   ggml_init_sycl(void);
+GGML_API bool   ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
+GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
+GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
+GGML_API void   ggml_backend_sycl_print_sycl_devices(void);
+GGML_API GGML_CALL void   ggml_sycl_get_gpu_list(int *id_list, int max_len);
+GGML_API GGML_CALL void   ggml_sycl_get_device_description(int device, char *description, size_t description_size);
+#ifdef  __cplusplus
+}
+#endif
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
--- a/ggml-vulkan.h
+++ b/ggml-vulkan.h
@ -0,0 +1,39 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#define GGML_VK_NAME "Vulkan"
+#define GGML_VK_MAX_DEVICES 16
+
+GGML_API void ggml_vk_init_cpu_assist(void);
+
+GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
+GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
+GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
+GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+#ifdef GGML_VULKAN_CHECK_RESULTS
+void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+#endif
+GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
+GGML_API void ggml_vk_free_cpu_assist(void);
+
+// backend API
+GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
+
+GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
+GGML_API GGML_CALL int  ggml_backend_vk_get_device_count(void);
+GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
+GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
+
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
+// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@ -187,6 +187,16 @@
 #    define GGML_API
 #endif

+#ifdef GGML_MULTIPLATFORM
+#    if defined(_WIN32)
+#        define GGML_CALL
+#    else
+#        define GGML_CALL __attribute__((__ms_abi__))
+#    endif
+#else
+#    define GGML_CALL
+#endif
+
 // TODO: support for clang
 #ifdef __GNUC__
 #    define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
@ -218,7 +228,9 @@
 #define GGML_MAX_PARAMS         2048
 #define GGML_MAX_CONTEXTS       64
 #define GGML_MAX_SRC            10
+#ifndef GGML_MAX_NAME
 #define GGML_MAX_NAME           64
+#endif
 #define GGML_MAX_OP_PARAMS      64
 #define GGML_DEFAULT_N_THREADS  4
 #define GGML_DEFAULT_GRAPH_SIZE 2048
@ -255,6 +267,8 @@
 #define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
 #elif defined(__GNUC__)
 #define GGML_UNREACHABLE() __builtin_unreachable()
+#elif defined(_MSC_VER)
+#define GGML_UNREACHABLE() __assume(0)
 #else
 #define GGML_UNREACHABLE() ((void) 0)
 #endif
@ -303,7 +317,7 @@ extern "C" {

 #if defined(__ARM_NEON) && defined(__CUDACC__)
    typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON)
+#elif defined(__ARM_NEON) && !defined(_MSC_VER)
    typedef __fp16 ggml_fp16_t;
 #else
    typedef uint16_t ggml_fp16_t;
@ -337,12 +351,21 @@ extern "C" {
        GGML_TYPE_Q5_K = 13,
        GGML_TYPE_Q6_K = 14,
        GGML_TYPE_Q8_K = 15,
+        GGML_TYPE_IQ2_XXS = 16,
+        GGML_TYPE_IQ2_XS  = 17,
+        GGML_TYPE_IQ3_XXS = 18,
        GGML_TYPE_I8,
        GGML_TYPE_I16,
        GGML_TYPE_I32,
        GGML_TYPE_COUNT,
    };

+    // precision
+    enum ggml_prec {
+        GGML_PREC_DEFAULT,
+        GGML_PREC_F32,
+    };
+
    enum ggml_backend_type {
        GGML_BACKEND_CPU = 0,
        GGML_BACKEND_GPU = 10,
@ -365,6 +388,9 @@ extern "C" {
        GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
        GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
        GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ2_XS  = 16, // except 1d tensors
+        GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
    };

    // available tensor operations:
@ -465,6 +491,8 @@ extern "C" {
        GGML_UNARY_OP_GELU,
        GGML_UNARY_OP_GELU_QUICK,
        GGML_UNARY_OP_SILU,
+        GGML_UNARY_OP_HARDSWISH,
+        GGML_UNARY_OP_HARDSIGMOID,

        GGML_UNARY_OP_COUNT,
    };
@ -477,8 +505,15 @@ extern "C" {

    enum ggml_log_level {
        GGML_LOG_LEVEL_ERROR = 2,
-        GGML_LOG_LEVEL_WARN = 3,
-        GGML_LOG_LEVEL_INFO = 4
+        GGML_LOG_LEVEL_WARN  = 3,
+        GGML_LOG_LEVEL_INFO  = 4,
+        GGML_LOG_LEVEL_DEBUG = 5
+    };
+
+    enum ggml_tensor_flag {
+        GGML_TENSOR_FLAG_INPUT  = 1,
+        GGML_TENSOR_FLAG_OUTPUT = 2,
+        GGML_TENSOR_FLAG_PARAM  = 4,
    };

    // ggml object
@ -502,7 +537,6 @@ extern "C" {

        struct ggml_backend_buffer * buffer;

-        int     n_dims;
        int64_t ne[GGML_MAX_DIMS]; // number of elements
        size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
                                   // nb[0] = ggml_type_size(type)
@ -515,7 +549,7 @@ extern "C" {
        // op params - allocated as int32_t for alignment
        int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];

-        bool is_param;
+        int32_t flags;

        struct ggml_tensor * grad;
        struct ggml_tensor * src[GGML_MAX_SRC];
@ -534,11 +568,16 @@ extern "C" {

        void * extra; // extra things e.g. for ggml-cuda.cu

-        char padding[12];
+        char padding[8];
    };

    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

+    // Abort callback
+    // If not NULL, called before ggml computation
+    // If it returns true, the computation is aborted
+    typedef bool (*ggml_abort_callback)(void * data);
+
    // the compute plan that needs to be prepared for ggml_graph_compute()
    // since https://github.com/ggerganov/ggml/issues/287
    struct ggml_cplan {
@ -548,8 +587,8 @@ extern "C" {
        int n_threads;

        // abort ggml_graph_compute when true
-        bool (*abort_callback)(void * data);
-        void * abort_callback_data;
+        ggml_abort_callback abort_callback;
+        void *              abort_callback_data;
    };

    enum ggml_cgraph_eval_order {
@ -635,33 +674,41 @@ extern "C" {
    GGML_API void    ggml_print_object (const struct ggml_object * obj);
    GGML_API void    ggml_print_objects(const struct ggml_context * ctx);

-    GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
-    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
-    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);
+    GGML_API GGML_CALL int64_t ggml_nelements   (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL int64_t ggml_nrows       (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
+    GGML_API           size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN

-    GGML_API int     ggml_blck_size (enum ggml_type type);
-    GGML_API size_t  ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
-    GGML_API float   ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+    GGML_API GGML_CALL int    ggml_blck_size(enum ggml_type type);
+    GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
+    GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row

-    GGML_API const char * ggml_type_name(enum ggml_type type);
-    GGML_API const char * ggml_op_name  (enum ggml_op   op);
-    GGML_API const char * ggml_op_symbol(enum ggml_op   op);
+    GGML_DEPRECATED(
+    GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
+    "use ggml_row_size() instead");

-    GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
-    GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
+    GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
+    GGML_API GGML_CALL const char * ggml_op_name  (enum ggml_op   op);
+    GGML_API           const char * ggml_op_symbol(enum ggml_op   op);

-    GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor);
+    GGML_API           const char * ggml_unary_op_name(enum ggml_unary_op op);
+    GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name

-    GGML_API bool    ggml_is_quantized(enum ggml_type type);
+    GGML_API GGML_CALL size_t  ggml_element_size(const struct ggml_tensor * tensor);
+
+    GGML_API GGML_CALL bool    ggml_is_quantized(enum ggml_type type);

    // TODO: temporary until model loading of ggml examples is refactored
    GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);

-    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL bool ggml_is_permuted  (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_scalar    (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_vector    (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_matrix    (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_3d        (const struct ggml_tensor * tensor);
+    GGML_API           int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars

    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);

@ -722,8 +769,8 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);

    // Context tensor enumeration and lookup
-    GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
-    GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
+    GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);

    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
@ -748,7 +795,7 @@ extern "C" {
    GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
    GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);

-    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);

    GGML_API const char *         ggml_get_name   (const struct ggml_tensor * tensor);
    GGML_API struct ggml_tensor * ggml_set_name   (      struct ggml_tensor * tensor, const char * name);
@ -1000,6 +1047,16 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    // hardswish(x) = x * relu6(x + 3) / 6
+    GGML_API struct ggml_tensor * ggml_hardswish(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // hardsigmoid(x) = relu6(x + 3) / 6
+    GGML_API struct ggml_tensor * ggml_hardsigmoid(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
    // normalize along rows
    GGML_API struct ggml_tensor * ggml_norm(
            struct ggml_context * ctx,
@ -1050,6 +1107,12 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    // change the precision of a matrix multiplication
+    // set to GGML_PREC_F32 for higher precision (useful for phi-2)
+    GGML_API void ggml_mul_mat_set_prec(
+            struct ggml_tensor * a,
+            enum ggml_prec       prec);
+
    // indirect matrix multiplication
    //  ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
    GGML_API struct ggml_tensor * ggml_mul_mat_id(
@ -1075,13 +1138,13 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_scale(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            float                 s);

    // in-place, returns view(a)
    GGML_API struct ggml_tensor * ggml_scale_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            float                 s);

    // b -> view(a,offset,nb1,nb2,3), return modified a
    GGML_API struct ggml_tensor * ggml_set(
@ -1137,22 +1200,16 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

-    // a -> b, in-place, return view(b)
-    GGML_API struct ggml_tensor * ggml_cpy_inplace(
+    GGML_API struct ggml_tensor * ggml_cast(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            enum   ggml_type      type);

    // make contiguous
    GGML_API struct ggml_tensor * ggml_cont(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

-    // make contiguous, in-place
-    GGML_API struct ggml_tensor * ggml_cont_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
    // make contiguous, with new shape
    GGML_API struct ggml_tensor * ggml_cont_1d(
            struct ggml_context * ctx,
@ -1391,7 +1448,7 @@ extern "C" {
            float                 beta_slow);

    // compute correction dims for YaRN RoPE scaling
-    void ggml_rope_yarn_corr_dims(
+    GGML_CALL void ggml_rope_yarn_corr_dims(
        int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);

    // xPos RoPE, in-place, returns view(a)
@ -1449,7 +1506,19 @@ extern "C" {
            int                  p1,
            int                  d0,
            int                  d1,
-            bool                 is_2D);
+            bool                 is_2D,
+            enum ggml_type       dst_type);
+
+    GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                  s0,
+            int                  s1,
+            int                  p0,
+            int                  p1,
+            int                  d0,
+            int                  d1);

    GGML_API struct ggml_tensor * ggml_conv_1d(
            struct ggml_context * ctx,
@ -1825,8 +1894,8 @@ extern "C" {

    // ggml_graph_plan() has to be called before ggml_graph_compute()
    // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API int               ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API struct ggml_cplan ggml_graph_plan   (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API int               ggml_graph_compute(      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);

    // same as ggml_graph_compute() but the work data is allocated as a part of the context
    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
@ -2029,10 +2098,28 @@ extern "C" {
            ggml_opt_callback callback,
            void * callback_data);

+    //
+    // tensor flags
+    //
+    GGML_API void ggml_set_input(struct ggml_tensor * tensor);
+    GGML_API void ggml_set_output(struct ggml_tensor * tensor);
+
    //
    // quantization
    //

+    // - ggml_quantize_init can be called multiple times with the same type
+    //   it will only initialize the quantization tables for the first call or after ggml_quantize_free
+    //   automatically called by ggml_quantize_chunk for convenience
+    //
+    // - ggml_quantize_free will free any memory allocated by ggml_quantize_init
+    //   call this at the end of the program to avoid memory leaks
+    //
+    // note: these are thread-safe
+    //
+    GGML_API void ggml_quantize_init(enum ggml_type type);
+    GGML_API void ggml_quantize_free(void);
+
    // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
    GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
@ -2046,7 +2133,12 @@ extern "C" {
    GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);

-    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+    // some quantization type cannot be used without an importance matrix
+    GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
+
+    // calls ggml_quantize_init internally (i.e. can allocate memory)
+    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
+            int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);

    //
    // gguf
@ -2116,10 +2208,11 @@ extern "C" {
    GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
    GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);

-    GGML_API int    gguf_get_n_tensors    (const struct gguf_context * ctx);
-    GGML_API int    gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
-    GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
-    GGML_API char * gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
+    GGML_API int            gguf_get_n_tensors    (const struct gguf_context * ctx);
+    GGML_API int            gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
+    GGML_API size_t         gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
+    GGML_API char *         gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
+    GGML_API enum ggml_type gguf_get_tensor_type  (const struct gguf_context * ctx, int i);

    // overrides existing values or adds a new one
    GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
@ -2175,6 +2268,7 @@ extern "C" {
    //

    GGML_API int ggml_cpu_has_avx        (void);
+    GGML_API int ggml_cpu_has_avx_vnni   (void);
    GGML_API int ggml_cpu_has_avx2       (void);
    GGML_API int ggml_cpu_has_avx512     (void);
    GGML_API int ggml_cpu_has_avx512_vbmi(void);
@ -2189,10 +2283,14 @@ extern "C" {
    GGML_API int ggml_cpu_has_blas       (void);
    GGML_API int ggml_cpu_has_cublas     (void);
    GGML_API int ggml_cpu_has_clblast    (void);
+    GGML_API int ggml_cpu_has_vulkan     (void);
+    GGML_API int ggml_cpu_has_kompute    (void);
    GGML_API int ggml_cpu_has_gpublas    (void);
    GGML_API int ggml_cpu_has_sse3       (void);
    GGML_API int ggml_cpu_has_ssse3      (void);
+    GGML_API int ggml_cpu_has_sycl       (void);
    GGML_API int ggml_cpu_has_vsx        (void);
+    GGML_API int ggml_cpu_has_matmul_int8(void);

    //
    // Internal types and functions exposed for tests and benchmarks
@ -2206,7 +2304,8 @@ extern "C" {
 #endif
    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
-    typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+    typedef void (*ggml_vec_dot_t)   (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
+                                      const void * GGML_RESTRICT y, size_t by, int nrc);

    typedef struct {
        const char      * type_name;
@ -2218,6 +2317,7 @@ extern "C" {
        ggml_from_float_t from_float_reference;
        ggml_vec_dot_t    vec_dot;
        enum ggml_type    vec_dot_type;
+        int64_t           nrows; // number of rows to process simultaneously;
    } ggml_type_traits_t;

    GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
--- a/models/README.md
+++ b/models/README.md
@ -1,19 +1,16 @@
-## Whisper model files in custom ggml format
+## Whisper model files in custom `ggml` format

-The [original Whisper PyTorch models provided by OpenAI](https://github.com/openai/whisper/blob/main/whisper/__init__.py#L17-L27)
+The [original Whisper PyTorch models provided by OpenAI](https://github.com/openai/whisper/blob/main/whisper/__init__.py#L17-L30)
 are converted to custom `ggml` format in order to be able to load them in C/C++.
 Conversion is performed using the [convert-pt-to-ggml.py](convert-pt-to-ggml.py) script.

-You can either obtain the original models and generate the `ggml` files yourself using the conversion script,
-or you can use the [download-ggml-model.sh](download-ggml-model.sh) script to download the already converted models.
-Currently, they are hosted on the following locations:
+There are three ways to obtain `ggml` models:

- https://huggingface.co/ggerganov/whisper.cpp
- https://ggml.ggerganov.com
+### 1. Use [download-ggml-model.sh](download-ggml-model.sh) to download pre-converted models

-Sample download:
+Example download:

-```java
+```text
 $ ./download-ggml-model.sh base.en
 Downloading ggml model base.en ...
 models/ggml-base.en.bin          100%[=============================================>] 141.11M  5.41MB/s    in 22s
@ -23,35 +20,46 @@ You can now use it like this:
  $ ./main -m models/ggml-base.en.bin -f samples/jfk.wav
 ```

-To convert the files yourself, use the convert-pt-to-ggml.py script. Here is an example usage.
-The original PyTorch files are assumed to have been downloaded into ~/.cache/whisper
-Change `~/path/to/repo/whisper/` to the location for your copy of the Whisper source:
-```
+### 2. Manually download pre-converted models
+
+`ggml` models are available from the following locations:
+
+- https://huggingface.co/ggerganov/whisper.cpp/tree/main
+- https://ggml.ggerganov.com
+
+### 3. Convert with [convert-pt-to-ggml.py](convert-pt-to-ggml.py)
+
+Download one of the [models provided by OpenAI](https://github.com/openai/whisper/blob/main/whisper/__init__.py#L17-L30) and generate the `ggml` files using the [convert-pt-to-ggml.py](convert-pt-to-ggml.py) script.
+
+Example conversion, assuming the original PyTorch files have been downloaded into `~/.cache/whisper`. Change `~/path/to/repo/whisper/` to the location for your copy of the Whisper source:
+
+```bash
 mkdir models/whisper-medium
 python models/convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
 mv ./models/whisper-medium/ggml-model.bin models/ggml-medium.bin
 rmdir models/whisper-medium
 ```

-A third option to obtain the model files is to download them from Hugging Face:
-
-https://huggingface.co/ggerganov/whisper.cpp/tree/main
-
 ## Available models

-| Model     | Disk    | SHA                                        |
-| ---       | ---     | ---                                        |
-| tiny      |  75 MiB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
-| tiny.en   |  75 MiB | `c78c86eb1a8faa21b369bcd33207cc90d64ae9df` |
-| base      | 142 MiB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
-| base.en   | 142 MiB | `137c40403d78fd54d454da0f9bd998f78703390c` |
-| small     | 466 MiB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
-| small.en  | 466 MiB | `db8a495a91d927739e50b3fc1cc4c6b8f6c2d022` |
-| medium    | 1.5 GiB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
-| medium.en | 1.5 GiB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` |
-| large-v1  | 2.9 GiB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
-| large-v2  | 2.9 GiB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
-| large-v3  | 2.9 GiB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
+| Model         | Disk    | SHA                                        |
+| ------------- | ------- | ------------------------------------------ |
+| tiny          | 75 MiB  | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
+| tiny.en       | 75 MiB  | `c78c86eb1a8faa21b369bcd33207cc90d64ae9df` |
+| base          | 142 MiB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
+| base.en       | 142 MiB | `137c40403d78fd54d454da0f9bd998f78703390c` |
+| small         | 466 MiB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
+| small.en      | 466 MiB | `db8a495a91d927739e50b3fc1cc4c6b8f6c2d022` |
+| small.en-tdrz | 465 MiB | `b6c6e7e89af1a35c08e6de56b66ca6a02a2fdfa1` |
+| medium        | 1.5 GiB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
+| medium.en     | 1.5 GiB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` |
+| large-v1      | 2.9 GiB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
+| large-v2      | 2.9 GiB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
+| large-v2-q5_0 | 1.1 GiB | `00e39f2196344e901b3a2bd5814807a769bd1630` |
+| large-v3      | 2.9 GiB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
+| large-v3-q5_0 | 1.1 GiB | `e6e2ed78495d403bef4b7cff42ef4aaadcfea8de` |
+
+Models are multilingual unless the model name includes `.en`. Models ending in `-q5_0` are [quantized](../README.md#quantization). Models ending in `-tdrz` support local diarization (marking of speaker turns) using [tinydiarize](https://github.com/akashmjn/tinydiarize). More information about models is available [upstream (openai/whisper)](https://github.com/openai/whisper#available-models-and-languages). The list above is a subset of the models supported by the [download-ggml-model.sh](download-ggml-model.sh) script, but many more are available at https://huggingface.co/ggerganov/whisper.cpp/tree/main and elsewhere.

 ## Model files for testing purposes

--- a/models/convert-whisper-to-coreml.py
+++ b/models/convert-whisper-to-coreml.py
@ -143,20 +143,7 @@ class AudioEncoderANE(AudioEncoder):
            x = block(x)

        x = self.ln_post(x)
-
-        # """
-        # TODO:
-        # I think we need to transpose the result here to make it fit whisper.cpp memory order.
-        # However, even doing this, the results are still wrong. Kind of less wrong compared to
-        # not transposing, but still wrong.
-
-        # Also, I don't know why the original OpenAI implementation does not need to transpose
-
-        # transpose to (batch_size, n_ctx, n_state)
-        # x : torch.Tensor, shape = (batch_size, n_state, 1, n_ctx)
-
-        # """
-        # x = x.transpose(1,3)
+        x = x.squeeze(2).transpose(1, 2)

        return x

--- a/models/download-coreml-model.sh
+++ b/models/download-coreml-model.sh
@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh

 # This script downloads Whisper model files that have already been converted to Core ML format.
 # This way you don't have to convert them yourself.
@ -7,32 +7,32 @@ src="https://huggingface.co/datasets/ggerganov/whisper.cpp-coreml"
 pfx="resolve/main/ggml"

 # get the path of this script
-function get_script_path() {
+get_script_path() {
    if [ -x "$(command -v realpath)" ]; then
-        echo "$(dirname $(realpath $0))"
+       dirname "$(realpath "$0")"
    else
-        local ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)"
-        echo "$ret"
+        _ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 || exit ; pwd -P)"
+        echo "$_ret"
    fi
 }

 models_path="$(get_script_path)"

 # Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )
+models="tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3"

 # list available models
-function list_models {
-    printf "\n"
-    printf "  Available models:"
-    for model in "${models[@]}"; do
-        printf " $model"
-    done
-    printf "\n\n"
+list_models() {
+        printf "\n"
+        printf "  Available models:"
+        for model in $models; do
+                printf " %s" "$models"
+        done
+        printf "\n\n"
 }

 if [ "$#" -ne 1 ]; then
-    printf "Usage: $0 <model>\n"
+    printf "Usage: %s <model>\n" "$0"
    list_models

    exit 1
@ -40,8 +40,8 @@ fi

 model=$1

-if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
-    printf "Invalid model: $model\n"
+if ! echo "$models" | grep -q -w "$model"; then
+    printf "Invalid model: %s\n" "$model"
    list_models

    exit 1
@ -49,19 +49,19 @@ fi

 # download Core ML model

-printf "Downloading Core ML model $model from '$src' ...\n"
+printf "Downloading Core ML model %s from '%s' ...\n" "$model" "$src"

-cd $models_path
+cd "$models_path" || exit

 if [ -f "ggml-$model.mlmodel" ]; then
-    printf "Model $model already exists. Skipping download.\n"
+    printf "Model %s already exists. Skipping download.\n" "$model"
    exit 0
 fi

 if [ -x "$(command -v wget)" ]; then
-    wget --quiet --show-progress -O ggml-$model.mlmodel $src/$pfx-$model.mlmodel
+    wget --quiet --show-progress -O ggml-"$model".mlmodel $src/$pfx-"$model".mlmodel
 elif [ -x "$(command -v curl)" ]; then
-    curl -L --output ggml-$model.mlmodel $src/$pfx-$model.mlmodel
+    curl -L --output ggml-"$model".mlmodel $src/$pfx-"$model".mlmodel
 else
    printf "Either wget or curl is required to download models.\n"
    exit 1
@ -69,14 +69,14 @@ fi


 if [ $? -ne 0 ]; then
-    printf "Failed to download Core ML model $model \n"
+    printf "Failed to download Core ML model %s \n" "$model"
    printf "Please try again later or download the original Whisper model files and convert them yourself.\n"
    exit 1
 fi

-printf "Done! Model '$model' saved in 'models/ggml-$model.mlmodel'\n"
+printf "Done! Model '%s' saved in 'models/ggml-%s.mlmodel'\n" "$model" "$model"
 printf "Run the following command to compile it:\n\n"
-printf "  $ xcrun coremlc compile ./models/ggml-$model.mlmodel ./models\n\n"
+printf "  $ xcrun coremlc compile ./models/ggml-%s.mlmodel ./models\n\n" "$model"
 printf "You can now use it like this:\n\n"
-printf "  $ ./main -m models/ggml-$model.bin -f samples/jfk.wav\n"
+printf "  $ ./main -m models/ggml-%s.bin -f samples/jfk.wav\n" "$model"
 printf "\n"
--- a/models/download-ggml-model.sh
+++ b/models/download-ggml-model.sh
@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh

 # This script downloads Whisper model files that have already been converted to ggml format.
 # This way you don't have to convert them yourself.
@ -9,103 +9,114 @@
 src="https://huggingface.co/ggerganov/whisper.cpp"
 pfx="resolve/main/ggml"

+BOLD="\033[1m"
+RESET='\033[0m'
+
 # get the path of this script
-function get_script_path() {
+get_script_path() {
    if [ -x "$(command -v realpath)" ]; then
-        echo "$(dirname "$(realpath "$0")")"
+        dirname "$(realpath "$0")"
    else
-        local ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)"
-        echo "$ret"
+        _ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 || exit ; pwd -P)"
+        echo "$_ret"
    fi
 }

-models_path="$(get_script_path)"
+models_path="${2:-$(get_script_path)}"

 # Whisper models
-models=(
-    "tiny.en"
-    "tiny"
-    "tiny-q5_1"
-    "tiny.en-q5_1"
-    "base.en"
-    "base"
-    "base-q5_1"
-    "base.en-q5_1"
-    "small.en"
-    "small.en-tdrz"
-    "small"
-    "small-q5_1"
-    "small.en-q5_1"
-    "medium"
-    "medium.en"
-    "medium-q5_0"
-    "medium.en-q5_0"
-    "large-v1"
-    "large-v2"
-    "large-v3"
-    "large-q5_0"
-)
+models="tiny
+tiny.en
+tiny-q5_1
+tiny.en-q5_1
+base
+base.en
+base-q5_1
+base.en-q5_1
+small
+small.en
+small.en-tdrz
+small-q5_1
+small.en-q5_1
+medium
+medium.en
+medium-q5_0
+medium.en-q5_0
+large-v1
+large-v2
+large-v2-q5_0
+large-v3
+large-v3-q5_0"

 # list available models
-function list_models {
+list_models() {
    printf "\n"
-    printf "  Available models:"
-    for model in "${models[@]}"; do
-        printf " $model"
+    printf "Available models:"
+    model_class=""
+    for model in $models; do
+        this_model_class="${model%%[.-]*}"
+        if [ "$this_model_class" != "$model_class" ]; then
+            printf "\n "
+            model_class=$this_model_class
+        fi
+        printf " %s" "$model"
    done
    printf "\n\n"
 }

-if [ "$#" -ne 1 ]; then
-    printf "Usage: $0 <model>\n"
+if [ "$#" -lt 1 ] || [ "$#" -gt 2 ]; then
+    printf "Usage: %s <model> [models_path]\n" "$0"
    list_models
+    printf "___________________________________________________________\n"
+    printf "${BOLD}.en${RESET} = english-only ${BOLD}-q5_[01]${RESET} = quantized ${BOLD}-tdrz${RESET} = tinydiarize\n"

    exit 1
 fi

 model=$1

-if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
-    printf "Invalid model: $model\n"
+if ! echo "$models" | grep -q -w "$model"; then
+    printf "Invalid model: %s\n" "$model"
    list_models

    exit 1
 fi

 # check if model contains `tdrz` and update the src and pfx accordingly
-if [[ $model == *"tdrz"* ]]; then
+if echo "$model" | grep -q "tdrz"; then
    src="https://huggingface.co/akashmjn/tinydiarize-whisper.cpp"
    pfx="resolve/main/ggml"
 fi

+echo "$model" | grep -q '^"tdrz"*$'
+
 # download ggml model

-printf "Downloading ggml model $model from '$src' ...\n"
+printf "Downloading ggml model %s from '%s' ...\n" "$model" "$src"

-cd "$models_path"
+cd "$models_path" || exit

 if [ -f "ggml-$model.bin" ]; then
-    printf "Model $model already exists. Skipping download.\n"
+    printf "Model %s already exists. Skipping download.\n" "$model"
    exit 0
 fi

 if [ -x "$(command -v wget)" ]; then
-    wget --no-config --quiet --show-progress -O ggml-$model.bin $src/$pfx-$model.bin
+    wget --no-config --quiet --show-progress -O ggml-"$model".bin $src/$pfx-"$model".bin
 elif [ -x "$(command -v curl)" ]; then
-    curl -L --output ggml-$model.bin $src/$pfx-$model.bin
+    curl -L --output ggml-"$model".bin $src/$pfx-"$model".bin
 else
    printf "Either wget or curl is required to download models.\n"
    exit 1
 fi

-
 if [ $? -ne 0 ]; then
-    printf "Failed to download ggml model $model \n"
+    printf "Failed to download ggml model %s \n" "$model"
    printf "Please try again later or download the original Whisper model files and convert them yourself.\n"
    exit 1
 fi

-printf "Done! Model '$model' saved in 'models/ggml-$model.bin'\n"
+printf "Done! Model '%s' saved in '%s/ggml-%s.bin'\n" "$model" "$models_path" "$model"
 printf "You can now use it like this:\n\n"
-printf "  $ ./main -m models/ggml-$model.bin -f samples/jfk.wav\n"
+printf "  $ ./main -m %s/ggml-%s.bin -f samples/jfk.wav\n" "$models_path" "$model"
 printf "\n"
--- a/models/generate-coreml-interface.sh
+++ b/models/generate-coreml-interface.sh
@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh
 #
 # This generates:
 #   - coreml/whisper-encoder-impl.h and coreml/whisper-encoder-impl.m
@ -6,7 +6,7 @@
 #

 wd=$(dirname "$0")
-cd "$wd/../"
+cd "$wd/../" || exit

 python3 models/convert-whisper-to-coreml.py --model tiny.en

--- a/models/generate-coreml-model.sh
+++ b/models/generate-coreml-model.sh
@ -1,4 +1,4 @@
-#!/bin/bash
+#!/bin/sh

 # Usage: ./generate-coreml-model.sh <model-name>
 if [ $# -eq 0 ]; then
@ -6,7 +6,7 @@ if [ $# -eq 0 ]; then
  echo "Usage for Whisper models: ./generate-coreml-model.sh <model-name>"
  echo "Usage for HuggingFace models: ./generate-coreml-model.sh -h5 <model-name> <model-path>"
  exit 1
-elif [[ "$1" == "-h5" && $# != 3 ]]; then
+elif [ "$1" = "-h5" ] && [ $# != 3 ]; then
  echo "No model name and model path supplied for a HuggingFace model"
  echo "Usage for HuggingFace models: ./generate-coreml-model.sh -h5 <model-name> <model-path>"
  exit 1
@ -15,20 +15,20 @@ fi
 mname="$1"

 wd=$(dirname "$0")
-cd "$wd/../"
+cd "$wd/../" || exit

-if [[ $mname == "-h5" ]]; then
+if [ "$mname" = "-h5" ]; then
  mname="$2"
  mpath="$3"
-  echo $mpath
-  python3 models/convert-h5-to-coreml.py --model-name $mname --model-path $mpath --encoder-only True
+  echo "$mpath"
+  python3 models/convert-h5-to-coreml.py --model-name "$mname" --model-path "$mpath" --encoder-only True
 else
-  python3 models/convert-whisper-to-coreml.py --model $mname --encoder-only True
+  python3 models/convert-whisper-to-coreml.py --model "$mname" --encoder-only True --optimize-ane True
 fi

-xcrun coremlc compile models/coreml-encoder-${mname}.mlpackage models/
-rm -rf models/ggml-${mname}-encoder.mlmodelc
-mv -v models/coreml-encoder-${mname}.mlmodelc models/ggml-${mname}-encoder.mlmodelc
+xcrun coremlc compile models/coreml-encoder-"${mname}".mlpackage models/
+rm -rf models/ggml-"${mname}"-encoder.mlmodelc
+mv -v models/coreml-encoder-"${mname}".mlmodelc models/ggml-"${mname}"-encoder.mlmodelc

 # TODO: decoder (sometime in the future maybe)
 #xcrun coremlc compile models/whisper-decoder-${mname}.mlpackage models/
--- a/openvino/whisper-openvino-encoder.cpp
+++ b/openvino/whisper-openvino-encoder.cpp
@ -64,15 +64,15 @@ int whisper_openvino_encode(
        return 0;
    }

-    if (mel->n_dims != 2) {
+    if (ggml_n_dims(mel) != 2) {
        fprintf(stderr, "%s: Error! mel ggml_tensor expected to have n_dims=2, but it has n_dims=%d\n",
-            __func__, mel->n_dims);
+            __func__, ggml_n_dims(mel));
        return 0;
    }

-    if (out->n_dims != 2) {
+    if (ggml_n_dims(out) != 2) {
        fprintf(stderr, "%s: Error! out ggml_tensor expected to have n_dims=2, but it has n_dims=%d\n",
-            __func__, out->n_dims);
+            __func__, ggml_n_dims(out));
        return 0;
    }

@ -105,4 +105,4 @@ int whisper_openvino_encode(
    }

    return 1;
-}
+}
--- a/spm-headers/ggml.h
+++ b/spm-headers/ggml.h
@ -1 +0,0 @@
-../ggml.h
--- a/whisper.cpp
+++ b/whisper.cpp
--- a/whisper.h
+++ b/whisper.h
@ -86,6 +86,7 @@ extern "C" {

    struct whisper_context_params {
        bool  use_gpu;
+        int   gpu_device;  // CUDA device
    };

    typedef struct whisper_token_data {
@ -411,11 +412,6 @@ extern "C" {
    // If it returns false, the computation is aborted
    typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, struct whisper_state * state, void * user_data);

-    // Abort callback
-    // If not NULL, called before ggml computation
-    // If it returns true, the computation is aborted
-    typedef bool (*whisper_abort_callback)(void * user_data);
-
    // Logits filter callback
    // Can be used to modify the logits before sampling
    // If not NULL, called after applying temperature to logits
@ -512,7 +508,7 @@ extern "C" {
        void * encoder_begin_callback_user_data;

        // called each time before ggml computation starts
-        whisper_abort_callback abort_callback;
+        ggml_abort_callback abort_callback;
        void * abort_callback_user_data;

        // called by each decoder to filter obtained logits