release : v1.5.4

fix : cuda order of synchronization when setting a buffer (ggml/679)
* fix : cuda order of synchronization when setting a buffer * also sync before memcpy --------- Co-authored-by: slaren <slarengh@gmail.com>
2025-07-02 15:30:28 +02:00 · 2024-01-05 17:11:27 +02:00 · 2024-01-05 17:01:59 +02:00 · 2024-01-05 16:31:30 +02:00 · 2024-01-05 16:31:20 +02:00 · 2024-01-04 16:28:30 +02:00
45 changed files with 5254 additions and 2748 deletions
--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@ -0,0 +1,38 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG CUDA_VERSION=12.3.1
+# Target the CUDA build image
+ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the CUDA runtime image
+ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_CUDA_DEV_CONTAINER} AS build
+WORKDIR /app
+
+# Unless otherwise specified, we make a fat build.
+ARG CUDA_DOCKER_ARCH=all
+# Set nvcc architecture
+ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
+# Enable cuBLAS
+ENV WHISPER_CUBLAS=1
+
+RUN apt-get update && \
+    apt-get install -y build-essential \
+    && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
+
+# Ref: https://stackoverflow.com/a/53464012
+ENV CUDA_MAIN_VERSION=12.3
+ENV LD_LIBRARY_PATH /usr/local/cuda-${CUDA_MAIN_VERSION}/compat:$LD_LIBRARY_PATH
+
+COPY .. .
+RUN make
+
+FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
+WORKDIR /app
+
+RUN apt-get update && \
+  apt-get install -y curl ffmpeg \
+  && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
+
+COPY --from=build /app /app
+ENTRYPOINT [ "bash", "-c" ]
--- a/.devops/main.Dockerfile
+++ b/.devops/main.Dockerfile
@ -0,0 +1,19 @@
+FROM ubuntu:22.04 AS build
+WORKDIR /app
+
+RUN apt-get update && \
+  apt-get install -y build-essential \
+  && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
+
+COPY .. .
+RUN make
+
+FROM ubuntu:22.04 AS runtime
+WORKDIR /app
+
+RUN apt-get update && \
+  apt-get install -y curl ffmpeg \
+  && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
+
+COPY --from=build /app /app
+ENTRYPOINT [ "bash", "-c" ]
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -117,7 +117,6 @@ jobs:
            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
            set -e
            apt update
-            apt install -y clang
            apt install -y clang build-essential cmake libsdl2-dev
            cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
            make
@ -167,7 +166,7 @@ jobs:
            s2arc: x64
            jnaPath: win32-x86-64
          - sdl2: ON
-            s2ver: 2.26.0
+            s2ver: 2.28.5

    steps:
      - name: Clone
@ -224,11 +223,14 @@ jobs:
          - arch: Win32
            obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x86.zip
            s2arc: x86
+            clblast: OFF
          - arch: x64
            obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x64.zip
            s2arc: x64
+            clblast: ON
+            clver: 1.6.1
          - sdl2: ON
-            s2ver: 2.26.0
+            s2ver: 2.28.5

    steps:
      - name: Clone
@ -253,6 +255,18 @@ jobs:
          7z x sdl2.zip
          echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV

+      - name: Install OpenCL
+        if: matrix.clblast == 'ON'
+        run: vcpkg.exe --triplet=${{ matrix.arch }}-windows install opencl
+
+      - name: Fetch CLBlast and set CLBlast_DIR
+        if: matrix.clblast == 'ON'
+        run: |
+          C:/msys64/usr/bin/wget.exe -qO clblast.zip https://github.com/CNugteren/CLBlast/releases/download/${{ matrix.clver }}/CLBlast-${{ matrix.clver }}-windows-x64.zip
+          7z x clblast.zip
+          7z x CLBlast-${{ matrix.clver }}-windows-x64.7z
+          echo "CLBlast_DIR=$env:GITHUB_WORKSPACE/CLBlast-${{ matrix.clver }}-windows-x64/lib/cmake/CLBlast" >> $env:GITHUB_ENV
+
      - name: Configure
        run: >
          cmake -S . -B ./build -A ${{ matrix.arch }}
@ -260,6 +274,7 @@ jobs:
          -DWHISPER_OPENBLAS=${{ matrix.blas }}
          -DCMAKE_LIBRARY_PATH="$env:OPENBLAS_PATH/lib"
          -DWHISPER_SDL2=${{ matrix.sdl2 }}
+          -DWHISPER_CLBLAST=${{ matrix.clblast }}

      - name: Build
        run: |
@ -274,11 +289,15 @@ jobs:
        if: matrix.sdl2 == 'ON'
        run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}

+      - name: Copy clblast.dll
+        if: matrix.clblast == 'ON'
+        run: copy "$env:CLBlast_DIR/../../clblast.dll" build/bin/${{ matrix.build }}
+
      - name: Upload binaries
        if: matrix.blas == 'ON' && matrix.sdl2 == 'ON'
        uses: actions/upload-artifact@v1
        with:
-          name: whisper-blas-bin-${{ matrix.arch }}
+          name: whisper-blas${{ matrix.clblast == 'ON' && '-clblast' || ''}}-bin-${{ matrix.arch }}
          path: build/bin/${{ matrix.build }}

  windows-cublas:
@ -295,7 +314,7 @@ jobs:
          - arch: x64
            s2arc: x64
          - sdl2: ON
-            s2ver: 2.26.0
+            s2ver: 2.28.5

    steps:
      - name: Clone
@ -321,7 +340,8 @@ jobs:
        run: >
          cmake -S . -B ./build -A ${{ matrix.arch }}
          -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-          -DWHISPER_CUBLAS=1
+          -DWHISPER_CUBLAS=${{ matrix.cublas }}
+          -DWHISPER_SDL2=${{ matrix.sdl2 }}

      - name: Build ${{ matrix.cuda-toolkit }}
        run: |
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@ -0,0 +1,57 @@
+name: Publish Docker image
+
+on:
+  pull_request:
+  push:
+    branches:
+      - master
+
+jobs:
+  push_to_registry:
+    name: Push Docker image to Docker Hub
+    if: github.event.pull_request.draft == false
+
+    runs-on: ubuntu-latest
+    env:
+      COMMIT_SHA: ${{ github.sha }}
+    strategy:
+      matrix:
+        config:
+          - { tag: "main", dockerfile: ".devops/main.Dockerfile", platform: "linux/amd64,linux/arm64" }
+          - { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }
+
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v3
+
+      - name: Set up QEMU
+        uses: docker/setup-qemu-action@v3
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.repository_owner }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build and push Docker image (versioned)
+        if: github.event_name == 'push'
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          push: true
+          platforms: ${{ matrix.config.platforms }}
+          tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+          file: ${{ matrix.config.dockerfile }}
+
+      - name: Build and push Docker image (tagged)
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          push: ${{ github.event_name == 'push' }}
+          platforms: ${{ matrix.config.platforms }}
+          tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}"
+          file: ${{ matrix.config.dockerfile }}
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,6 +1,6 @@
 cmake_minimum_required (VERSION 3.5)

-project(whisper.cpp VERSION 1.5.2)
+project(whisper.cpp VERSION 1.5.4)

 # Add path to modules
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@ -218,11 +218,17 @@ if (WHISPER_CUBLAS)
        add_compile_definitions(GGML_USE_CUBLAS)

        if (WHISPER_STATIC)
-            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+            if (WIN32)
+                # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
+                set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
+            else ()
+                set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+            endif()
        else()
            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
        endif()

+        set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cuda_driver)
    else()
        message(FATAL_ERROR "cuBLAS not found")
    endif()
@ -338,8 +344,8 @@ else()
        endif()
    else()
        if (EMSCRIPTEN)
-            set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -pthread")
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
+            set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -pthread -s TOTAL_STACK=5242880")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -s TOTAL_STACK=5242880")
        else()
            if(NOT WHISPER_NO_AVX)
                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
@ -521,7 +527,13 @@ endif()

 if (GGML_SOURCES_CUDA)
    message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
-    set_property(TARGET whisper PROPERTY CUDA_ARCHITECTURES OFF)
+    # Only configure gmml CUDA architectures is not globally set
+    if (NOT DEFINED GGML_CUDA_ARCHITECTURES)
+        # Not overriden by user, so set defaults
+        set(GGML_CUDA_ARCHITECTURES 52 61 70)
+    endif()
+    message(STATUS "GGML Configuring CUDA architectures ${GGML_CUDA_ARCHITECTURES}")
+    set_property(TARGET whisper PROPERTY CUDA_ARCHITECTURES ${GGML_CUDA_ARCHITECTURES})
    set_property(TARGET whisper PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
 endif()

--- a/2
+++ b/2
@ -206,7 +206,7 @@ ifdef WHISPER_CUBLAS

 	CFLAGS      += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	CXXFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
-	LDFLAGS     += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib
+	LDFLAGS     += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib
 	WHISPER_OBJ += ggml-cuda.o
 	NVCC        = nvcc
 	NVCCFLAGS   = --forward-unknown-to-host-compiler -arch=$(CUDA_ARCH_FLAG)
--- a/Package.swift
+++ b/Package.swift
@ -13,9 +13,13 @@ let package = Package(
    products: [
        .library(name: "whisper", targets: ["whisper"]),
    ],
+    dependencies: [
+        .package(url: "https://github.com/ggerganov/ggml.git", .branch("master"))
+    ],
    targets: [
        .target(
            name: "whisper",
+            dependencies: ["ggml"],
            path: ".",
            exclude: [
               "bindings",
@ -32,14 +36,8 @@ let package = Package(
               "Makefile"
            ],
            sources: [
-                "ggml.c",
                "whisper.cpp",
-                "ggml-alloc.c",
-                "ggml-backend.c",
-                "ggml-quants.c",
-                "ggml-metal.m"
            ],
-            resources: [.process("ggml-metal.metal")],
            publicHeadersPath: "spm-headers",
            cSettings: [
                .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
--- a/README.md
+++ b/README.md
@ -6,7 +6,7 @@
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)

-Stable: [v1.5.2](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.5.2) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+Stable: [v1.5.4](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.5.4) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)

 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

@ -33,6 +33,7 @@ Supported platforms:
 - [x] [WebAssembly](examples/whisper.wasm)
 - [x] Windows ([MSVC](https://github.com/ggerganov/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggerganov/whisper.cpp/issues/168)]
 - [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/discussions/166)
+- [x] [docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)

 The entire high-level implementation of the model is contained in [whisper.h](whisper.h) and [whisper.cpp](whisper.cpp).
 The rest of the code is part of the [ggml](https://github.com/ggerganov/ggml) machine learning library.
@ -448,6 +449,36 @@ make clean
 WHISPER_OPENBLAS=1 make -j
 ```

+## Docker
+
+### Prerequisites
+* Docker must be installed and running on your system.
+* Create a folder to store big models & intermediate files (ex. /whisper/models)
+
+### Images
+We have two Docker images available for this project:
+
+1. `ghcr.io/ggerganov/whisper.cpp:main`: This image includes the main executable file as well as `curl` and `ffmpeg`. (platforms: `linux/amd64`, `linux/arm64`)
+2. `ghcr.io/ggerganov/whisper.cpp:main-cuda`: Same as `main` but compiled with CUDA support. (platforms: `linux/amd64`)
+
+### Usage
+
+```shell
+# download model and persist it in a local folder
+docker run -it --rm \
+  -v path/to/models:/models \
+  whisper.cpp:main "./models/download-ggml-model.sh base /models"
+# transcribe an audio file
+docker run -it --rm \
+  -v path/to/models:/models \
+  -v path/to/audios:/audios \
+  whisper.cpp:main "./main -m /models/ggml-base.bin -f /audios/jfk.wav"
+# transcribe an audio file in samples folder
+docker run -it --rm \
+  -v path/to/models:/models \
+  whisper.cpp:main "./main -m /models/ggml-base.bin -f ./samples/jfk.wav"
+```
+
 ## Limitations

 - Inference only
--- a/bindings/ios
+++ b/bindings/ios
--- a/bindings/javascript/package.json
+++ b/bindings/javascript/package.json
@ -1,6 +1,6 @@
 {
  "name": "whisper.cpp",
-  "version": "1.5.2",
+  "version": "1.5.4",
  "description": "Whisper speech recognition",
  "main": "whisper.js",
  "scripts": {
--- a/bindings/ruby/ext/ggml-backend-impl.h
+++ b/bindings/ruby/ext/ggml-backend-impl.h
@ -70,7 +70,7 @@ extern "C" {
        void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

        // compute graph without a plan
-        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

        // check if the backend supports an operation
        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
--- a/bindings/ruby/ext/ggml-backend.c
+++ b/bindings/ruby/ext/ggml-backend.c
@ -156,8 +156,8 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_
    backend->iface.graph_plan_compute(backend, plan);
 }

-void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    backend->iface.graph_compute(backend, cgraph);
+bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    return backend->iface.graph_compute(backend, cgraph);
 }

 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
--- a/bindings/ruby/ext/ggml-backend.h
+++ b/bindings/ruby/ext/ggml-backend.h
@ -52,7 +52,7 @@ extern "C" {

    GGML_API void ggml_backend_graph_plan_free   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
    GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API void ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API bool ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
    GGML_API bool ggml_backend_supports_op       (ggml_backend_t backend, const struct ggml_tensor * op);

    // tensor copy between different backends
--- a/coreml/whisper-encoder.mm
+++ b/coreml/whisper-encoder.mm
@ -24,9 +24,9 @@ struct whisper_coreml_context * whisper_coreml_init(const char * path_model) {

    // select which device to run the Core ML model on
    MLModelConfiguration *config = [[MLModelConfiguration alloc] init];
-    config.computeUnits = MLComputeUnitsCPUAndGPU;
+    // config.computeUnits = MLComputeUnitsCPUAndGPU;
    //config.computeUnits = MLComputeUnitsCPUAndNeuralEngine;
-    //config.computeUnits = MLComputeUnitsAll;
+    config.computeUnits = MLComputeUnitsAll;

    const void * data = CFBridgingRetain([[whisper_encoder_impl alloc] initWithContentsOfURL:url_model configuration:config error:nil]);

--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -14,6 +14,10 @@ if (WHISPER_SDL2)
    message(STATUS "SDL2_LIBRARIES = ${SDL2_LIBRARIES}")
 endif()

+if (WHISPER_CLBLAST)
+    find_package(CLBlast REQUIRED)
+endif()
+
 # common

 set(TARGET common)
--- a/examples/talk-llama/CMakeLists.txt
+++ b/examples/talk-llama/CMakeLists.txt
@ -1,30 +1,18 @@
 if (WHISPER_SDL2)
    # talk-llama
    set(TARGET talk-llama)
-    #add_executable(${TARGET} talk-llama.cpp llama.cpp)
-    #target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
-    #target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
+    add_executable(${TARGET} talk-llama.cpp llama.cpp)
+    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})

-    # TODO: this is temporary
-    #       need to export ggml symbols for MSVC, but too lazy ..
-    add_executable(${TARGET}
-        talk-llama.cpp
-        llama.cpp
-        ../common.cpp
-        ../common-sdl.cpp
-        ../../ggml.c
-        ../../ggml-alloc.c
-        ../../ggml-backend.c
-        ../../ggml-quants.c
-        ../../whisper.cpp)
+    if (WHISPER_CLBLAST)
+        set(CLBLAST_LIBNAME clblast)
+    endif ()
+    target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CLBLAST_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})

    if(WIN32)
-    # It requires Windows 8.1 or later for PrefetchVirtualMemory
-    target_compile_definitions(${TARGET} PRIVATE -D_WIN32_WINNT=0x0602)
+        # It requires Windows 8.1 or later for PrefetchVirtualMemory
+        target_compile_definitions(${TARGET} PRIVATE -D_WIN32_WINNT=0x0602)
    endif()

-    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
-    target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
-
    include(DefaultTargetOptions)
 endif ()
--- a/examples/talk-llama/llama.cpp
+++ b/examples/talk-llama/llama.cpp
--- a/examples/talk-llama/llama.h
+++ b/examples/talk-llama/llama.h
@ -39,10 +39,11 @@

 #define LLAMA_MAX_RNG_STATE (64*1024)

+#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'

 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 2
+#define LLAMA_SESSION_VERSION 3

 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
@ -126,7 +127,7 @@ extern "C" {
        bool sorted;
    } llama_token_data_array;

-    typedef void (*llama_progress_callback)(float progress, void *ctx);
+    typedef bool (*llama_progress_callback)(float progress, void *ctx);

    // Input data for llama_decode
    // A llama_batch object can contain input about one or many sequences
@ -158,16 +159,38 @@ extern "C" {
        llama_seq_id all_seq_id; // used if seq_id == NULL
    } llama_batch;

+    enum llama_model_kv_override_type {
+        LLAMA_KV_OVERRIDE_INT,
+        LLAMA_KV_OVERRIDE_FLOAT,
+        LLAMA_KV_OVERRIDE_BOOL,
+    };
+
+    struct llama_model_kv_override {
+        char key[128];
+        enum llama_model_kv_override_type tag;
+        union {
+            int64_t int_value;
+            double float_value;
+            bool bool_value;
+        };
+    };
+
    struct llama_model_params {
        int32_t n_gpu_layers; // number of layers to store in VRAM
        int32_t main_gpu;     // the GPU that is used for scratch and small tensors
        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)

-        // called with a progress value between 0 and 1, pass NULL to disable
+        // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+        // If the provided progress_callback returns true, model loading continues.
+        // If it returns false, model loading is immediately aborted.
        llama_progress_callback progress_callback;
+
        // context pointer passed to the progress callback
        void * progress_callback_user_data;

+        // override key-value pairs of the model meta data
+        const struct llama_model_kv_override * kv_overrides;
+
        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool vocab_only; // only load the vocabulary, no weights
        bool use_mmap;   // use mmap if possible
@ -185,17 +208,20 @@ extern "C" {
        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
        float    rope_freq_base;   // RoPE base frequency, 0 = from model
        float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
-        float    yarn_ext_factor;  // YaRN extrapolation mix factor, NaN = from model
+        float    yarn_ext_factor;  // YaRN extrapolation mix factor, negative = from model
        float    yarn_attn_factor; // YaRN magnitude scaling factor
        float    yarn_beta_fast;   // YaRN low correction dim
        float    yarn_beta_slow;   // YaRN high correction dim
        uint32_t yarn_orig_ctx;    // YaRN original context size

+        enum ggml_type type_k; // data type for K cache
+        enum ggml_type type_v; // data type for V cache
+
        // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-        bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
-        bool logits_all; // the llama_eval() call computes all logits, not just the last one
-        bool embedding;  // embedding mode only
+        bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
+        bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+        bool embedding;   // embedding mode only
+        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
    };

    // model quantization parameters
@ -290,7 +316,9 @@ extern "C" {

    LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);

-    LLAMA_API int llama_n_ctx      (const struct llama_context * ctx);
+    // TODO: become more consistent with returned int types across the API
+    LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);

    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);

@ -301,6 +329,23 @@ extern "C" {
    // Get the model's RoPE frequency scaling factor
    LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);

+    // Functions to access the model's GGUF metadata scalar values
+    // - The functions return the length of the string on success, or -1 on failure
+    // - The output string is always null-terminated and cleared on failure
+    // - GGUF array values are not supported by these functions
+
+    // Get metadata value as a string by key name
+    LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
+
+    // Get the number of metadata key/value pairs
+    LLAMA_API int llama_model_meta_count(const struct llama_model * model);
+
+    // Get metadata key name by index
+    LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
+    // Get metadata value as a string by index
+    LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
    // Get a string describing the model type
    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);

@ -344,9 +389,60 @@ extern "C" {
    // KV cache
    //

-    // Returns the number of tokens in the KV cache
-    LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
-            "avoid using this, it will be removed in the future, instead - count the tokens in user code");
+    // Information associated with an individual cell in the KV cache view.
+    struct llama_kv_cache_view_cell {
+        // The position for this cell. Takes KV cache shifts into account.
+        // May be negative if the cell is not populated.
+        llama_pos pos;
+    };
+
+    // An updateable view of the KV cache.
+    struct llama_kv_cache_view {
+        // Number of KV cache cells. This will be the same as the context size.
+        int32_t n_cells;
+
+        // Maximum number of sequences that can exist in a cell. It's not an error
+        // if there are more sequences in a cell than this value, however they will
+        // not be visible in the view cells_sequences.
+        int32_t n_max_seq;
+
+        // Number of tokens in the cache. For example, if there are two populated
+        // cells, the first with 1 sequence id in it and the second with 2 sequence
+        // ids then you'll have 3 tokens.
+        int32_t token_count;
+
+        // Number of populated cache cells.
+        int32_t used_cells;
+
+        // Maximum contiguous empty slots in the cache.
+        int32_t max_contiguous;
+
+        // Index to the start of the max_contiguous slot range. Can be negative
+        // when cache is full.
+        int32_t max_contiguous_idx;
+
+        // Information for an individual cell.
+        struct llama_kv_cache_view_cell * cells;
+
+        // The sequences for each cell. There will be n_max_seq items per cell.
+        llama_seq_id * cells_sequences;
+    };
+
+    // Create an empty KV cache view. (use only for debugging purposes)
+    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+
+    // Free a KV cache view. (use only for debugging purposes)
+    LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
+
+    // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+    LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+
+    // Returns the number of tokens in the KV cache (slow, use only for debug)
+    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+
+    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
+    LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx);

    // Clear the KV cache
    LLAMA_API void llama_kv_cache_clear(
@ -517,6 +613,12 @@ extern "C" {
    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line

+    // Returns -1 if unknown, 1 for true or 0 for false.
+    LLAMA_API int         llama_add_bos_token(const struct llama_model * model);
+
+    // Returns -1 if unknown, 1 for true or 0 for false.
+    LLAMA_API int         llama_add_eos_token(const struct llama_model * model);
+
    // codellama infill tokens
    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
    LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
--- a/examples/talk-llama/talk-llama.cpp
+++ b/examples/talk-llama/talk-llama.cpp
@ -282,7 +282,6 @@ int main(int argc, char ** argv) {
    // tune these to your liking
    lcparams.n_ctx      = 2048;
    lcparams.seed       = 1;
-    lcparams.f16_kv     = true;
    lcparams.n_threads  = params.n_threads;

    struct llama_context * ctx_llama = llama_new_context_with_model(model_llama, lcparams);
--- a/examples/talk.wasm/gpt-2.cpp
+++ b/examples/talk.wasm/gpt-2.cpp
@ -155,33 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
        const int n_ctx   = hparams.n_ctx;
        const int n_vocab = hparams.n_vocab;

-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
+        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
+        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b

-        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // wte
-        ctx_size +=   n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
-        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // lm_head
+        ctx_size += n_vocab*ggml_row_size(wtype, n_embd);         // wte
+        ctx_size +=   n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
+        ctx_size += n_vocab*ggml_row_size(wtype, n_embd);         // lm_head

-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b

-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b

-        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype));         // c_attn_attn_w
-        ctx_size += n_layer*(       3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         3*n_embd*n_embd)); // c_attn_attn_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd));        // c_attn_attn_b

-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype));           // c_attn_proj_w
-        ctx_size += n_layer*(       n_embd*ggml_type_sizef(GGML_TYPE_F32));   // c_attn_proj_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         n_embd*n_embd)); // c_attn_proj_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));        // c_attn_proj_b

-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_fc_w
-        ctx_size += n_layer*(       4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_fc_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_fc_b

-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w
-        ctx_size += n_layer*(         n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_proj_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32,   n_embd));        // c_mlp_proj_b

-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
+        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
+        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v

        ctx_size += (6 + 12*n_layer)*256; // object overhead

@ -524,8 +524,7 @@ bool gpt2_eval(
            struct ggml_tensor * KQ_scaled =
                ggml_scale(ctx0,
                        KQ,
-                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
-                        );
+                        1.0f/sqrt(float(n_embd)/n_head));

            // KQ_masked = mask_past(KQ_scaled)
            // [n_past + N, N, 12]
--- a/examples/talk/gpt-2.cpp
+++ b/examples/talk/gpt-2.cpp
@ -155,33 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
        const int n_ctx   = hparams.n_ctx;
        const int n_vocab = hparams.n_vocab;

-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
-        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
+        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
+        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b

-        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // wte
-        ctx_size +=   n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
-        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // lm_head
+        ctx_size += n_vocab*ggml_row_size(wtype, n_embd);         // wte
+        ctx_size +=   n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
+        ctx_size += n_vocab*ggml_row_size(wtype, n_embd);         // lm_head

-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b

-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
-        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b

-        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype));         // c_attn_attn_w
-        ctx_size += n_layer*(       3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         3*n_embd*n_embd)); // c_attn_attn_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd));        // c_attn_attn_b

-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype));           // c_attn_proj_w
-        ctx_size += n_layer*(       n_embd*ggml_type_sizef(GGML_TYPE_F32));   // c_attn_proj_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         n_embd*n_embd)); // c_attn_proj_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));        // c_attn_proj_b

-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_fc_w
-        ctx_size += n_layer*(       4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_fc_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_fc_b

-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w
-        ctx_size += n_layer*(         n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
+        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_proj_w
+        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32,   n_embd));        // c_mlp_proj_b

-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
+        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
+        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v

        ctx_size += (6 + 12*n_layer)*256; // object overhead

@ -525,8 +525,7 @@ bool gpt2_eval(
            struct ggml_tensor * KQ_scaled =
                ggml_scale(ctx0,
                        KQ,
-                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
-                        );
+                        1.0f/sqrt(float(n_embd)/n_head));

            // KQ_masked = mask_past(KQ_scaled)
            // [n_past + N, N, 12]
--- a/examples/wchess/README.md
+++ b/examples/wchess/README.md
@ -33,8 +33,13 @@ White's turn

 ## TODO

- Improve web-browser audio capture - sometimes it does not record the voice properly
- Add support for more languages by making the generated grammar string multi-lingual
 - Fix bugs in the chess moves logic
+- Improve web-browser audio capture - sometimes it does not record the voice properly
+- Add support for more languages by making the generated grammar string multilingual
+- Explore ways to improve the dynamic grammar to be narrower

 PRs welcome!
+
+## Thanks
+
+- [chessboardjs](https://chessboardjs.com) for the neat chessboard JS library used in this demo
--- a/examples/whisper.swiftui/.gitignore
+++ b/examples/whisper.swiftui/.gitignore
@ -0,0 +1,2 @@
+xcuserdata
+xcshareddata
--- a/extra/bench.py
+++ b/extra/bench.py
@ -61,7 +61,9 @@ models = [
    "ggml-small.bin",
    "ggml-medium.en.bin",
    "ggml-medium.bin",
-    "ggml-large.bin",
+    "ggml-large-v1.bin",
+    "ggml-large-v2.bin",
+    "ggml-large-v3.bin",
 ]


--- a/extra/sync-ggml-am.sh
+++ b/extra/sync-ggml-am.sh
@ -0,0 +1,154 @@
+#!/bin/bash
+#
+# Synchronize ggml changes to whisper.cpp
+#
+# Usage:
+#
+#   $ cd /path/to/whisper.cpp
+#   $ ./extra/sync-ggml-am.sh
+#
+
+set -e
+
+sd=$(dirname $0)
+cd $sd/../
+
+SRC_WHISPER=$(pwd)
+SRC_GGML=$(cd ../ggml; pwd)
+
+if [ ! -d $SRC_GGML ]; then
+    echo "ggml not found at $SRC_GGML"
+    exit 1
+fi
+
+lc=$(cat $SRC_WHISPER/extra/sync-ggml.last)
+echo "Syncing ggml changes since commit $lc"
+
+cd $SRC_GGML
+
+git log --oneline $lc..HEAD
+git log --oneline $lc..HEAD --reverse | grep -v "(whisper/[0-9]*)" | cut -d' ' -f1 > $SRC_WHISPER/ggml-commits
+
+if [ ! -s $SRC_WHISPER/ggml-commits ]; then
+    rm -v $SRC_WHISPER/ggml-commits
+    echo "No new commits"
+    exit 0
+fi
+
+if [ -f $SRC_WHISPER/ggml-src.patch ]; then
+    rm -v $SRC_WHISPER/ggml-src.patch
+fi
+
+while read c; do
+    git format-patch -k $c~1..$c --stdout -- \
+        include/ggml/ggml*.h \
+        src/ggml*.h \
+        src/ggml*.c \
+        src/ggml*.cpp \
+        src/ggml*.m \
+        src/ggml*.metal \
+        src/ggml*.cu \
+        examples/common.h \
+        examples/common.cpp \
+        examples/common-ggml.h \
+        examples/common-ggml.cpp \
+        examples/whisper/whisper.h \
+        examples/whisper/whisper.cpp \
+        examples/whisper/main.cpp \
+        examples/whisper/quantize.cpp \
+        >> $SRC_WHISPER/ggml-src.patch
+done < $SRC_WHISPER/ggml-commits
+
+rm -v $SRC_WHISPER/ggml-commits
+
+# delete files if empty
+if [ ! -s $SRC_WHISPER/ggml-src.patch ]; then
+    rm -v $SRC_WHISPER/ggml-src.patch
+fi
+
+cd $SRC_WHISPER
+
+if [ -f $SRC_WHISPER/ggml-src.patch ]; then
+    # replace PR numbers
+    #
+    # Subject: some text (#1234)
+    # Subject: some text (ggml/1234)
+    cat ggml-src.patch | sed -e 's/^Subject: \(.*\) (#\([0-9]*\))/Subject: \1 (ggml\/\2)/' > ggml-src.patch.tmp
+    mv ggml-src.patch.tmp ggml-src.patch
+
+    cat ggml-src.patch | sed -e 's/^\(.*\) (#\([0-9]*\))$/\1 (ggml\/\2)/' > ggml-src.patch.tmp
+    mv ggml-src.patch.tmp ggml-src.patch
+
+    # replace filenames:
+    #
+    # src/ggml.c                  -> ggml.c
+    # src/ggml-alloc.c            -> ggml-alloc.c
+    # src/ggml-backend-impl.h     -> ggml-backend-impl.h
+    # src/ggml-backend.c          -> ggml-backend.c
+    # src/ggml-cuda.cu            -> ggml-cuda.cu
+    # src/ggml-cuda.h             -> ggml-cuda.h
+    # src/ggml-impl.h             -> ggml-impl.h
+    # src/ggml-metal.h            -> ggml-metal.h
+    # src/ggml-metal.m            -> ggml-metal.m
+    # src/ggml-mpi.h              -> ggml-mpi.h
+    # src/ggml-mpi.c              -> ggml-mpi.c
+    # src/ggml-opencl.cpp         -> ggml-opencl.cpp
+    # src/ggml-opencl.h           -> ggml-opencl.h
+    # src/ggml-quants.c           -> ggml-quants.c
+    # src/ggml-quants.h           -> ggml-quants.h
+    # include/ggml/ggml.h         -> ggml.h
+    # include/ggml/ggml-alloc.h   -> ggml-alloc.h
+    # include/ggml/ggml-backend.h -> ggml-backend.h
+    #
+    # examples/common.h           -> examples/common.h
+    # examples/common.cpp         -> examples/common.cpp
+    # examples/common-ggml.h      -> examples/common-ggml.h
+    # examples/common-ggml.cpp    -> examples/common-ggml.cpp
+    #
+    # examples/whisper/whisper.h    -> whisper.h
+    # examples/whisper/whisper.cpp  -> whisper.cpp
+    # examples/whisper/main.cpp     -> examples/main/main.cpp
+    # examples/whisper/quantize.cpp -> examples/quantize/quantize.cpp
+
+    cat ggml-src.patch | sed \
+        -e 's/src\/ggml\.c/ggml.c/g' \
+        -e 's/src\/ggml-alloc\.c/ggml-alloc.c/g' \
+        -e 's/src\/ggml-backend-impl\.h/ggml-backend-impl.h/g' \
+        -e 's/src\/ggml-backend\.c/ggml-backend.c/g' \
+        -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \
+        -e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \
+        -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
+        -e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
+        -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
+        -e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
+        -e 's/src\/ggml-mpi\.c/ggml-mpi.c/g' \
+        -e 's/src\/ggml-opencl\.cpp/ggml-opencl.cpp/g' \
+        -e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
+        -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
+        -e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
+        -e 's/include\/ggml\/ggml\.h/ggml.h/g' \
+        -e 's/include\/ggml\/ggml-alloc\.h/ggml-alloc.h/g' \
+        -e 's/include\/ggml\/ggml-backend\.h/ggml-backend.h/g' \
+        -e 's/examples\/common\.h/examples\/common.h/g' \
+        -e 's/examples\/common\.cpp/examples\/common.cpp/g' \
+        -e 's/examples\/common-ggml\.h/examples\/common-ggml.h/g' \
+        -e 's/examples\/common-ggml\.cpp/examples\/common-ggml.cpp/g' \
+        -e 's/examples\/whisper\/whisper\.h/whisper.h/g' \
+        -e 's/examples\/whisper\/whisper\.cpp/whisper.cpp/g' \
+        -e 's/examples\/whisper\/main\.cpp/examples\/main\/main.cpp/g' \
+        -e 's/examples\/whisper\/quantize\.cpp/examples\/quantize\/quantize.cpp/g' \
+        > ggml-src.patch.tmp
+    mv ggml-src.patch.tmp ggml-src.patch
+
+    git am ggml-src.patch
+
+    rm -v $SRC_WHISPER/ggml-src.patch
+fi
+
+# update last commit
+cd $SRC_GGML
+git log -1 --format=%H > $SRC_WHISPER/extra/sync-ggml.last
+
+echo "Done"
+
+exit 0
--- a/extra/sync-ggml.last
+++ b/extra/sync-ggml.last
@ -0,0 +1 @@
+3eace58911ea8d2cf35defdc59848d99b91a57f5
--- a/extra/sync-llama.sh
+++ b/extra/sync-llama.sh
@ -0,0 +1,5 @@
+#!/bin/bash
+
+cp -rpv ../llama.cpp/llama.h   ./examples/talk-llama/llama.h
+cp -rpv ../llama.cpp/llama.cpp ./examples/talk-llama/llama.cpp
+cp -rpv ../llama.cpp/unicode.h ./examples/talk-llama/unicode.h
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@ -72,7 +72,7 @@ static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * t

 // check if a tensor is allocated by this buffer
 static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) {
-    return tensor->buffer == alloc->buffer;
+    return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer);
 }

 static bool ggml_is_view(struct ggml_tensor * t) {
@ -449,11 +449,10 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd
    if (update_backend) {
        view->backend = view->view_src->backend;
    }
-    view->buffer  = view->view_src->buffer;
+    // views are initialized in the alloc buffer rather than the view_src buffer
+    view->buffer  = alloc->buffer;
    view->data    = (char *)view->view_src->data + view->view_offs;

-    // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
-    // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
    assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);

    if (!alloc->measure) {
@ -736,6 +735,10 @@ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
 }

 void ggml_allocr_free(ggml_allocr_t alloc) {
+    if (alloc == NULL) {
+        return;
+    }
+
    ggml_gallocr_free(alloc->galloc);
    ggml_tallocr_free(alloc->talloc);
    free(alloc);
@ -775,7 +778,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
    }

    if (nbytes == 0) {
-        fprintf(stderr, "%s: no tensors to allocate\n", __func__);
+        // all the tensors in the context are already allocated
        return NULL;
    }

@ -789,6 +792,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
            } else {
                ggml_backend_view_init(buffer, t);
            }
+        } else {
+            if (t->view_src != NULL) {
+                // view of a pre-allocated tensor
+                ggml_backend_view_init(buffer, t);
+            }
        }
    }

--- a/ggml-backend-impl.h
+++ b/ggml-backend-impl.h
@ -20,6 +20,9 @@ extern "C" {
        size_t                (*get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
        size_t                (*get_alloc_size)  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
        bool                  (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        // check if tensor data is in host memory
+        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
+        bool                  (*is_host)         (ggml_backend_buffer_type_t buft);
    };

    struct ggml_backend_buffer_type {
@ -31,15 +34,16 @@ extern "C" {
    typedef void * ggml_backend_buffer_context_t;

    struct ggml_backend_buffer_i {
-        void     (*free_buffer)(ggml_backend_buffer_t buffer);
+        void   (*free_buffer)    (ggml_backend_buffer_t buffer);
        //void     (*reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
-        void *   (*get_base)   (ggml_backend_buffer_t buffer);
-        void     (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void     (*set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void     (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+        void * (*get_base)       (ggml_backend_buffer_t buffer);
+        void   (*init_tensor)    (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        void   (*set_tensor)     (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void   (*get_tensor)     (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
        // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
-        void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void (*cpy_tensor_to)  (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void   (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void   (*cpy_tensor_to)  (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void   (*clear)          (ggml_backend_buffer_t buffer, uint8_t value);
    };

    struct ggml_backend_buffer {
@ -78,7 +82,7 @@ extern "C" {
        void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
        void (*cpy_tensor_to_async)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);

-        void (*synchronize)     (ggml_backend_t backend);
+        void (*synchronize)(ggml_backend_t backend);

        // compute graph with a plan
        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
@ -86,7 +90,7 @@ extern "C" {
        void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

        // compute graph without a plan
-        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

        // check if the backend supports an operation
        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
--- a/ggml-backend.c
+++ b/ggml-backend.c
@ -35,6 +35,13 @@ bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_ba
    return buft->iface.supports_backend(buft, backend);
 }

+bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
+    if (buft->iface.is_host) {
+        return buft->iface.is_host(buft);
+    }
+    return false;
+}
+
 // backend buffer

 ggml_backend_buffer_t ggml_backend_buffer_init(
@ -94,6 +101,14 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
    return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
 }

+void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    buffer->iface.clear(buffer, value);
+}
+
+bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
+    return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
+}
+
 ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
    return buffer->buft;
 }
@ -180,11 +195,14 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_
    ggml_backend_synchronize(backend);
 }

-void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    backend->iface.graph_compute(backend, cgraph);
+bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    if (!backend->iface.graph_compute(backend, cgraph)) {
+        return false;
+    }

    // TODO: optional sync
    ggml_backend_synchronize(backend);
+    return true;
 }

 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@ -282,7 +300,7 @@ static void ggml_backend_registry_init(void) {
 void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
    GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);

-    int id = ggml_backend_registry_count;
+    size_t id = ggml_backend_registry_count;

    ggml_backend_registry[id] = (struct ggml_backend_reg) {
        /* .name                = */ {0},
@ -315,6 +333,8 @@ size_t ggml_backend_reg_find_by_name(const char * name) {
            return i;
        }
    }
+
+    // not found
    return SIZE_MAX;
 }

@ -325,15 +345,15 @@ ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str)
    const char * params = strchr(backend_str, ':');
    char backend_name[128];
    if (params == NULL) {
-        strcpy(backend_name, backend_str);
+        snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
        params = "";
    } else {
-        strncpy(backend_name, backend_str, params - backend_str);
-        backend_name[params - backend_str] = '\0';
+        snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
        params++;
    }

    size_t backend_i = ggml_backend_reg_find_by_name(backend_name);
+
    if (backend_i == SIZE_MAX) {
        fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
        return NULL;
@ -378,22 +398,15 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {

 static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    free(buffer->context);
-    GGML_UNUSED(buffer);
 }

 static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
    memcpy((char *)tensor->data + offset, data, size);

    GGML_UNUSED(buffer);
 }

 static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
    memcpy(data, (const char *)tensor->data + offset, size);

    GGML_UNUSED(buffer);
@ -411,6 +424,10 @@ static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer,
    GGML_UNUSED(buffer);
 }

+static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    memset(buffer->context, value, buffer->size);
+}
+
 static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
    /* .free_buffer     = */ ggml_backend_cpu_buffer_free_buffer,
    /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
@ -419,6 +436,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
    /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
    /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
    /* .cpy_tensor_to   = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+    /* .clear           = */ ggml_backend_cpu_buffer_clear,
 };

 // for buffers from ptr, free is not called
@ -430,6 +448,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
    /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
    /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
    /* .cpy_tensor_to   = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+    /* .clear           = */ ggml_backend_cpu_buffer_clear,
 };

 static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
@ -455,20 +474,70 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_ty
    GGML_UNUSED(buft);
 }

+static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    GGML_UNUSED(buft);
+}
+
 ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
-    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cpu = {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
        /* .iface = */ {
            /* .alloc_buffer     = */ ggml_backend_cpu_buffer_type_alloc_buffer,
            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
        },
        /* .context = */ NULL,
    };

-    return &ggml_backend_buffer_type_cpu;
+    return &ggml_backend_cpu_buffer_type;
 }

+#ifdef GGML_USE_CPU_HBM
+
+// buffer type HBM
+
+#include <hbwmalloc.h>
+
+static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    hbw_free(buffer->context);
+}
+
+static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    //void * ptr = hbw_malloc(size);
+    void * ptr;
+    int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
+    if (result != 0) {
+        fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
+        return NULL;
+    }
+
+    // FIXME: this is a hack to avoid having to implement a new buffer type
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
+
+    return buffer;
+}
+
+ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
+        /* .iface    = */ {
+            /* .alloc_buffer     = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type_get_alignment,
+            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
+            /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type_is_host,
+        },
+        /* .context  = */ NULL,
+    };
+
+    return &ggml_backend_cpu_buffer_type_hbm;
+}
+#endif
+
 struct ggml_backend_cpu_context {
    int n_threads;
    void * work_data;
@ -505,7 +574,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
    struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));

    cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
-    cpu_plan->cgraph = *cgraph;
+    cpu_plan->cgraph = *cgraph; // FIXME: deep copy

    if (cpu_plan->cplan.work_size > 0) {
        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
@ -531,7 +600,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
    GGML_UNUSED(backend);
 }

-static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;

    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
@ -545,13 +614,18 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
    cplan.work_data = cpu_ctx->work_data;

    ggml_graph_compute(cgraph, &cplan);
+    return true;
 }

 static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    return true;
+    switch (op->op) {
+        case GGML_OP_MUL_MAT:
+            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+        default:
+            return true;
+    }

    GGML_UNUSED(backend);
-    GGML_UNUSED(op);
 }

 static struct ggml_backend_i cpu_backend_i = {
@ -1180,7 +1254,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
 // utils
 void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
    GGML_ASSERT(tensor->buffer == NULL);
-    GGML_ASSERT(tensor->data == NULL);
+    //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized
    GGML_ASSERT(tensor->view_src != NULL);
    GGML_ASSERT(tensor->view_src->buffer != NULL);
    GGML_ASSERT(tensor->view_src->data != NULL);
--- a/ggml-backend.h
+++ b/ggml-backend.h
@ -21,6 +21,7 @@ extern "C" {
    GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
    GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
    GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+    GGML_API bool ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);

    // buffer
    GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
@ -29,6 +30,8 @@ extern "C" {
    GGML_API void   ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
    GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
    GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void   ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
+    GGML_API bool   ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
    GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);

    //
@ -55,7 +58,7 @@ extern "C" {

    GGML_API void ggml_backend_graph_plan_free   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
    GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API void ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API bool ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
    GGML_API bool ggml_backend_supports_op       (ggml_backend_t backend, const struct ggml_tensor * op);

    // tensor copy between different backends
@ -76,6 +79,10 @@ extern "C" {

    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);

+#ifdef GGML_USE_CPU_HBM
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+#endif
+
    //
    // Backend registry
    //
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
--- a/ggml-metal.h
+++ b/ggml-metal.h
@ -87,7 +87,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);

 // same as ggml_graph_compute but uses Metal
 // creates gf->n_threads command buffers in parallel
-void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+bool ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);

 //
 // backend API
@ -98,7 +98,10 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);

 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);

+GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
+
 GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
+
 GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);

 // helper to check if the device supports a specific family
--- a/ggml-metal.m
+++ b/ggml-metal.m
@ -87,6 +87,7 @@ struct ggml_metal_context {
    GGML_METAL_DECL_KERNEL(get_rows_q4_K);
    GGML_METAL_DECL_KERNEL(get_rows_q5_K);
    GGML_METAL_DECL_KERNEL(get_rows_q6_K);
+    GGML_METAL_DECL_KERNEL(get_rows_i32);
    GGML_METAL_DECL_KERNEL(rms_norm);
    GGML_METAL_DECL_KERNEL(group_norm);
    GGML_METAL_DECL_KERNEL(norm);
@ -180,7 +181,15 @@ struct ggml_metal_context {
@implementation GGMLMetalClass
@end

-ggml_log_callback ggml_metal_log_callback = NULL;
+
+static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
+    fprintf(stderr, "%s", msg);
+
+    UNUSED(level);
+    UNUSED(user_data);
+}
+
+ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback;
 void * ggml_metal_log_user_data = NULL;

 void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
@ -251,6 +260,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        NSError * error = nil;
        NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
        if (libPath != nil) {
+            // pre-compiled library found
            NSURL * libURL = [NSURL fileURLWithPath:libPath];
            GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
            ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
@ -283,6 +293,13 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
            options = [MTLCompileOptions new];
            options.preprocessorMacros = @{ @"QK_K" : @(64) };
 #endif
+            // try to disable fast-math
+            // NOTE: this seems to have no effect whatsoever
+            //       instead, in order to disable fast-math, we have to build default.metallib from the command line
+            //       using xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
+            //       and go through the "pre-compiled library found" path above
+            //[options setFastMathEnabled:false];
+
            ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
        }

@ -361,6 +378,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(get_rows_q4_K);
        GGML_METAL_ADD_KERNEL(get_rows_q5_K);
        GGML_METAL_ADD_KERNEL(get_rows_q6_K);
+        GGML_METAL_ADD_KERNEL(get_rows_i32);
        GGML_METAL_ADD_KERNEL(rms_norm);
        GGML_METAL_ADD_KERNEL(group_norm);
        GGML_METAL_ADD_KERNEL(norm);
@ -483,6 +501,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
    GGML_METAL_DEL_KERNEL(get_rows_q4_K);
    GGML_METAL_DEL_KERNEL(get_rows_q5_K);
    GGML_METAL_DEL_KERNEL(get_rows_q6_K);
+    GGML_METAL_DEL_KERNEL(get_rows_i32);
    GGML_METAL_DEL_KERNEL(rms_norm);
    GGML_METAL_DEL_KERNEL(group_norm);
    GGML_METAL_DEL_KERNEL(norm);
@ -607,12 +626,24 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
 }

 // temporarily defined here for compatibility between ggml-backend and the old API
-struct ggml_backend_metal_buffer_context {
-    void * data;
+
+struct ggml_backend_metal_buffer {
+    void   * data;
+    size_t   size;

    id<MTLBuffer> metal;
 };

+struct ggml_backend_metal_buffer_context {
+    void * all_data;
+    size_t all_size;
+    bool owned;
+
+    // multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
+    int n_buffers;
+    struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
+};
+
 // finds the Metal buffer that contains the tensor data on the GPU device
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
@ -622,17 +653,29 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru

    const int64_t tsize = ggml_nbytes(t);

+    ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
+
    // compatibility with ggml-backend
-    if (t->buffer && t->buffer->buft == ggml_backend_metal_buffer_type()) {
-        struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) t->buffer->context;
+    if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) {
+        struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;

-        const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->data;
+        // find the view that contains the tensor fully
+        for (int i = 0; i < buf_ctx->n_buffers; ++i) {
+            const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;

-        GGML_ASSERT(ioffs >= 0 && ioffs + tsize <= (int64_t) t->buffer->size);
+            //GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
+            if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
+                *offs = (size_t) ioffs;

-        *offs = (size_t) ioffs;
+                //GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);

-        return buf_ctx->metal;
+                return buf_ctx->buffers[i].metal;
+            }
+        }
+
+        GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
+
+        return nil;
    }

    // find the view that contains the tensor fully
@ -934,7 +977,7 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) {
            return false;
    }
 }
-void ggml_metal_graph_compute(
+bool ggml_metal_graph_compute(
        struct ggml_metal_context * ctx,
               struct ggml_cgraph * gf) {
    @autoreleasepool {
@ -1198,7 +1241,7 @@ void ggml_metal_graph_compute(
                                // not sure how to avoid this
                                // TODO: make a simpler cpy_bytes kernel

-                                const int nth = MIN(1024, ne00);
+                                const int nth = MIN((int) ctx->pipeline_cpy_f32_f32.maxTotalThreadsPerThreadgroup, ne00);

                                [encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32];
                                [encoder setBuffer:id_src0 offset:offs_src0        atIndex:0];
@ -1253,7 +1296,7 @@ void ggml_metal_graph_compute(
                            [encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
                            [encoder setBytes:&offs length:sizeof(offs) atIndex:27];

-                            const int nth = MIN(1024, ne0);
+                            const int nth = MIN((int) ctx->pipeline_add.maxTotalThreadsPerThreadgroup, ne00);

                            [encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;
@ -1261,7 +1304,7 @@ void ggml_metal_graph_compute(
                        {
                            GGML_ASSERT(ggml_is_contiguous(src0));

-                            const float scale = *(const float *) src1->data;
+                            const float scale = *(const float *) dst->op_params;

                            int64_t n = ggml_nelements(dst);

@ -1272,8 +1315,8 @@ void ggml_metal_graph_compute(
                                [encoder setComputePipelineState:ctx->pipeline_scale];
                            }

-                            [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                            [encoder setBuffer:id_dst  offset:offs_dst  atIndex:1];
+                            [encoder setBuffer:id_src0   offset:offs_src0 atIndex:0];
+                            [encoder setBuffer:id_dst    offset:offs_dst  atIndex:1];
                            [encoder setBytes:&scale length:sizeof(scale) atIndex:2];

                            [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
@ -1617,6 +1660,10 @@ void ggml_metal_graph_compute(
                                        }
                                };

+                                if (ggml_is_quantized(src0t)) {
+                                    GGML_ASSERT(ne00 >= nth0*nth1);
+                                }
+
                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
@ -1675,6 +1722,9 @@ void ggml_metal_graph_compute(
                            // TODO: make this more general
                            GGML_ASSERT(n_as <= 8);

+                            // max size of the src1ids array in the kernel stack
+                            GGML_ASSERT(ne11 <= 512);
+
                            struct ggml_tensor * src2 = gf->nodes[i]->src[2];

                            const int64_t  ne20 = src2 ? src2->ne[0] : 0;
@ -1692,9 +1742,6 @@ void ggml_metal_graph_compute(
                            GGML_ASSERT(!ggml_is_transposed(src2));
                            GGML_ASSERT(!ggml_is_transposed(src1));

-                            GGML_ASSERT(ne20 % 32 == 0);
-                            // !!!!!!!!! TODO: this assert is probably required but not sure!
-                            //GGML_ASSERT(ne20 >= 64);
                            GGML_ASSERT(src1t == GGML_TYPE_F32);

                            const uint r2 = ne12/ne22;
@ -1702,22 +1749,22 @@ void ggml_metal_graph_compute(

                            // find the break-even point where the matrix-matrix kernel becomes more efficient compared
                            // to the matrix-vector kernel
-                            int ne11_mm_min = 1;
+                            int ne11_mm_min = n_as;

                            const int idx = ((int32_t *) dst->op_params)[0];

                            // batch size
                            GGML_ASSERT(ne01 == ne11);

-                            const int64_t _ne1 = 1; // kernel_mul_mm_impl needs a reference in constant memory
-
                            // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
                            // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
                            // !!!
                            // TODO: for now, always use mat-vec kernels until we figure out how to improve the
                            //       indirect matrix multiplication
                            // !!!
-                            if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && _ne1 > ne11_mm_min) {
+                            if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
+                                ne20 % 32 == 0 && ne20 >= 64 &&
+                                ne11 > ne11_mm_min) {
                                switch (src2->type) {
                                    case GGML_TYPE_F32:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f32_f32];  break;
                                    case GGML_TYPE_F16:  [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f16_f32];  break;
@ -1747,14 +1794,15 @@ void ggml_metal_graph_compute(
                                [encoder setBytes:&nb11    length:sizeof(nb11) atIndex:11];
                                [encoder setBytes:&nb12    length:sizeof(nb12) atIndex:12];
                                [encoder setBytes:&ne0     length:sizeof(ne0)  atIndex:13];
-                                [encoder setBytes:&_ne1    length:sizeof(_ne1) atIndex:14];
+                                [encoder setBytes:&ne1     length:sizeof(ne1)  atIndex:14];
                                [encoder setBytes:&nb1     length:sizeof(nb1)  atIndex:15];
                                [encoder setBytes:&r2      length:sizeof(r2)   atIndex:16];
                                [encoder setBytes:&r3      length:sizeof(r3)   atIndex:17];
                                [encoder setBytes:&idx     length:sizeof(idx)  atIndex:18];
                                // TODO: how to make this an array? read Metal docs
-                                for (int j = 0; j < n_as; ++j) {
-                                    struct ggml_tensor * src_cur = dst->src[2 + j];
+                                for (int j = 0; j < 8; ++j) {
+                                    // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
+                                    struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];

                                    size_t offs_src_cur = 0;
                                    id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
@ -1764,8 +1812,7 @@ void ggml_metal_graph_compute(

                                [encoder setThreadgroupMemoryLength:8192 atIndex:0];

-                                // TODO: processing one row at a time (ne11 -> 1) is not efficient
-                                [encoder dispatchThreadgroups:MTLSizeMake( (_ne1 + 31)/32, (ne21 + 63)/64, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
+                                [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
                            } else {
                                int nth0 = 32;
                                int nth1 = 1;
@ -1848,11 +1895,17 @@ void ggml_metal_graph_compute(
                                        } break;
                                    default:
                                        {
-                                            GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
+                                            GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
                                            GGML_ASSERT(false && "not implemented");
                                        }
                                };

+                                if (ggml_is_quantized(src2t)) {
+                                    GGML_ASSERT(ne20 >= nth0*nth1);
+                                }
+
+                                const int64_t _ne1 = 1; // kernels needs a reference in constant memory
+
                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
                                [encoder setBuffer:id_dst  offset:offs_dst  atIndex:2];
@ -1877,8 +1930,9 @@ void ggml_metal_graph_compute(
                                [encoder setBytes:&r3   length:sizeof(r3)   atIndex:21];
                                [encoder setBytes:&idx  length:sizeof(idx)  atIndex:22];
                                // TODO: how to make this an array? read Metal docs
-                                for (int j = 0; j < n_as; ++j) {
-                                    struct ggml_tensor * src_cur = dst->src[2 + j];
+                                for (int j = 0; j < 8; ++j) {
+                                    // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
+                                    struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];

                                    size_t offs_src_cur = 0;
                                    id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
@ -1927,6 +1981,7 @@ void ggml_metal_graph_compute(
                                case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
                                case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
                                case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
+                                case GGML_TYPE_I32:  [encoder setComputePipelineState:ctx->pipeline_get_rows_i32];  break;
                                default: GGML_ASSERT(false && "not implemented");
                            }

@ -2197,7 +2252,7 @@ void ggml_metal_graph_compute(
                            [encoder setBytes:&nb3  length:sizeof(nb3)  atIndex:17];
                            [encoder setBytes:&sf   length:sizeof(sf)   atIndex:18];

-                            const int nth = MIN(1024, ne0);
+                            const int nth = MIN((int) ctx->pipeline_upscale_f32.maxTotalThreadsPerThreadgroup, ne0);

                            [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                        } break;
@ -2350,10 +2405,11 @@ void ggml_metal_graph_compute(
        MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
        if (status != MTLCommandBufferStatusCompleted) {
            GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
-            GGML_ASSERT(false);
+            return false;
        }
    }

+    return true;
    }
 }

@ -2361,6 +2417,7 @@ void ggml_metal_graph_compute(

 // backend interface

+// default buffer
 static id<MTLDevice> g_backend_device = nil;
 static int g_backend_device_ref_count = 0;

@ -2388,34 +2445,31 @@ static void ggml_backend_metal_free_device(void) {
 static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;

-    return ctx->data;
+    return ctx->all_data;
 }

 static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;

-    [ctx->metal release];
+    for (int i = 0; i < ctx->n_buffers; i++) {
+        [ctx->buffers[i].metal release];
+    }
    ggml_backend_metal_free_device();

-    free(ctx->data);
-    free(ctx);
+    if (ctx->owned) {
+        free(ctx->all_data);
+    }

-    UNUSED(buffer);
+    free(ctx);
 }

 static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
    memcpy((char *)tensor->data + offset, data, size);

    UNUSED(buffer);
 }

 static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-
    memcpy(data, (const char *)tensor->data + offset, size);

    UNUSED(buffer);
@ -2433,7 +2487,13 @@ static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer
    UNUSED(buffer);
 }

-static struct ggml_backend_buffer_i metal_backend_buffer_i = {
+static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+    struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
+
+    memset(ctx->all_data, value, ctx->all_size);
+}
+
+static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
    /* .free_buffer     = */ ggml_backend_metal_buffer_free_buffer,
    /* .get_base        = */ ggml_backend_metal_buffer_get_base,
    /* .init_tensor     = */ NULL,
@ -2441,8 +2501,11 @@ static struct ggml_backend_buffer_i metal_backend_buffer_i = {
    /* .get_tensor      = */ ggml_backend_metal_buffer_get_tensor,
    /* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from,
    /* .cpy_tensor_to   = */ ggml_backend_metal_buffer_cpy_tensor_to,
+    /* .clear           = */ ggml_backend_metal_buffer_clear,
 };

+// default buffer type
+
 static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));

@ -2453,13 +2516,46 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
        size_aligned += (size_page - (size_aligned % size_page));
    }

-    ctx->data  = ggml_metal_host_malloc(size);
-    ctx->metal = [ggml_backend_metal_get_device() newBufferWithBytesNoCopy:ctx->data
+    id<MTLDevice> device = ggml_backend_metal_get_device();
+
+    ctx->all_data = ggml_metal_host_malloc(size_aligned);
+    ctx->all_size = size_aligned;
+    ctx->owned = true;
+    ctx->n_buffers = 1;
+
+    ctx->buffers[0].data = ctx->all_data;
+    ctx->buffers[0].size = size;
+    ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
                    length:size_aligned
                    options:MTLResourceStorageModeShared
                    deallocator:nil];

-    return ggml_backend_buffer_init(buft, metal_backend_buffer_i, ctx, size);
+    if (ctx->buffers[0].metal == nil) {
+        GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+        free(ctx);
+        ggml_backend_metal_free_device();
+        return NULL;
+    }
+
+    GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
+
+
+#if TARGET_OS_OSX
+    GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
+            device.currentAllocatedSize / 1024.0 / 1024.0,
+            device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+    if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
+        GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
+    } else {
+        GGML_METAL_LOG_INFO("\n");
+    }
+#else
+    GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
+#endif
+
+
+    return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
 }

 static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
@ -2470,7 +2566,13 @@ static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_t
 static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
    return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);

-    GGML_UNUSED(buft);
+    UNUSED(buft);
+}
+
+static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+    return true;
+
+    UNUSED(buft);
 }

 ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
@ -2480,6 +2582,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
            /* .get_alignment    = */ ggml_backend_metal_buffer_type_get_alignment,
            /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
            /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
+            /* .is_host          = */ ggml_backend_metal_buffer_type_is_host,
        },
        /* .context = */ NULL,
    };
@ -2487,6 +2590,87 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
    return &ggml_backend_buffer_type_metal;
 }

+// buffer from ptr
+
+ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
+    struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
+
+    ctx->all_data = data;
+    ctx->all_size = size;
+    ctx->owned = false;
+    ctx->n_buffers = 0;
+
+    const size_t size_page = sysconf(_SC_PAGESIZE);
+    size_t size_aligned = size;
+    if ((size_aligned % size_page) != 0) {
+        size_aligned += (size_page - (size_aligned % size_page));
+    }
+
+    id<MTLDevice> device = ggml_backend_metal_get_device();
+
+    // the buffer fits into the max buffer size allowed by the device
+    if (size_aligned <= device.maxBufferLength) {
+        ctx->buffers[ctx->n_buffers].data = data;
+        ctx->buffers[ctx->n_buffers].size = size;
+
+        ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+        if (ctx->buffers[ctx->n_buffers].metal == nil) {
+            GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+            return false;
+        }
+
+        GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
+
+        ++ctx->n_buffers;
+    } else {
+        // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
+        // one of the views
+        const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
+        const size_t size_step = device.maxBufferLength - size_ovlp;
+        const size_t size_view = device.maxBufferLength;
+
+        for (size_t i = 0; i < size; i += size_step) {
+            const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
+
+            ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
+            ctx->buffers[ctx->n_buffers].size = size_step_aligned;
+
+            ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+            if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
+                return false;
+            }
+
+            GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, offs = %12ld", __func__, size_step_aligned / 1024.0 / 1024.0, i);
+            if (i + size_step < size) {
+                GGML_METAL_LOG_INFO("\n");
+            }
+
+            ++ctx->n_buffers;
+        }
+    }
+
+#if TARGET_OS_OSX
+    GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
+            device.currentAllocatedSize / 1024.0 / 1024.0,
+            device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+    if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
+        GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
+    } else {
+        GGML_METAL_LOG_INFO("\n");
+    }
+#else
+    GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
+#endif
+
+    return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
+}
+
+// backend
+
 static const char * ggml_backend_metal_name(ggml_backend_t backend) {
    return "Metal";

@ -2499,20 +2683,16 @@ static void ggml_backend_metal_free(ggml_backend_t backend) {
    free(backend);
 }

-static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
-    UNUSED(backend);
-}
-
 static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
    return ggml_backend_metal_buffer_type();

    UNUSED(backend);
 }

-static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;

-    ggml_metal_graph_compute(metal_ctx, cgraph);
+    return ggml_metal_graph_compute(metal_ctx, cgraph);
 }

 static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@ -2529,25 +2709,15 @@ static struct ggml_backend_i metal_backend_i = {
    /* .get_tensor_async        = */ NULL,
    /* .cpy_tensor_from_async   = */ NULL,
    /* .cpy_tensor_to_async     = */ NULL,
-    /* .synchronize             = */ ggml_backend_metal_synchronize,
-    /* .graph_plan_create       = */ NULL, // the metal implementation does not require creating graph plans atm
+    /* .synchronize             = */ NULL,
+    /* .graph_plan_create       = */ NULL,
    /* .graph_plan_free         = */ NULL,
    /* .graph_plan_compute      = */ NULL,
    /* .graph_compute           = */ ggml_backend_metal_graph_compute,
    /* .supports_op             = */ ggml_backend_metal_supports_op,
 };

-// TODO: make a common log callback for all backends in ggml-backend
-static void ggml_backend_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
-    fprintf(stderr, "%s", msg);
-
-    UNUSED(level);
-    UNUSED(user_data);
-}
-
 ggml_backend_t ggml_backend_metal_init(void) {
-    ggml_metal_log_set_callback(ggml_backend_log_callback, NULL);
-
    struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);

    if (ctx == NULL) {
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
--- a/ggml-opencl.h
+++ b/ggml-opencl.h
@ -6,19 +6,19 @@
 extern "C" {
 #endif

-void ggml_cl_init(void);
+GGML_API void ggml_cl_init(void);

-void   ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-bool   ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-void   ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
+GGML_API void   ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API bool   ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API void   ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

-void * ggml_cl_host_malloc(size_t size);
-void   ggml_cl_host_free(void * ptr);
+GGML_API void * ggml_cl_host_malloc(size_t size);
+GGML_API void   ggml_cl_host_free(void * ptr);

-void ggml_cl_free_data(const struct ggml_tensor* tensor);
+GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);

-void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
+GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);

 #ifdef  __cplusplus
 }
--- a/ggml-quants.c
+++ b/ggml-quants.c
@ -407,6 +407,22 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
 #define ggml_vld1q_s8_x4  vld1q_s8_x4

 #endif
+
+#if !defined(__ARM_FEATURE_DOTPROD)
+
+inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
+    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
+    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
+
+    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
+}
+
+#else
+
+#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
+
+#endif
+
 #endif

 #if defined(__ARM_NEON) || defined(__wasm_simd128__)
@ -2468,32 +2484,12 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
        const int8x16_t v1_1l = vld1q_s8(y1->qs);
        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);

-#if defined(__ARM_FEATURE_DOTPROD)
        // dot product into int32x4_t
-        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
-        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
+        const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
+        const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);

        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
-        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-#endif
    }

    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@ -2776,32 +2772,12 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
        const int8x16_t v1_1l = vld1q_s8(y1->qs);
        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);

-#if defined(__ARM_FEATURE_DOTPROD)
        // dot product into int32x4_t
-        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
-        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
+        const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
+        const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);

        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0h));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1l));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1l));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1h));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
-        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
-#endif
    }

    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
@ -2963,32 +2939,12 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
        const int8x16_t v1_1l = vld1q_s8(y1->qs);
        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);

-#if defined(__ARM_FEATURE_DOTPROD)
        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-                        vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-                        vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
-        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-#endif
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
    }

    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@ -3275,32 +3231,12 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
        const int8x16_t v1_1l = vld1q_s8(y1->qs);
        const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);

-#if defined(__ARM_FEATURE_DOTPROD)
        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-                        vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-                        vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
-        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
-#endif
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
    }

    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
@ -3550,34 +3486,13 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
        const int8x16_t y1_0 = vld1q_s8(y1->qs);
        const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);

-#if defined(__ARM_FEATURE_DOTPROD)
        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
-                        vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                        ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
+                        ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));

        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
-                        vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-
-#else
-        const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0));
-        const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0));
-        const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1));
-        const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1));
-
-        const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0));
-        const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0));
-        const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1));
-        const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1));
-
-        const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1));
-        const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3));
-        const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
-        const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-#endif
+                        ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
+                        ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
    }

    *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@ -3650,12 +3565,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
    const int nb = n / QK_K;

 #ifdef __ARM_NEON
-
    const uint8x16_t m3 = vdupq_n_u8(0x3);
    const uint8x16_t m4 = vdupq_n_u8(0xF);
-#if defined(__ARM_FEATURE_DOTPROD)
-    const int32x4_t  vzero = vdupq_n_s32(0);
-#endif
+
+    const int32x4_t vzero = vdupq_n_s32(0);

    ggml_int8x16x2_t q2bytes;
    uint8_t aux[16];
@ -3663,7 +3576,6 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
    float sum = 0;

    for (int i = 0; i < nb; ++i) {
-
        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

@ -3677,7 +3589,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri

        const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4);
        const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
-        const ggml_int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))};
+        const ggml_int16x8x2_t mins16 = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}};
        const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])),
                                       vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0])));
        const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])),
@ -3689,20 +3601,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri

 // We use this macro instead of a function call because for some reason
 // the code runs 2-3% slower, even if the function is declared inline
-#if defined(__ARM_FEATURE_DOTPROD)
 #define MULTIPLY_ACCUM_WITH_SCALE(index)\
-        isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
-        isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
-#else
-#define MULTIPLY_ACCUM_WITH_SCALE(index)\
-        {\
-    const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[0]), vget_low_s8 (q8bytes.val[0])),\
-                                   vmull_s8(vget_high_s8(q2bytes.val[0]), vget_high_s8(q8bytes.val[0])));\
-    const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[1]), vget_low_s8 (q8bytes.val[1])),\
-                                   vmull_s8(vget_high_s8(q2bytes.val[1]), vget_high_s8(q8bytes.val[1])));\
-    isum += vaddvq_s16(p1) * aux[is+(index)] + vaddvq_s16(p2) * aux[is+1+(index)];\
-        }
-#endif
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];

 #define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
        q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
@ -3710,26 +3611,23 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
        q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\
        MULTIPLY_ACCUM_WITH_SCALE((index));

-
        for (int j = 0; j < QK_K/128; ++j) {
-
            const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32;

            ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
            q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3));
            q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3));
+
            MULTIPLY_ACCUM_WITH_SCALE(0);

            SHIFT_MULTIPLY_ACCUM_WITH_SCALE(2, 2);
-
            SHIFT_MULTIPLY_ACCUM_WITH_SCALE(4, 4);
-
            SHIFT_MULTIPLY_ACCUM_WITH_SCALE(6, 6);

            is += 8;
        }
-        sum += d * isum;

+        sum += d * isum;
    }

    *s = sum;
@ -4043,11 +3941,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
    const int nb = n / QK_K;

 #ifdef __ARM_NEON
-
    const uint8x16_t m3 = vdupq_n_u8(0x3);
-#if defined(__ARM_FEATURE_DOTPROD)
-    const int32x4_t  vzero = vdupq_n_s32(0);
-#endif
+
+    const int32x4_t vzero = vdupq_n_s32(0);

    ggml_int8x16x4_t q2bytes;

@ -4081,28 +3977,12 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
        q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3));
        q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3));

-#if defined(__ARM_FEATURE_DOTPROD)
-        isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
-        isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
-        isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
-        isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];
-#else
-        const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
-                                       vmull_s8(vget_high_s8(q2bytes.val[0]), vget_high_s8(q8bytes.val[0])));
-        const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
-                                       vmull_s8(vget_high_s8(q2bytes.val[1]), vget_high_s8(q8bytes.val[1])));
-        isum1 += vaddvq_s16(p1) * scales[0];
-        isum2 += vaddvq_s16(p2) * scales[1];
+        isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
+        isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
+        isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
+        isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];

-        const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
-                                       vmull_s8(vget_high_s8(q2bytes.val[2]), vget_high_s8(q8bytes.val[2])));
-        const int16x8_t p4 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
-                                       vmull_s8(vget_high_s8(q2bytes.val[3]), vget_high_s8(q8bytes.val[3])));
-        isum1 += vaddvq_s16(p3) * scales[2];
-        isum2 += vaddvq_s16(p4) * scales[3];
-#endif
        sum += d * (isum1 + isum2);
-
    }

    *s = sum;
@ -4328,9 +4208,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
    uint32_t utmp[4];

    const uint8x16_t m3b = vdupq_n_u8(0x3);
-#ifdef __ARM_FEATURE_DOTPROD
    const int32x4_t  vzero = vdupq_n_s32(0);
-#endif

    const uint8x16_t m0 = vdupq_n_u8(1);
    const uint8x16_t m1 = vshlq_n_u8(m0, 1);
@ -4382,22 +4260,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
            q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
            q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3]));

-#if defined(__ARM_FEATURE_DOTPROD)
-            isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
-            isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
-            isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
-            isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
-#else
-            int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes_1.val[0])),
-                                     vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes_1.val[0])));
-            int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes_1.val[1])),
-                                     vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes_1.val[1])));
-            int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes_1.val[2])),
-                                     vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes_1.val[2])));
-            int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes_1.val[3])),
-                                     vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes_1.val[3])));
-            isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1] + vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3];
-#endif
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
+
            scale += 4;

            q3h.val[0] = vbicq_u8(m2, qhbits.val[0]);
@ -4410,22 +4277,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
            q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
            q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3]));

-#if defined(__ARM_FEATURE_DOTPROD)
-            isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
-            isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
-            isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
-            isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
-#else
-            p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes_2.val[0])),
-                           vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes_2.val[0])));
-            p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes_2.val[1])),
-                           vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes_2.val[1])));
-            p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes_2.val[2])),
-                           vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes_2.val[2])));
-            p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes_2.val[3])),
-                           vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes_2.val[3])));
-            isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1] + vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3];
-#endif
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
+
            scale += 4;

            if (j == 0) {
@ -4864,10 +4720,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
    const int nb = n / QK_K;

 #ifdef __ARM_NEON
-
-#ifdef __ARM_FEATURE_DOTPROD
-    const int32x4_t  vzero = vdupq_n_s32(0);
-#endif
+    const int32x4_t vzero = vdupq_n_s32(0);

    const uint8x16_t m3b = vdupq_n_u8(0x3);
    const uint8x16_t mh  = vdupq_n_u8(4);
@ -4908,22 +4761,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
        q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2]));
        q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6),                q3h.val[3]));

-#if defined(__ARM_FEATURE_DOTPROD)
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];
-#else
-        const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
-                                       vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes.val[0])));
-        const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
-                                       vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes.val[1])));
-        const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
-                                       vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes.val[2])));
-        const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
-                                       vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes.val[3])));
-        isum += vaddvq_s16(p0) * scales[0] + vaddvq_s16(p1) * scales[2] + vaddvq_s16(p2) * scales[1] + vaddvq_s16(p3) * scales[3];
-#endif
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];

        sum += d * isum;

@ -5228,11 +5069,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
    uint32_t utmp[4];

 #ifdef __ARM_NEON
-
    const uint8x16_t m4b = vdupq_n_u8(0xf);
-#ifdef __ARM_FEATURE_DOTPROD
    const int32x4_t mzero = vdupq_n_s32(0);
-#endif

    ggml_int8x16x2_t q4bytes;
    ggml_int8x16x2_t q8bytes;
@ -5269,44 +5107,22 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
        int32_t sumi2 = 0;

        for (int j = 0; j < QK_K/64; ++j) {
-
            const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;

-#ifdef __ARM_FEATURE_DOTPROD
            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
            q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[0], m4b));
            q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[1], m4b));

-            const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
+            const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
            sumi1 += vaddvq_s32(p1) * scales[2*j+0];

            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
            q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
            q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));

-            const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
+            const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);

            sumi2 += vaddvq_s32(p2) * scales[2*j+1];
-#else
-            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
-            q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[0], m4b));
-            q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[1], m4b));
-            const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
-                                           vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0])));
-            const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
-                                           vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
-            sumi1 += vaddvq_s16(vaddq_s16(p0, p1)) * scales[2*j+0];
-
-            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
-            q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
-            q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
-            const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
-                                           vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0])));
-            const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
-                                           vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
-            sumi2 += vaddvq_s16(vaddq_s16(p2, p3)) * scales[2*j+1];
-
-#endif
        }

        sumf += d * (sumi1 + sumi2);
@ -5603,12 +5419,9 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
    const int nb = n / QK_K;

 #ifdef __ARM_NEON
-
    const uint8x16_t m4b = vdupq_n_u8(0xf);

-#ifdef __ARM_FEATURE_DOTPROD
    const int32x4_t mzero = vdupq_n_s32(0);
-#endif

    float sumf = 0;

@ -5636,41 +5449,20 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri

        const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);

-#ifdef __ARM_FEATURE_DOTPROD
        q8bytes = ggml_vld1q_s8_x4(q8);
        q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[0], m4b));
        q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[1], m4b));

-        const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
+        const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
        const int32_t sumi1 = vaddvq_s32(p1) * scales[0];

        q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
        q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));

-        const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
+        const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
        const int32_t sumi2 = vaddvq_s32(p2) * scales[1];

-#else
-        q8bytes = ggml_vld1q_s8_x4(q8);
-        q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[0], m4b));
-        q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8  (q4bits.val[1], m4b));
-        const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
-                                       vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0])));
-        const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
-                                       vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
-        int32_t sumi1 = vaddvq_s16(vaddq_s16(p0, p1)) * scales[0];
-
-        q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
-        q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
-        const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[2])),
-                                       vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[2])));
-        const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[3])),
-                                       vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[3])));
-        int32_t sumi2 = vaddvq_s16(vaddq_s16(p2, p3)) * scales[1];
-
-#endif
        sumf += d * (sumi1 + sumi2);
-
    }

    *s = sumf - sum_mins;
@ -5875,15 +5667,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri

    uint32_t utmp[4];

-
 #ifdef __ARM_NEON
-
    const uint8x16_t m4b = vdupq_n_u8(0xf);
    const uint8x16_t mone = vdupq_n_u8(1);
    const uint8x16_t mtwo = vdupq_n_u8(2);
-#if defined(__ARM_FEATURE_DOTPROD)
    const int32x4_t mzero = vdupq_n_s32(0);
-#endif

    ggml_int8x16x4_t q5bytes;

@ -5938,28 +5726,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
            q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2]));
            q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3]));

-#if defined(__ARM_FEATURE_DOTPROD)
-
-            sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
-            sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
-#else
-
-            const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
-                                           vmull_s8(vget_high_s8(q5bytes.val[0]), vget_high_s8(q8bytes.val[0])));
-            const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
-                                           vmull_s8(vget_high_s8(q5bytes.val[1]), vget_high_s8(q8bytes.val[1])));
-            sumi += vaddvq_s16(vaddq_s16(p0, p1)) * *scales++;
-
-            const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
-                                           vmull_s8(vget_high_s8(q5bytes.val[2]), vget_high_s8(q8bytes.val[2])));
-            const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
-                                           vmull_s8(vget_high_s8(q5bytes.val[3]), vget_high_s8(q8bytes.val[3])));
-            sumi += vaddvq_s16(vaddq_s16(p2, p3)) * *scales++;
-#endif
+            sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
+            sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
        }

        sumf += d * sumi - dmin * sumi_mins;
-
    }

    *s = sumf;
@ -6311,12 +6082,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
    const int nb = n / QK_K;

 #ifdef __ARM_NEON
-
    const uint8x16_t m4b = vdupq_n_u8(0xf);
    const uint8x16_t mh = vdupq_n_u8(16);
-#if defined(__ARM_FEATURE_DOTPROD)
    const int32x4_t mzero = vdupq_n_s32(0);
-#endif

    ggml_int8x16x4_t q5bytes;
    ggml_uint8x16x4_t q5h;
@ -6348,32 +6116,12 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
        q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2]));
        q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3]));

-#if defined(__ARM_FEATURE_DOTPROD)
-
-        int32_t sumi1 = sc[0] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
-        int32_t sumi2 = sc[1] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
-        int32_t sumi3 = sc[2] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
-        int32_t sumi4 = sc[3] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));
+        int32_t sumi1 = sc[0] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
+        int32_t sumi2 = sc[1] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
+        int32_t sumi3 = sc[2] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
+        int32_t sumi4 = sc[3] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));

        sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
-
-#else
-
-        const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
-                                       vmull_s8(vget_high_s8(q5bytes.val[0]), vget_high_s8(q8bytes.val[0])));
-        const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
-                                       vmull_s8(vget_high_s8(q5bytes.val[1]), vget_high_s8(q8bytes.val[1])));
-        int32_t sumi = sc[0] * vaddvq_s16(p0) + sc[1] * vaddvq_s16(p1);
-
-        const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
-                                       vmull_s8(vget_high_s8(q5bytes.val[2]), vget_high_s8(q8bytes.val[2])));
-        const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
-                                       vmull_s8(vget_high_s8(q5bytes.val[3]), vget_high_s8(q8bytes.val[3])));
-        sumi += sc[2] * vaddvq_s16(p2) + sc[3] * vaddvq_s16(p3);
-
-        sumf += d*sumi;
-#endif
-
    }

    *s = sumf;
@ -6600,13 +6348,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
    const int nb = n / QK_K;

 #ifdef __ARM_NEON
-
    float sum = 0;

    const uint8x16_t m4b = vdupq_n_u8(0xF);
-#if defined(__ARM_FEATURE_DOTPROD)
    const int32x4_t  vzero = vdupq_n_s32(0);
-#endif
    //const int8x16_t  m32s = vdupq_n_s8(32);

    const uint8x16_t mone = vdupq_n_u8(3);
@ -6626,7 +6371,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri

        const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
        const int8x16_t scales = vld1q_s8(scale);
-        const ggml_int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))};
+        const ggml_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}};

        const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])),
                                                   vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
@ -6658,31 +6403,13 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
            q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2]));
            q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3]));

-#if defined(__ARM_FEATURE_DOTPROD)
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
+                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
+                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
+                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];

-            isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
-                    vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
-                    vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
-                    vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
            scale += 4;

-#else
-
-            int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
-                                     vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0])));
-            int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
-                                     vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1])));
-            isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1];
-            scale += 2;
-
-            int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
-                                     vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2])));
-            int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
-                                     vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3])));
-            isum += vaddvq_s16(p2) * scale[0] + vaddvq_s16(p3) * scale[1];
-            scale += 2;
-#endif
-
            q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;

            shifted = vshrq_n_u8(qhbits.val[0], 4);
@ -6703,34 +6430,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
            q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2]));
            q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3]));

-#if defined(__ARM_FEATURE_DOTPROD)
-
-            isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
-                    vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
-                    vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
-                    vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
+            isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
+                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
+                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
+                    vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
            scale += 4;
-
-            //for (int l = 0; l < 4; ++l) {
-            //    const int32x4_t p = vdotq_s32(vzero, q6bytes.val[l], q8bytes.val[l]);
-            //    isum += vaddvq_s32(p) * *scale++;
-            //}
-#else
-            p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
-                                    vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0])));
-            p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
-                                    vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1])));
-            isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1];
-            scale += 2;
-
-            p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
-                                    vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2])));
-            p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
-                                    vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3])));
-            isum += vaddvq_s16(p2) * scale[0] + vaddvq_s16(p3) * scale[1];
-            scale += 2;
-#endif
-
        }
        //sum += isum * d_all * y[i].d;
        sum += d_all * y[i].d * (isum - 32 * isum_mins);
@ -7076,14 +6780,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
    const int nb = n / QK_K;

 #ifdef __ARM_NEON
-
    float sum = 0;

    const uint8x16_t m4b = vdupq_n_u8(0xF);
    const int8x16_t  m32s = vdupq_n_s8(32);
-#if defined(__ARM_FEATURE_DOTPROD)
    const int32x4_t  vzero = vdupq_n_s32(0);
-#endif

    const uint8x16_t mone = vdupq_n_u8(3);

@ -7119,26 +6820,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
        q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s);
        q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s);

-#if defined(__ARM_FEATURE_DOTPROD)
-
-        isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
-                vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
-                vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
-                vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
-#else
-
-        int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
-                                 vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0])));
-        int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
-                                 vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1])));
-        isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1];
-
-        int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
-                                 vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2])));
-        int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
-                                 vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3])));
-        isum += vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3];
-#endif
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
+                vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
+                vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
+                vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];

        sum += isum * d_all * y[i].d;

--- a/ggml-quants.h
+++ b/ggml-quants.h
@ -70,7 +70,7 @@ static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block s
 // 2-bit quantization
 // weight is represented as x = a * q + b
 // 16 blocks of 16 elements each
-// Effectively 2.5625 bits per weight
+// Effectively 2.625 bits per weight
 typedef struct {
    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
    uint8_t qs[QK_K/4];      // quants
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@ -255,6 +255,8 @@
 #define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
 #elif defined(__GNUC__)
 #define GGML_UNREACHABLE() __builtin_unreachable()
+#elif defined(_MSC_VER)
+#define GGML_UNREACHABLE() __assume(0)
 #else
 #define GGML_UNREACHABLE() ((void) 0)
 #endif
@ -303,7 +305,7 @@ extern "C" {

 #if defined(__ARM_NEON) && defined(__CUDACC__)
    typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON)
+#elif defined(__ARM_NEON) && !defined(_MSC_VER)
    typedef __fp16 ggml_fp16_t;
 #else
    typedef uint16_t ggml_fp16_t;
@ -343,6 +345,12 @@ extern "C" {
        GGML_TYPE_COUNT,
    };

+    // precision
+    enum ggml_prec {
+        GGML_PREC_DEFAULT,
+        GGML_PREC_F32,
+    };
+
    enum ggml_backend_type {
        GGML_BACKEND_CPU = 0,
        GGML_BACKEND_GPU = 10,
@ -478,7 +486,8 @@ extern "C" {
    enum ggml_log_level {
        GGML_LOG_LEVEL_ERROR = 2,
        GGML_LOG_LEVEL_WARN = 3,
-        GGML_LOG_LEVEL_INFO = 4
+        GGML_LOG_LEVEL_INFO = 4,
+        GGML_LOG_LEVEL_DEBUG = 5
    };

    // ggml object
@ -502,7 +511,6 @@ extern "C" {

        struct ggml_backend_buffer * buffer;

-        int     n_dims;
        int64_t ne[GGML_MAX_DIMS]; // number of elements
        size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
                                   // nb[0] = ggml_type_size(type)
@ -534,7 +542,7 @@ extern "C" {

        void * extra; // extra things e.g. for ggml-cuda.cu

-        char padding[12];
+        char padding[8];
    };

    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@ -639,11 +647,14 @@ extern "C" {
    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
-    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);

-    GGML_API int     ggml_blck_size (enum ggml_type type);
-    GGML_API size_t  ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
-    GGML_API float   ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+    GGML_API int    ggml_blck_size(enum ggml_type type);
+    GGML_API size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
+    GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+
+    GGML_DEPRECATED(
+    GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
+    "use ggml_row_size() instead");

    GGML_API const char * ggml_type_name(enum ggml_type type);
    GGML_API const char * ggml_op_name  (enum ggml_op   op);
@ -662,6 +673,11 @@ extern "C" {
    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_scalar    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_vector    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_matrix    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
+    GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars

    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);

@ -722,8 +738,8 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);

    // Context tensor enumeration and lookup
-    GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
-    GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
+    GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);

    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
@ -1050,6 +1066,12 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    // change the precision of a matrix multiplication
+    // set to GGML_PREC_F32 for higher precision (useful for phi-2)
+    GGML_API void ggml_mul_mat_set_prec(
+            struct ggml_tensor * a,
+            enum ggml_prec       prec);
+
    // indirect matrix multiplication
    //  ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
    GGML_API struct ggml_tensor * ggml_mul_mat_id(
@ -1075,13 +1097,13 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_scale(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            float                 s);

    // in-place, returns view(a)
    GGML_API struct ggml_tensor * ggml_scale_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            float                 s);

    // b -> view(a,offset,nb1,nb2,3), return modified a
    GGML_API struct ggml_tensor * ggml_set(
@ -2116,10 +2138,11 @@ extern "C" {
    GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
    GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);

-    GGML_API int    gguf_get_n_tensors    (const struct gguf_context * ctx);
-    GGML_API int    gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
-    GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
-    GGML_API char * gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
+    GGML_API int            gguf_get_n_tensors    (const struct gguf_context * ctx);
+    GGML_API int            gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
+    GGML_API size_t         gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
+    GGML_API char *         gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
+    GGML_API enum ggml_type gguf_get_tensor_type  (const struct gguf_context * ctx, int i);

    // overrides existing values or adds a new one
    GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
@ -2175,6 +2198,7 @@ extern "C" {
    //

    GGML_API int ggml_cpu_has_avx        (void);
+    GGML_API int ggml_cpu_has_avx_vnni   (void);
    GGML_API int ggml_cpu_has_avx2       (void);
    GGML_API int ggml_cpu_has_avx512     (void);
    GGML_API int ggml_cpu_has_avx512_vbmi(void);
--- a/models/convert-whisper-to-coreml.py
+++ b/models/convert-whisper-to-coreml.py
@ -143,20 +143,7 @@ class AudioEncoderANE(AudioEncoder):
            x = block(x)

        x = self.ln_post(x)
-
-        # """
-        # TODO:
-        # I think we need to transpose the result here to make it fit whisper.cpp memory order.
-        # However, even doing this, the results are still wrong. Kind of less wrong compared to
-        # not transposing, but still wrong.
-
-        # Also, I don't know why the original OpenAI implementation does not need to transpose
-
-        # transpose to (batch_size, n_ctx, n_state)
-        # x : torch.Tensor, shape = (batch_size, n_state, 1, n_ctx)
-
-        # """
-        # x = x.transpose(1,3)
+        x = x.squeeze(2).transpose(1, 2)

        return x

--- a/models/download-ggml-model.sh
+++ b/models/download-ggml-model.sh
@ -19,7 +19,7 @@ function get_script_path() {
    fi
 }

-models_path="$(get_script_path)"
+models_path="${2:-$(get_script_path)}"

 # Whisper models
 models=(
@ -43,7 +43,7 @@ models=(
    "large-v1"
    "large-v2"
    "large-v3"
-    "large-q5_0"
+    "large-v3-q5_0"
 )

 # list available models
@ -56,8 +56,8 @@ function list_models {
    printf "\n\n"
 }

-if [ "$#" -ne 1 ]; then
-    printf "Usage: $0 <model>\n"
+if [ "$#" -lt 1 ] || [ "$#" -gt 2 ]; then
+    printf "Usage: $0 <model> [models_path]\n"
    list_models

    exit 1
@ -105,7 +105,7 @@ if [ $? -ne 0 ]; then
    exit 1
 fi

-printf "Done! Model '$model' saved in 'models/ggml-$model.bin'\n"
+printf "Done! Model '$model' saved in '$models_path/ggml-$model.bin'\n"
 printf "You can now use it like this:\n\n"
-printf "  $ ./main -m models/ggml-$model.bin -f samples/jfk.wav\n"
+printf "  $ ./main -m $models_path/ggml-$model.bin -f samples/jfk.wav\n"
 printf "\n"
--- a/models/generate-coreml-model.sh
+++ b/models/generate-coreml-model.sh
@ -23,7 +23,7 @@ if [[ $mname == "-h5" ]]; then
  echo $mpath
  python3 models/convert-h5-to-coreml.py --model-name $mname --model-path $mpath --encoder-only True
 else
-  python3 models/convert-whisper-to-coreml.py --model $mname --encoder-only True
+  python3 models/convert-whisper-to-coreml.py --model $mname --encoder-only True  --optimize-ane True
 fi

 xcrun coremlc compile models/coreml-encoder-${mname}.mlpackage models/
--- a/openvino/whisper-openvino-encoder.cpp
+++ b/openvino/whisper-openvino-encoder.cpp
@ -64,15 +64,15 @@ int whisper_openvino_encode(
        return 0;
    }

-    if (mel->n_dims != 2) {
+    if (ggml_n_dims(mel) != 2) {
        fprintf(stderr, "%s: Error! mel ggml_tensor expected to have n_dims=2, but it has n_dims=%d\n",
-            __func__, mel->n_dims);
+            __func__, ggml_n_dims(mel));
        return 0;
    }

-    if (out->n_dims != 2) {
+    if (ggml_n_dims(out) != 2) {
        fprintf(stderr, "%s: Error! out ggml_tensor expected to have n_dims=2, but it has n_dims=%d\n",
-            __func__, out->n_dims);
+            __func__, ggml_n_dims(out));
        return 0;
    }

@ -105,4 +105,4 @@ int whisper_openvino_encode(
    }

    return 1;
-}
+}
--- a/whisper.cpp
+++ b/whisper.cpp
@ -122,9 +122,18 @@ WHISPER_ATTRIBUTE_FORMAT(2, 3)
 static void whisper_log_internal        (ggml_log_level level, const char * format, ...);
 static void whisper_log_callback_default(ggml_log_level level, const char * text, void * user_data);

-#define WHISPER_LOG_INFO(...)  whisper_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
-#define WHISPER_LOG_WARN(...)  whisper_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
 #define WHISPER_LOG_ERROR(...) whisper_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define WHISPER_LOG_WARN(...)  whisper_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
+#define WHISPER_LOG_INFO(...)  whisper_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
+
+// define this to enable verbose trace logging - useful for debugging purposes
+// #define WHISPER_DEBUG
+
+#if defined(WHISPER_DEBUG)
+#define WHISPER_LOG_DEBUG(...) whisper_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
+#else
+#define WHISPER_LOG_DEBUG(...)
+#endif

 #define WHISPER_ASSERT(x) \
    do { \
@ -134,18 +143,6 @@ static void whisper_log_callback_default(ggml_log_level level, const char * text
        } \
    } while (0)

-// define this to enable verbose trace logging - useful for debugging purposes
-//#define WHISPER_DEBUG
-
-#if defined(WHISPER_DEBUG)
-#define WHISPER_PRINT_DEBUG(...) \
-    do { \
-        fprintf(stderr, __VA_ARGS__); \
-    } while (0)
-#else
-#define WHISPER_PRINT_DEBUG(...)
-#endif
-
 //#define WHISPER_USE_FLASH_ATTN
 //#define WHISPER_USE_FLASH_FF
 #define WHISPER_MAX_DECODERS 8
@ -155,7 +152,7 @@ static void whisper_log_callback_default(ggml_log_level level, const char * text
 // ggml helpers
 //

-static void ggml_graph_compute_helper(
+static bool ggml_graph_compute_helper(
          struct ggml_cgraph * graph,
        std::vector<uint8_t> & buf,
                         int   n_threads,
@ -171,10 +168,10 @@ static void ggml_graph_compute_helper(
        plan.work_data = buf.data();
    }

-    ggml_graph_compute(graph, &plan);
+    return ggml_graph_compute(graph, &plan);
 }

-static void ggml_graph_compute_helper(
+static bool ggml_graph_compute_helper(
       struct ggml_backend * backend,
        struct ggml_cgraph * graph,
                       int   n_threads) {
@ -186,7 +183,7 @@ static void ggml_graph_compute_helper(
        ggml_backend_metal_set_n_cb(backend, n_threads);
    }
 #endif
-    ggml_backend_graph_compute(backend, graph);
+    return ggml_backend_graph_compute(backend, graph);
 }

 // faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
@ -487,8 +484,8 @@ static size_t whisper_allocr_size(struct whisper_allocr & allocr) {

 // measure the memory usage of a graph and prepare the allocr's internal data buffer
 static void whisper_allocr_graph_init(struct whisper_allocr & allocr, ggml_backend_t backend, std::function<struct ggml_cgraph *()> && get_graph) {
-    auto & alloc  = allocr.alloc;
-    auto & meta   = allocr.meta;
+    auto & alloc = allocr.alloc;
+    auto & meta  = allocr.meta;

    alloc = ggml_allocr_new_measure_from_backend(backend);

@ -1777,7 +1774,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder(

    ggml_cgraph * gf = ggml_new_graph_custom(ctx0, WHISPER_MAX_NODES, false);

-    ggml_allocr * alloc = wstate.alloc_encode.alloc;
+    //ggml_allocr * alloc = wstate.alloc_encode.alloc;

    //struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_ctx, n_state);
    //ggml_allocr_alloc(alloc, cur);
@ -1787,13 +1784,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
    //}
    struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_conv);

-    struct ggml_tensor * KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-    ggml_allocr_alloc(alloc, KQscale);
-
-    if (!ggml_allocr_is_measure(alloc)) {
-        const float val = 1.0f/sqrtf(float(n_state)/n_head);
-        ggml_backend_tensor_set(KQscale, &val, 0, sizeof(float));
-    }
+    const float KQscale = 1.0f/sqrtf(float(n_state)/n_head);

    // ===================================================================
    // NOTE: experimenting with partial evaluation of the encoder (ignore)
@ -1843,14 +1834,14 @@ static struct ggml_cgraph * whisper_build_graph_encoder(

            Qcur = ggml_add(ctx0, Qcur, layer.attn_q_b);

-            //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            //Qcur = ggml_scale(ctx0, Qcur, pow(float(n_state)/n_head, -0.25));

            // note: no bias for Key
            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
                    layer.attn_k_w,
                    cur);

-            //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            //Kcur = ggml_scale(ctx0, Kcur, pow(float(n_state)/n_head, -0.25));

            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
                    layer.attn_v_w,
@ -2032,7 +2023,7 @@ static struct ggml_cgraph * whisper_build_graph_cross(

    ggml_cgraph * gf = ggml_new_graph(ctx0);

-    ggml_allocr * alloc = wstate.alloc_cross.alloc;
+    //ggml_allocr * alloc = wstate.alloc_cross.alloc;

    //struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
    //ggml_allocr_alloc(alloc, cur);
@ -2042,13 +2033,7 @@ static struct ggml_cgraph * whisper_build_graph_cross(
    //}
    struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_enc);

-    struct ggml_tensor * Kscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-    ggml_allocr_alloc(alloc, Kscale);
-
-    if (!ggml_allocr_is_measure(alloc)) {
-        const float val = pow(float(n_state) / n_head, -0.25);
-        ggml_backend_tensor_set(Kscale, &val, 0, sizeof(float));
-    }
+    const float  Kscale = pow(float(n_state) / n_head, -0.25);

    for (int il = 0; il < model.hparams.n_text_layer; ++il) {
        auto & layer = model.layers_decoder[il];
@ -2118,7 +2103,9 @@ static bool whisper_encode_internal(
        ggml_allocr_alloc_graph(alloc, gf);

        if (!whisper_encode_external(wstate)) {
-            ggml_graph_compute_helper(wstate.backend, gf, n_threads);
+            if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
+                return false;
+            }
        }
    }

@ -2132,7 +2119,9 @@ static bool whisper_encode_internal(

        ggml_allocr_alloc_graph(alloc, gf);

-        ggml_graph_compute_helper(wstate.backend, gf, n_threads);
+        if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
+            return false;
+        }
    }

    // cross
@ -2145,7 +2134,9 @@ static bool whisper_encode_internal(

        ggml_allocr_alloc_graph(alloc, gf);

-        ggml_graph_compute_helper(wstate.backend, gf, n_threads);
+        if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
+            return false;
+        }
    }

    wstate.t_encode_us += ggml_time_us() - t_start_us;
@ -2178,7 +2169,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
    const int32_t n_kv     = ggml_allocr_is_measure(alloc) ? n_ctx            : kv_self.n;
    const int32_t kv_head  = ggml_allocr_is_measure(alloc) ? n_ctx - n_tokens : kv_self.head;

-    //WHISPER_PRINT_DEBUG("%s: n_past = %d, n_tokens = %d, n_audio_ctx = %d, n_ctx = %d\n", __func__, n_past, n_tokens, n_audio_ctx, n_ctx);
+    //WHISPER_LOG_DEBUG("%s: n_past = %d, n_tokens = %d, n_audio_ctx = %d, n_ctx = %d\n", __func__, n_past, n_tokens, n_audio_ctx, n_ctx);

    struct ggml_init_params params = {
        /*.mem_size   =*/ wstate.alloc_decode.meta.size(),
@ -2207,13 +2198,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
        }
    }

-    struct ggml_tensor * KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-    ggml_allocr_alloc(alloc, KQscale);
-
-    if (!ggml_allocr_is_measure(alloc)) {
-        const float val = pow(float(n_state)/n_head, -0.25);
-        ggml_backend_tensor_set(KQscale, &val, 0, sizeof(float));
-    }
+    const float KQscale = pow(float(n_state)/n_head, -0.25);

    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
    ggml_allocr_alloc(alloc, KQ_mask);
@ -2573,7 +2558,9 @@ static bool whisper_decode_internal(

        logits = gf->nodes[gf->n_nodes - 1];

-        ggml_graph_compute_helper(wstate.backend, gf, n_threads);
+        if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
+            return false;
+        }
    }

    logits_out.resize(n_tokens*n_vocab);
@ -3834,6 +3821,7 @@ void whisper_reset_timings(struct whisper_context * ctx) {
        ctx->state->t_sample_us = 0;
        ctx->state->t_encode_us = 0;
        ctx->state->t_decode_us = 0;
+        ctx->state->t_batchd_us = 0;
        ctx->state->t_prompt_us = 0;
        ctx->state->n_sample = 0;
        ctx->state->n_encode = 0;
@ -4962,7 +4950,7 @@ static void whisper_sequence_score(
            const auto p = kv.second/(double)cnt;
            entropy -= p*log(p);

-            //WHISPER_PRINT_DEBUG("entropy: %d %f %f, count %d\n", kv.first, p, log(p), kv.second);
+            //WHISPER_LOG_DEBUG("entropy: %d %f %f, count %d\n", kv.first, p, log(p), kv.second);
        }

        sequence.entropy = entropy;
@ -5028,7 +5016,7 @@ int whisper_full_with_state(
    // basically don't process anything that is less than 1.0s
    // see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
    if (seek_end < seek_start + (params.speed_up ? 50 : 100)) {
-        WHISPER_PRINT_DEBUG("%s: input is too short - %d ms < 1000 ms\n", __func__, (seek_end - seek_start)*10);
+        WHISPER_LOG_DEBUG("%s: input is too short - %d ms < 1000 ms\n", __func__, (seek_end - seek_start)*10);
        return 0;
    }

@ -5217,7 +5205,7 @@ int whisper_full_with_state(

            n_decoders_cur = std::max(1, n_decoders_cur);

-            WHISPER_PRINT_DEBUG("\n%s: strategy = %d, decoding with %d decoders, temperature = %.2f\n", __func__, params.strategy, n_decoders_cur, t_cur);
+            WHISPER_LOG_DEBUG("\n%s: strategy = %d, decoding with %d decoders, temperature = %.2f\n", __func__, params.strategy, n_decoders_cur, t_cur);

            // TAGS: WHISPER_DECODER_INIT
            for (int j = 0; j < n_decoders_cur; ++j) {
@ -5261,11 +5249,11 @@ int whisper_full_with_state(
                prompt.insert(prompt.end(), prompt_init.begin(), prompt_init.end());

                // print the prompt
-                WHISPER_PRINT_DEBUG("\n\n");
+                WHISPER_LOG_DEBUG("\n\n");
                for (int i = 0; i < (int) prompt.size(); i++) {
-                    WHISPER_PRINT_DEBUG("%s: prompt[%d] = %s\n", __func__, i, ctx->vocab.id_to_token.at(prompt[i]).c_str());
+                    WHISPER_LOG_DEBUG("%s: prompt[%d] = %s\n", __func__, i, ctx->vocab.id_to_token.at(prompt[i]).c_str());
                }
-                WHISPER_PRINT_DEBUG("\n\n");
+                WHISPER_LOG_DEBUG("\n\n");

                whisper_kv_cache_clear(state->kv_self);

@ -5413,7 +5401,7 @@ int whisper_full_with_state(

                        whisper_kv_cache_seq_cp(state->kv_self, cur.decoder_idx, WHISPER_MAX_DECODERS + j, -1, -1);

-                        WHISPER_PRINT_DEBUG("%s: beam search: decoder %d: from decoder %d: token = %10s, plog = %8.5f, sum_logprobs = %8.5f\n",
+                        WHISPER_LOG_DEBUG("%s: beam search: decoder %d: from decoder %d: token = %10s, plog = %8.5f, sum_logprobs = %8.5f\n",
                                __func__, j, cur.decoder_idx, ctx->vocab.id_to_token.at(decoder.sequence.tokens.back().id).c_str(), decoder.sequence.tokens.back().plog, decoder.sequence.sum_logprobs_all);
                    }

@ -5456,7 +5444,7 @@ int whisper_full_with_state(

                            // do not allow to go back in time
                            if (has_ts && seek_delta > seek_delta_new && result_len < i) {
-                                WHISPER_PRINT_DEBUG("%s: decoder %d: failed due to seek_delta (%d > %d)\n", __func__, j, seek_delta, seek_delta_new);
+                                WHISPER_LOG_DEBUG("%s: decoder %d: failed due to seek_delta (%d > %d)\n", __func__, j, seek_delta, seek_delta_new);
                                failed = true; // TODO: maybe this is not a failure ?
                                continue;
                            }
@ -5471,7 +5459,7 @@ int whisper_full_with_state(
 #ifdef WHISPER_DEBUG
                        {
                            const auto tt = token.pt > 0.10 ? ctx->vocab.id_to_token.at(token.tid) : "[?]";
-                            WHISPER_PRINT_DEBUG("%s: id = %3d, decoder = %d, token = %6d, p = %6.3f, ts = %10s, %6.3f, result_len = %4d '%s'\n",
+                            WHISPER_LOG_DEBUG("%s: id = %3d, decoder = %d, token = %6d, p = %6.3f, ts = %10s, %6.3f, result_len = %4d '%s'\n",
                                    __func__, i, j, token.id, token.p, tt.c_str(), token.pt, result_len, ctx->vocab.id_to_token.at(token.id).c_str());
                        }
 #endif
@ -5485,7 +5473,7 @@ int whisper_full_with_state(
                                if (seek + seek_delta + 100 >= seek_end) {
                                    result_len = i + 1;
                                } else {
-                                    WHISPER_PRINT_DEBUG("%s: decoder %d failed (result_len = 0)\n", __func__, j);
+                                    WHISPER_LOG_DEBUG("%s: decoder %d failed (result_len = 0)\n", __func__, j);
                                    failed = true;
                                    continue;
                                }
@ -5496,7 +5484,7 @@ int whisper_full_with_state(
                                seek_delta = 100*WHISPER_CHUNK_SIZE;
                            }

-                            WHISPER_PRINT_DEBUG("%s: decoder %d completed\n", __func__, j);
+                            WHISPER_LOG_DEBUG("%s: decoder %d completed\n", __func__, j);
                            completed = true;
                            continue;
                        }
@ -5512,7 +5500,7 @@ int whisper_full_with_state(
                    // sometimes, the decoding can get stuck in a repetition loop
                    // this is an attempt to mitigate such cases - we flag the decoding as failed and use a fallback strategy
                    if (i == n_max - 1 && (result_len == 0 || seek_delta < 100*WHISPER_CHUNK_SIZE/2)) {
-                        WHISPER_PRINT_DEBUG("%s: decoder %d: failed due to repetition loop\n", __func__, j);
+                        WHISPER_LOG_DEBUG("%s: decoder %d: failed due to repetition loop\n", __func__, j);
                        failed = true;
                        continue;
                    }
@ -5554,7 +5542,7 @@ int whisper_full_with_state(
                            continue;
                        }

-                        //WHISPER_PRINT_DEBUG("%s: decoder %d: token %d, seek_delta %d\n", __func__, j, decoder.sequence.tokens.back().id, decoder.seek_delta);
+                        //WHISPER_LOG_DEBUG("%s: decoder %d: token %d, seek_delta %d\n", __func__, j, decoder.sequence.tokens.back().id, decoder.seek_delta);

                        decoder.i_batch = batch.n_tokens;

@ -5634,11 +5622,11 @@ int whisper_full_with_state(
                    decoder.sequence.tokens.resize(decoder.sequence.result_len);
                    whisper_sequence_score(params, decoder.sequence);

-                    WHISPER_PRINT_DEBUG("%s: decoder %2d: score = %8.5f, result_len = %3d, avg_logprobs = %8.5f, entropy = %8.5f\n",
+                    WHISPER_LOG_DEBUG("%s: decoder %2d: score = %8.5f, result_len = %3d, avg_logprobs = %8.5f, entropy = %8.5f\n",
                            __func__, j, decoder.sequence.score, decoder.sequence.result_len, decoder.sequence.avg_logprobs, decoder.sequence.entropy);

                    if (decoder.sequence.result_len > 32 && decoder.sequence.entropy < params.entropy_thold) {
-                        WHISPER_PRINT_DEBUG("%s: decoder %2d: failed due to entropy %8.5f < %8.5f\n",
+                        WHISPER_LOG_DEBUG("%s: decoder %2d: failed due to entropy %8.5f < %8.5f\n",
                                __func__, j, decoder.sequence.entropy, params.entropy_thold);

                        decoder.failed = true;
@ -5653,7 +5641,7 @@ int whisper_full_with_state(
                    }
                }

-                WHISPER_PRINT_DEBUG("%s: best decoder = %d\n", __func__, best_decoder_id);
+                WHISPER_LOG_DEBUG("%s: best decoder = %d\n", __func__, best_decoder_id);
            }

            bool success = true;
@ -5665,7 +5653,7 @@ int whisper_full_with_state(
                const auto & decoder = state->decoders[best_decoder_id];

                if (decoder.failed || decoder.sequence.avg_logprobs < params.logprob_thold) {
-                    WHISPER_PRINT_DEBUG("%s: failed due to avg_logprobs %8.5f < %8.5f\n", __func__, decoder.sequence.avg_logprobs, params.logprob_thold);
+                    WHISPER_LOG_DEBUG("%s: failed due to avg_logprobs %8.5f < %8.5f\n", __func__, decoder.sequence.avg_logprobs, params.logprob_thold);
                    success = false;
                    state->n_fail_p++;
                }
@ -5673,13 +5661,13 @@ int whisper_full_with_state(

            if (success) {
                //for (auto & token : ctx->decoders[best_decoder_id].sequence.tokens) {
-                //    WHISPER_PRINT_DEBUG("%s: token = %d, p = %6.3f, pt = %6.3f, ts = %s, str = %s\n", __func__, token.id, token.p, token.pt, ctx->vocab.id_to_token.at(token.tid).c_str(), ctx->vocab.id_to_token.at(token.id).c_str());
+                //    WHISPER_LOG_DEBUG("%s: token = %d, p = %6.3f, pt = %6.3f, ts = %s, str = %s\n", __func__, token.id, token.p, token.pt, ctx->vocab.id_to_token.at(token.tid).c_str(), ctx->vocab.id_to_token.at(token.id).c_str());
                //}

                break;
            }

-            WHISPER_PRINT_DEBUG("\n%s: failed to decode with temperature = %.2f\n", __func__, t_cur);
+            WHISPER_LOG_DEBUG("\n%s: failed to decode with temperature = %.2f\n", __func__, t_cur);
        }

        // output results through a user-provided callback
@ -5691,7 +5679,7 @@ int whisper_full_with_state(

            const auto & tokens_cur = best_decoder.sequence.tokens;

-            //WHISPER_PRINT_DEBUG("prompt_init.size() = %d, prompt.size() = %d, result_len = %d, seek_delta = %d\n", prompt_init.size(), prompt.size(), result_len, seek_delta);
+            //WHISPER_LOG_DEBUG("prompt_init.size() = %d, prompt.size() = %d, result_len = %d, seek_delta = %d\n", prompt_init.size(), prompt.size(), result_len, seek_delta);

            // update prompt_past
            prompt_past.clear();
@ -5811,7 +5799,7 @@ int whisper_full_with_state(
            // update audio window
            seek += seek_delta;

-            WHISPER_PRINT_DEBUG("seek = %d, seek_delta = %d\n", seek, seek_delta);
+            WHISPER_LOG_DEBUG("seek = %d, seek_delta = %d\n", seek, seek_delta);
        }
    }

@ -6128,7 +6116,7 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {

    // multi-thread

-    for (uint32_t k = 1; k <= n_threads; k++) {
+    for (int32_t k = 1; k <= n_threads; k++) {
        char * src = (char *) malloc(size);
        char * dst = (char *) malloc(size);

@ -6152,13 +6140,13 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
        const int64_t t0 = ggml_time_us();

        std::vector<std::thread> threads(k - 1);
-        for (uint32_t th = 0; th < k - 1; ++th) {
+        for (int32_t th = 0; th < k - 1; ++th) {
            threads[th] = std::thread(helper, th);
        }

        helper(k - 1);

-        for (uint32_t th = 0; th < k - 1; ++th) {
+        for (int32_t th = 0; th < k - 1; ++th) {
            threads[th].join();
        }
Author	SHA1	Message	Date
Georgi Gerganov	0b9af32a8b	release : v1.5.4	2024-01-05 17:11:27 +02:00
Erik Scholz	11b1b63b14	fix : cuda order of synchronization when setting a buffer (ggml/679) * fix : cuda order of synchronization when setting a buffer * also sync before memcpy --------- Co-authored-by: slaren <slarengh@gmail.com>	2024-01-05 17:01:59 +02:00
Georgi Gerganov	0e26a6c92e	metal : switch back to default.metallib (ggml/681) ggml-ci	2024-01-05 16:31:30 +02:00
Georgi Gerganov	66d8f0b7f1	ggml : fix q2_k bpw in comments (ggml/680)	2024-01-05 16:31:20 +02:00
Yajing Tang	ba5bcde874	coreml : fix ANE optimized encoder (#1716 )	2024-01-04 16:28:30 +02:00
Georgi Gerganov	ab0a8593c5	whisper.swiftui : add .gitignore	2024-01-04 15:00:27 +02:00
Georgi Gerganov	668ffc9b23	whispser : reset the "batched" timings (#1721 )	2024-01-04 13:38:39 +02:00
Georgi Gerganov	9962371f71	release : v1.5.3	2024-01-03 19:36:33 +02:00
Ashraful Islam	993acb5d41	swift : update Package.swift to use ggml as package dependency (#1701 ) * updates Package.swift to use ggml as dependency * cleans up the Package.swift file by removing redundant source files * updates ggml url src to ggerganov	2024-01-03 19:30:26 +02:00
Finn Voorhees	a3d0aa73d1	ggml : add error handling to graph_compute (#1714 )	2024-01-03 15:39:43 +02:00
Georgi Gerganov	14c57952f7	cuda : simplify expression Co-authored-by: slaren <slarengh@gmail.com>	2024-01-03 14:43:51 +02:00
Georgi Gerganov	6c369d6788	cuda : mark I16 and I32 ops as unsupported ggml-ci	2024-01-03 14:43:51 +02:00
Georgi Gerganov	4cdd9aad9b	metal : add kernel_get_rows_i32 ggml-ci	2024-01-03 14:43:51 +02:00
Georgi Gerganov	f38c057503	metal : optimize ggml_mul_mat_id (faster Mixtral PP) (llama/4725) * ggml : disable fast-math for Metal (cmake build only) ggml-ci * metal : fix Metal API debug warnings * cmake : add -fno-inline for Metal build (llama/4545) * metal : fix API debug warnings * metal : fix compile warnings * metal : use uint64_t for strides * cmake : rename option to LLAMA_METAL_SHADER_DEBUG * metal : fix mat-vec Q8_0 kernel for BS > 1 * metal : normalize mat-vec kernel signatures * cmake : respect LLAMA_QKK_64 option * metal : fix mat-vec Q4_K kernel for QK_K == 64 * metal : optimizing ggml_mul_mat_id (wip) * metal : minor fix * metal : opt mul_mm_id	2024-01-03 14:43:51 +02:00
Georgi Gerganov	1e5544b39b	metal : enable shader debugging (cmake option) (llama/4705) * ggml : disable fast-math for Metal (cmake build only) ggml-ci * metal : fix Metal API debug warnings * cmake : add -fno-inline for Metal build (llama/4545) * metal : fix API debug warnings * metal : fix compile warnings * metal : use uint64_t for strides * cmake : rename option to LLAMA_METAL_SHADER_DEBUG * metal : fix mat-vec Q8_0 kernel for BS > 1 * metal : normalize mat-vec kernel signatures * cmake : respect LLAMA_QKK_64 option * metal : fix mat-vec Q4_K kernel for QK_K == 64 ggml-ci	2024-01-03 14:43:51 +02:00
Georgi Gerganov	d5673af79f	ggml : add ggml_vdotq_s32 alias (llama/4715) ggml-ci	2024-01-03 14:43:51 +02:00
Johannes Gäßler	a28dacec65	CUDA: fixed tensor cores not being used on RDNA3 (llama/4697)	2024-01-03 14:43:51 +02:00
automaticcat	dbe29d4e33	ggml : add ggml_cpu_has_avx_vnni() (llama/4589) * feat: add avx_vnni based on intel documents * ggml: add avx vnni based on intel document * llama: add avx vnni information display * docs: add more details about using oneMKL and oneAPI for intel processors * docs: add more details about using oneMKL and oneAPI for intel processors * docs: add more details about using oneMKL and oneAPI for intel processors * docs: add more details about using oneMKL and oneAPI for intel processors * docs: add more details about using oneMKL and oneAPI for intel processors * Update ggml.c Fix indentation upgate Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2024-01-03 14:43:51 +02:00
Johannes Gäßler	fe3a67c546	CUDA: fix tensor core logic for Pascal and HIP (llama/4682)	2024-01-03 14:43:51 +02:00
hydai	b138ff2be3	cuda: fix vmm oom issue on NVIDIA AGX Orin (llama/4687) Signed-off-by: hydai <hydai@secondstate.io>	2024-01-03 14:43:51 +02:00
Guillaume Wenzek	cf6f1e4181	ggml : extend ggml_get_rows, ggml_repeat, ggml_concat (ggml/639) * add more int ops * ggml_compute_forward_dup_bytes * add tests * PR comments * tests : minor indentations --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2024-01-03 14:43:51 +02:00
Georgi Gerganov	620a223814	scripts : fix sync order + metal sed	2024-01-03 14:43:51 +02:00
Andreu Huguet	f39f9690ec	examples : fix WASM Stack Overflow (#1713 ) Fix for problem: """ RuntimeError: Aborted(Stack overflow! Stack cookie has been overwritten at 0x12be2b10, expected hex dwords 0x89BACDFE and 0x2135467, but received 0x00000000 0x00000000) """ That appears when executing the WASM example with the newer versions.	2024-01-02 16:50:04 +00:00
bobqianic	f9ca90256b	docker : fix the publishing of the CUDA Docker image (#1704 )	2023-12-30 23:12:31 +02:00
Georgi Gerganov	2623640cd6	scripts : do not sync commits from this repo	2023-12-29 15:03:08 +02:00
Tamotsu Takahashi	d87de61ae6	ci : build with CLBlast + ggml-opencl use GGML_API (#1576 ) * Build with CLBlast * Declare GGML_API After rebasing, examples/talk-llama failed: "D:\a\whisper.cpp\whisper.cpp\build\ALL_BUILD.vcxproj" (build target) (1) -> "D:\a\whisper.cpp\whisper.cpp\build\examples\talk-llama\talk-llama.vcxproj" (default target) (14) -> (Link target) -> llama.obj : error LNK2019: unresolved external symbol ggml_cl_free_data referenced in function "public: __cdecl llama_model::~llama_model(void)" (??1llama_model@@QEAA@XZ) [D:\a\whisper.cpp\whisper.cpp\build\examples\talk-llama\talk-llama.vcxproj] llama.obj : error LNK2019: unresolved external symbol ggml_cl_transform_tensor referenced in function "public: void __cdecl llama_model_loader::load_all_data(struct ggml_context ,void (__cdecl)(float,void ),void ,struct llama_mlock *)" (?load_all_data@llama_model_loader@@QEAAXPEAUggml_context@@P6AXMPEAX@Z1PEAUllama_mlock@@@Z) [D:\a\whisper.cpp\whisper.cpp\build\examples\talk-llama\talk-llama.vcxproj] D:\a\whisper.cpp\whisper.cpp\build\bin\Release\talk-llama.exe : fatal error LNK1120: 2 unresolved externals [D:\a\whisper.cpp\whisper.cpp\build\examples\talk-llama\talk-llama.vcxproj]	2023-12-29 12:23:27 +02:00
bobqianic	f5f485f899	whisper : replace `tensor->n_dims` with `ggml_n_dims(tensor)` (#1694 )	2023-12-29 11:38:35 +02:00
Georgi Gerganov	e77b27c331	sync : ggml (VMM, sync-ggml-am, dotprod ARM fixes, CUDA fixes) (#1691 ) * scripts : add sync-ggml-am.sh * sync : ggml (VMM, ARM dot prod fix, etc.) * build : fix CUDA build * ggml : fix some mul mat cases + add tests for src1 F16 `dbd02958fa`	2023-12-29 11:30:47 +02:00
Dimo	a5cc3dc8a2	download : fix large q5 model name (#1695 ) fixed typo in large-v3-q5-0 model name to match HF link	2023-12-29 11:14:32 +02:00
bobqianic	37a709f655	whisper : Replace WHISPER_PRINT_DEBUG with WHISPER_LOG_DEBUG (#1681 )	2023-12-23 12:02:58 +00:00
Georgi Gerganov	3a5302108d	sync : ggml (ggml_scale, ggml_row_size, etc.) (#1677 ) * sync : ggml * sync : llama.cpp * talk-llama : fix obsolete param * ggml-alloc : fix ggml_tallocr_is_own * talk.wasm : update to new ggml * ggml : fix type punning in ggml_scale * ggml : cuda jetson + arm quants warnings	2023-12-22 17:53:39 +02:00
Chaoqun	d2ee117a0a	docker : Dockerize whisper.cpp (#1674 ) * build: add dockerfile for ci * ci: add action to build/push docker image * fix: lowercase repository to fix ci * ci: update cuBLAS flag * build: install curl and ffmped in image * docs: add docker section * fix: improve args check when download model	2023-12-22 11:16:02 +00:00
bobqianic	db8ccdb850	CI : Add coverage for talk-llama when WHISPER_CUBLAS=1 (#1672 )	2023-12-21 22:39:46 +00:00
bobqianic	d2419030b0	examples : Revert CMakeLists.txt for talk-llama (#1669 )	2023-12-21 22:48:52 +02:00
bobqianic	8986690c2a	cmake : set default CUDA architectures (#1667 )	2023-12-21 15:44:04 +02:00
Alfredo Montesinos	9286d3f584	bench.py : add different large models (#1655 ) Amend different large v1,v2,v3 models to benchmark.	2023-12-19 12:40:14 +02:00
Georgi Gerganov	940de9dbe9	wchess : update README.md	2023-12-14 22:00:47 +02:00