chess : tuning performance

wchess: c++17 -> c++11
wchess: off/on prompt
2025-07-01 23:10:47 +02:00 · 2023-11-30 10:50:47 +02:00 · 2023-11-30 08:37:54 +02:00 · 2023-11-30 01:17:29 +02:00 · 2023-11-29 19:30:57 +02:00 · 2023-11-29 18:53:28 +02:00
109 changed files with 9881 additions and 51143 deletions
--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@ -1,38 +0,0 @@
-ARG UBUNTU_VERSION=22.04
-# This needs to generally match the container host's environment.
-ARG CUDA_VERSION=12.3.1
-# Target the CUDA build image
-ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
-# Target the CUDA runtime image
-ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
-
-FROM ${BASE_CUDA_DEV_CONTAINER} AS build
-WORKDIR /app
-
-# Unless otherwise specified, we make a fat build.
-ARG CUDA_DOCKER_ARCH=all
-# Set nvcc architecture
-ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
-# Enable cuBLAS
-ENV WHISPER_CUBLAS=1
-
-RUN apt-get update && \
-    apt-get install -y build-essential \
-    && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
-
-# Ref: https://stackoverflow.com/a/53464012
-ENV CUDA_MAIN_VERSION=12.3
-ENV LD_LIBRARY_PATH /usr/local/cuda-${CUDA_MAIN_VERSION}/compat:$LD_LIBRARY_PATH
-
-COPY .. .
-RUN make
-
-FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
-WORKDIR /app
-
-RUN apt-get update && \
-  apt-get install -y curl ffmpeg \
-  && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
-
-COPY --from=build /app /app
-ENTRYPOINT [ "bash", "-c" ]
--- a/.devops/main.Dockerfile
+++ b/.devops/main.Dockerfile
@ -1,19 +0,0 @@
-FROM ubuntu:22.04 AS build
-WORKDIR /app
-
-RUN apt-get update && \
-  apt-get install -y build-essential \
-  && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
-
-COPY .. .
-RUN make
-
-FROM ubuntu:22.04 AS runtime
-WORKDIR /app
-
-RUN apt-get update && \
-  apt-get install -y curl ffmpeg \
-  && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
-
-COPY --from=build /app /app
-ENTRYPOINT [ "bash", "-c" ]
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -25,7 +25,6 @@ jobs:
          docker run --platform ${{ matrix.arch }} --rm \
            -v ${{ github.workspace }}:/workspace \
            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
-            set -e
            apt update
            apt install -y build-essential libsdl2-dev
            make
@ -87,7 +86,6 @@ jobs:
          docker run --platform ${{ matrix.arch }} --rm \
            -v ${{ github.workspace }}:/workspace \
            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
-            set -e
            apt update
            apt install -y build-essential cmake libsdl2-dev
            cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
@ -115,9 +113,8 @@ jobs:
          docker run --platform ${{ matrix.arch }} --rm \
            -v ${{ github.workspace }}:/workspace \
            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
-            set -e
            apt update
-            apt install -y clang build-essential cmake libsdl2-dev
+            apt install -y build-essential cmake libsdl2-dev
            cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
            make
            ctest -L gh --output-on-failure'
@ -143,7 +140,6 @@ jobs:
          docker run --platform ${{ matrix.arch }} --rm \
            -v ${{ github.workspace }}:/workspace \
            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
-            set -e
            apt update
            apt install -y build-essential cmake
            cmake . -DCMAKE_BUILD_TYPE=Debug -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON
@ -166,7 +162,7 @@ jobs:
            s2arc: x64
            jnaPath: win32-x86-64
          - sdl2: ON
-            s2ver: 2.28.5
+            s2ver: 2.26.0

    steps:
      - name: Clone
@ -221,16 +217,13 @@ jobs:
        sdl2: [ON]
        include:
          - arch: Win32
-            obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x86.zip
+            obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.24/OpenBLAS-0.3.24-x86.zip
            s2arc: x86
-            clblast: OFF
          - arch: x64
-            obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x64.zip
+            obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.24/OpenBLAS-0.3.24-x64.zip
            s2arc: x64
-            clblast: ON
-            clver: 1.6.1
          - sdl2: ON
-            s2ver: 2.28.5
+            s2ver: 2.26.0

    steps:
      - name: Clone
@ -255,18 +248,6 @@ jobs:
          7z x sdl2.zip
          echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV

-      - name: Install OpenCL
-        if: matrix.clblast == 'ON'
-        run: vcpkg.exe --triplet=${{ matrix.arch }}-windows install opencl
-
-      - name: Fetch CLBlast and set CLBlast_DIR
-        if: matrix.clblast == 'ON'
-        run: |
-          C:/msys64/usr/bin/wget.exe -qO clblast.zip https://github.com/CNugteren/CLBlast/releases/download/${{ matrix.clver }}/CLBlast-${{ matrix.clver }}-windows-x64.zip
-          7z x clblast.zip
-          7z x CLBlast-${{ matrix.clver }}-windows-x64.7z
-          echo "CLBlast_DIR=$env:GITHUB_WORKSPACE/CLBlast-${{ matrix.clver }}-windows-x64/lib/cmake/CLBlast" >> $env:GITHUB_ENV
-
      - name: Configure
        run: >
          cmake -S . -B ./build -A ${{ matrix.arch }}
@ -274,7 +255,6 @@ jobs:
          -DWHISPER_OPENBLAS=${{ matrix.blas }}
          -DCMAKE_LIBRARY_PATH="$env:OPENBLAS_PATH/lib"
          -DWHISPER_SDL2=${{ matrix.sdl2 }}
-          -DWHISPER_CLBLAST=${{ matrix.clblast }}

      - name: Build
        run: |
@ -289,15 +269,11 @@ jobs:
        if: matrix.sdl2 == 'ON'
        run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}

-      - name: Copy clblast.dll
-        if: matrix.clblast == 'ON'
-        run: copy "$env:CLBlast_DIR/../../clblast.dll" build/bin/${{ matrix.build }}
-
      - name: Upload binaries
        if: matrix.blas == 'ON' && matrix.sdl2 == 'ON'
        uses: actions/upload-artifact@v1
        with:
-          name: whisper-blas${{ matrix.clblast == 'ON' && '-clblast' || ''}}-bin-${{ matrix.arch }}
+          name: whisper-blas-bin-${{ matrix.arch }}
          path: build/bin/${{ matrix.build }}

  windows-cublas:
@ -309,12 +285,11 @@ jobs:
        arch: [x64]
        cublas: [ON]
        sdl2: [ON]
-        cuda-toolkit: [12.2.0, 11.8.0]
        include:
          - arch: x64
            s2arc: x64
          - sdl2: ON
-            s2ver: 2.28.5
+            s2ver: 2.26.0

    steps:
      - name: Clone
@ -325,9 +300,7 @@ jobs:

      - name: Install CUDA Toolkit
        id: cuda-toolkit
-        uses: Jimver/cuda-toolkit@v0.2.11
-        with:
-          cuda: '${{ matrix.cuda-toolkit }}'
+        uses: Jimver/cuda-toolkit@v0.2.10

      - name: Fetch SDL2 and set SDL2_DIR
        if: matrix.sdl2 == 'ON'
@ -340,13 +313,12 @@ jobs:
        run: >
          cmake -S . -B ./build -A ${{ matrix.arch }}
          -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-          -DWHISPER_CUBLAS=${{ matrix.cublas }}
-          -DWHISPER_SDL2=${{ matrix.sdl2 }}
+          -DWHISPER_CUBLAS=1

-      - name: Build ${{ matrix.cuda-toolkit }}
+      - name: Build
        run: |
          cd ./build
-          cmake --build . --config ${{ matrix.build }}
+          msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}

      - name: Copy CUDA DLLs
        run: >
@ -363,7 +335,7 @@ jobs:
        if: matrix.sdl2 == 'ON'
        uses: actions/upload-artifact@v1
        with:
-          name: whisper-cublas-${{ matrix.cuda-toolkit }}-bin-${{ matrix.arch }}
+          name: whisper-cublas-bin-${{ matrix.arch }}
          path: build/bin/${{ matrix.build }}

  emscripten:
@ -416,14 +388,6 @@ jobs:
    steps:
      - name: Clone
        uses: actions/checkout@v3
-        with:
-          path: whisper
-
-      - name: Clone
-        uses: actions/checkout@v3
-        with:
-          repository: ggerganov/ggml
-          path: ggml

      - name: Install Java
        uses: actions/setup-java@v3
@ -436,15 +400,9 @@ jobs:

      - name: Build
        run: |
-          cd whisper/examples/whisper.android
+          cd examples/whisper.android
          ./gradlew assembleRelease --no-daemon

-      - name: Build with external ggml
-        run: |
-          export PATH_TO_GGML=$PWD/ggml
-          cd whisper/examples/whisper.android
-          ./gradlew assembleRelease --no-daemon -PGGML_HOME=$PATH_TO_GGML
-
  android_java:
    runs-on: ubuntu-latest

--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@ -1,57 +0,0 @@
-name: Publish Docker image
-
-on:
-  pull_request:
-  push:
-    branches:
-      - master
-
-jobs:
-  push_to_registry:
-    name: Push Docker image to Docker Hub
-    if: github.event.pull_request.draft == false
-
-    runs-on: ubuntu-latest
-    env:
-      COMMIT_SHA: ${{ github.sha }}
-    strategy:
-      matrix:
-        config:
-          - { tag: "main", dockerfile: ".devops/main.Dockerfile", platform: "linux/amd64,linux/arm64" }
-          - { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }
-
-    steps:
-      - name: Check out the repo
-        uses: actions/checkout@v3
-
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v3
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Log in to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.repository_owner }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Build and push Docker image (versioned)
-        if: github.event_name == 'push'
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          push: true
-          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
-          file: ${{ matrix.config.dockerfile }}
-
-      - name: Build and push Docker image (tagged)
-        uses: docker/build-push-action@v4
-        with:
-          context: .
-          push: ${{ github.event_name == 'push' }}
-          platforms: ${{ matrix.config.platforms }}
-          tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}"
-          file: ${{ matrix.config.dockerfile }}
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,6 +1,6 @@
 cmake_minimum_required (VERSION 3.5)

-project(whisper.cpp VERSION 1.5.4)
+project(whisper.cpp VERSION 1.5.0)

 # Add path to modules
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@ -68,7 +68,6 @@ if (APPLE)
    option(WHISPER_METAL_NDEBUG          "whisper: disable Metal debugging"      OFF)
    option(WHISPER_COREML                "whisper: enable Core ML framework"     OFF)
    option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback"    OFF)
-    option(WHISPER_METAL_EMBED_LIBRARY   "whisper: embed Metal library"          OFF)
 else()
    option(WHISPER_BLAS                  "whisper: use BLAS libraries"  OFF)
    option(WHISPER_BLAS_VENDOR           "whisper: BLAS library vendor" Generic)
@ -148,30 +147,6 @@ if (APPLE)

        # copy ggml-metal.metal to bin directory
        configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
-
-        if (WHISPER_METAL_EMBED_LIBRARY)
-            enable_language(ASM)
-            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_METAL_EMBED_LIBRARY)
-
-            set(METALLIB_SOURCE "${CMAKE_SOURCE_DIR}/ggml-metal.metal")
-
-            file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
-            set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s")
-
-            add_custom_command(
-                OUTPUT ${EMBED_METALLIB_ASSEMBLY}
-                COMMAND echo ".section __DATA,__ggml_metallib" > ${EMBED_METALLIB_ASSEMBLY}
-                COMMAND echo ".globl _ggml_metallib_start" >> ${EMBED_METALLIB_ASSEMBLY}
-                COMMAND echo "_ggml_metallib_start:" >> ${EMBED_METALLIB_ASSEMBLY}
-                COMMAND echo ".incbin \\\"${METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY}
-                COMMAND echo ".globl _ggml_metallib_end" >> ${EMBED_METALLIB_ASSEMBLY}
-                COMMAND echo "_ggml_metallib_end:" >> ${EMBED_METALLIB_ASSEMBLY}
-                DEPENDS ${METALLIB_SOURCE}
-                COMMENT "Generate assembly for embedded Metal library"
-            )
-
-            set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY})
-        endif()
    endif()

    if (WHISPER_COREML)
@ -243,17 +218,11 @@ if (WHISPER_CUBLAS)
        add_compile_definitions(GGML_USE_CUBLAS)

        if (WHISPER_STATIC)
-            if (WIN32)
-                # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
-                set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
-            else ()
-                set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
-            endif()
+            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
        else()
            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
        endif()

-        set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cuda_driver)
    else()
        message(FATAL_ERROR "cuBLAS not found")
    endif()
@ -340,8 +309,7 @@ if (WHISPER_ALL_WARNINGS)
 endif()

 if (NOT MSVC)
-    # TODO: temporary disabled until we figure out ggml-metal.m
-    #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla")
    #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-math-errno -ffinite-math-only -funsafe-math-optimizations")
 endif()

@ -370,8 +338,8 @@ else()
        endif()
    else()
        if (EMSCRIPTEN)
-            set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -pthread -s TOTAL_STACK=5242880")
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -s TOTAL_STACK=5242880")
+            set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -pthread")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
        else()
            if(NOT WHISPER_NO_AVX)
                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
@ -530,7 +498,6 @@ else()
 endif()

 if (BUILD_SHARED_LIBS)
-    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_link_libraries(${TARGET} PUBLIC
        ${CMAKE_DL_LIBS}
        )
@ -554,13 +521,7 @@ endif()

 if (GGML_SOURCES_CUDA)
    message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
-    # Only configure gmml CUDA architectures is not globally set
-    if (NOT DEFINED GGML_CUDA_ARCHITECTURES)
-        # Not overriden by user, so set defaults
-        set(GGML_CUDA_ARCHITECTURES 52 61 70)
-    endif()
-    message(STATUS "GGML Configuring CUDA architectures ${GGML_CUDA_ARCHITECTURES}")
-    set_property(TARGET whisper PROPERTY CUDA_ARCHITECTURES ${GGML_CUDA_ARCHITECTURES})
+    set_property(TARGET whisper PROPERTY CUDA_ARCHITECTURES OFF)
    set_property(TARGET whisper PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
 endif()

@ -572,7 +533,7 @@ target_compile_definitions(${TARGET} PUBLIC
    ${WHISPER_EXTRA_FLAGS}
    )

-set_target_properties(${TARGET} PROPERTIES PUBLIC_HEADER "ggml.h;whisper.h")
+set_target_properties(${TARGET} PROPERTIES PUBLIC_HEADER "whisper.h")

 include(GNUInstallDirs)

--- a/Makefile
+++ b/Makefile
@ -42,12 +42,6 @@ CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
 CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
 LDFLAGS  =

-ifdef MACOSX_DEPLOYMENT_TARGET
-	CFLAGS   += -mmacosx-version-min=$(MACOSX_DEPLOYMENT_TARGET)
-	CXXFLAGS += -mmacosx-version-min=$(MACOSX_DEPLOYMENT_TARGET)
-	LDFLAGS  += -mmacosx-version-min=$(MACOSX_DEPLOYMENT_TARGET)
-endif
-
 # clock_gettime came in POSIX.1b (1993)
 # CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
 # posix_memalign came in POSIX.1-2001 / SUSv3
@ -105,16 +99,6 @@ ifeq ($(filter $(UNAME_S),Linux Darwin DragonFly FreeBSD NetBSD OpenBSD Haiku),$
 	CXXFLAGS += -pthread
 endif

-# detect Windows
-ifneq ($(findstring _NT,$(UNAME_S)),)
-	_WIN32 := 1
-endif
-
-# Windows Sockets 2 (Winsock) for network-capable apps
-ifeq ($(_WIN32),1)
-	LWINSOCK2 := -lws2_32
-endif
-
 # Architecture specific
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue
@ -123,7 +107,7 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
 		CPUINFO_CMD := sysctl machdep.cpu.features machdep.cpu.leaf7_features
 	else ifeq ($(UNAME_S),Linux)
 		CPUINFO_CMD := cat /proc/cpuinfo
-	else ifneq (,$(filter MINGW32_NT% MINGW64_NT% MSYS_NT%,$(UNAME_S)))
+	else ifneq (,$(filter MINGW32_NT% MINGW64_NT%,$(UNAME_S)))
 		CPUINFO_CMD := cat /proc/cpuinfo
 	else ifneq (,$(filter DragonFly FreeBSD,$(UNAME_S)))
 		CPUINFO_CMD := grep Features /var/run/dmesg.boot
@ -215,14 +199,14 @@ endif

 ifdef WHISPER_CUBLAS
 	ifeq ($(shell expr $(NVCC_VERSION) \>= 11.6), 1)
-		CUDA_ARCH_FLAG ?= native
+		CUDA_ARCH_FLAG=native
 	else
-		CUDA_ARCH_FLAG ?= all
+		CUDA_ARCH_FLAG=all
 	endif

 	CFLAGS      += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	CXXFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
-	LDFLAGS     += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib
+	LDFLAGS     += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib
 	WHISPER_OBJ += ggml-cuda.o
 	NVCC        = nvcc
 	NVCCFLAGS   = --forward-unknown-to-host-compiler -arch=$(CUDA_ARCH_FLAG)
@ -345,24 +329,6 @@ ggml-metal.o: ggml-metal.m ggml-metal.h
 	$(CC) $(CFLAGS) -c $< -o $@

 WHISPER_OBJ += ggml-metal.o
-
-ifdef WHISPER_METAL_EMBED_LIBRARY
-CFLAGS += -DGGML_METAL_EMBED_LIBRARY
-
-ggml-metal-embed.o: ggml-metal.metal
-	@echo "Embedding Metal library"
-	$(eval TEMP_ASSEMBLY=$(shell mktemp))
-	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
-	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
-	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
-	@echo ".incbin \"$<\"" >> $(TEMP_ASSEMBLY)
-	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
-	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
-	@$(AS) $(TEMP_ASSEMBLY) -o $@
-	@rm -f ${TEMP_ASSEMBLY}
-
-WHISPER_OBJ += ggml-metal-embed.o
-endif
 endif

 libwhisper.a: $(WHISPER_OBJ)
@ -394,7 +360,7 @@ quantize: examples/quantize/quantize.cpp $(WHISPER_OBJ) $(SRC_COMMON)
 	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o quantize $(LDFLAGS)

 server: examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o server $(LDFLAGS) $(LWINSOCK2)
+	$(CXX) $(CXXFLAGS) examples/server/server.cpp $(SRC_COMMON) $(WHISPER_OBJ) -o server $(LDFLAGS)

 stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
--- a/Package.swift
+++ b/Package.swift
@ -2,26 +2,41 @@

 import PackageDescription

+#if arch(arm) || arch(arm64)
+let platforms: [SupportedPlatform]? = [
+    .macOS(.v12),
+    .iOS(.v14),
+    .watchOS(.v4),
+    .tvOS(.v14)
+]
+let exclude: [String] = []
+let resources: [Resource] = [
+    .process("ggml-metal.metal")
+]
+let additionalSources: [String] = ["ggml-metal.m"]
+let additionalSettings: [CSetting] = [
+    .unsafeFlags(["-fno-objc-arc"]),
+    .define("GGML_USE_METAL")
+]
+#else
+let platforms: [SupportedPlatform]? = nil
+let exclude: [String] = ["ggml-metal.metal"]
+let resources: [Resource] = []
+let additionalSources: [String] = []
+let additionalSettings: [CSetting] = []
+#endif
+
 let package = Package(
    name: "whisper",
-    platforms: [
-        .macOS(.v12),
-        .iOS(.v14),
-        .watchOS(.v4),
-        .tvOS(.v14)
-    ],
+    platforms: platforms,
    products: [
        .library(name: "whisper", targets: ["whisper"]),
    ],
-    dependencies: [
-        .package(url: "https://github.com/ggerganov/ggml.git", .branch("release"))
-    ],
    targets: [
        .target(
            name: "whisper",
-            dependencies: ["ggml"],
            path: ".",
-            exclude: [
+            exclude: exclude + [
               "bindings",
               "cmake",
               "coreml",
@ -36,20 +51,23 @@ let package = Package(
               "Makefile"
            ],
            sources: [
+                "ggml.c",
                "whisper.cpp",
-            ],
+                "ggml-alloc.c",
+                "ggml-backend.c",
+                "ggml-quants.c"
+            ] + additionalSources,
+            resources: resources,
            publicHeadersPath: "spm-headers",
            cSettings: [
                .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-                .define("GGML_USE_ACCELERATE"),
-                .unsafeFlags(["-fno-objc-arc"]),
-                .define("GGML_USE_METAL")
+                .define("GGML_USE_ACCELERATE")
                // NOTE: NEW_LAPACK will require iOS version 16.4+
                // We should consider adding this in the future when we drop support for iOS 14
                // (ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
                // .define("ACCELERATE_NEW_LAPACK"),
                // .define("ACCELERATE_LAPACK_ILP64")
-            ],
+            ] + additionalSettings,
            linkerSettings: [
                .linkedFramework("Accelerate")
            ]
--- a/README.md
+++ b/README.md
@ -6,7 +6,7 @@
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)

-Stable: [v1.5.4](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.5.4) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+Stable: [v1.5.0](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.5.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)

 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

@ -33,10 +33,9 @@ Supported platforms:
 - [x] [WebAssembly](examples/whisper.wasm)
 - [x] Windows ([MSVC](https://github.com/ggerganov/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggerganov/whisper.cpp/issues/168))
 - [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/discussions/166)
- [x] [docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)

 The entire high-level implementation of the model is contained in [whisper.h](whisper.h) and [whisper.cpp](whisper.cpp).
-The rest of the code is part of the [`ggml`](https://github.com/ggerganov/ggml) machine learning library.
+The rest of the code is part of the [ggml](https://github.com/ggerganov/ggml) machine learning library.

 Having such a lightweight implementation of the model allows to easily integrate it in different platforms and applications.
 As an example, here is a video of running the model on an iPhone 13 device - fully offline, on-device: [whisper.objc](examples/whisper.objc)
@ -61,22 +60,22 @@ Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)
 - Sample real-time audio transcription from the microphone is demonstrated in [stream.cpp](examples/stream)
 - Various other examples are available in the [examples](examples) folder

-The tensor operators are optimized heavily for Apple silicon CPUs. Depending on the computation size, Arm Neon SIMD intrinsics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since the Accelerate framework utilizes the special-purpose AMX coprocessor available in modern Apple products.
+The tensor operators are optimized heavily for Apple silicon CPUs. Depending on the computation size, Arm Neon SIMD
+intrinsics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
+the Accelerate framework utilizes the special-purpose AMX coprocessor available in modern Apple products.

 ## Quick start

-First clone the repository:
+First clone the repository.

-```bash
-git clone https://github.com/ggerganov/whisper.cpp.git
-```
-
-Then, download one of the Whisper [models](models/README.md) converted in [`ggml` format](#ggml-format). For example:
+Then, download one of the Whisper models converted in [ggml format](models). For example:

 ```bash
 bash ./models/download-ggml-model.sh base.en
 ```

+If you wish to convert the Whisper models to ggml format yourself, instructions are in [models/README.md](models/README.md).
+
 Now build the [main](examples/main) example and transcribe an audio file like this:

 ```bash
@ -91,7 +90,7 @@ make

 For a quick demo, simply run `make base.en`:

-```text
+```java
 $ make base.en

 cc  -I.              -O3 -std=c11   -pthread -DGGML_USE_ACCELERATE   -c ggml.c -o ggml.o
@ -111,8 +110,8 @@ options:
  -mc N,     --max-context N     [-1     ] maximum number of text context tokens to store
  -ml N,     --max-len N         [0      ] maximum segment length in characters
  -sow,      --split-on-word     [false  ] split on word rather than on token
-  -bo N,     --best-of N         [5      ] number of best candidates to keep
-  -bs N,     --beam-size N       [5      ] beam size for beam search
+  -bo N,     --best-of N         [2      ] number of best candidates to keep
+  -bs N,     --beam-size N       [-1     ] beam size for beam search
  -wt N,     --word-thold N      [0.01   ] word timestamp probability threshold
  -et N,     --entropy-thold N   [2.40   ] entropy threshold for decoder fail
  -lpt N,    --logprob-thold N   [-1.00  ] log probability threshold for decoder fail
@ -129,7 +128,6 @@ options:
  -fp,       --font-path         [/System/Library/Fonts/Supplemental/Courier New Bold.ttf] path to a monospace font for karaoke video
  -ocsv,     --output-csv        [false  ] output result in a CSV file
  -oj,       --output-json       [false  ] output result in a JSON file
-  -ojf,      --output-json-full  [false  ] include more information in the JSON file
  -of FNAME, --output-file FNAME [       ] output file path (without file extension)
  -ps,       --print-special     [false  ] print special tokens
  -pc,       --print-colors      [false  ] print colors
@ -141,8 +139,7 @@ options:
  -m FNAME,  --model FNAME       [models/ggml-base.en.bin] model path
  -f FNAME,  --file FNAME        [       ] input WAV file path
  -oved D,   --ov-e-device DNAME [CPU    ] the OpenVINO device used for encode inference
-  -ls,       --log-score         [false  ] log best decoder scores of tokens
-  -ng,       --no-gpu            [false  ] disable GPU
+  -ls,       --log-score         [false  ] log best decoder scores of token


 bash ./models/download-ggml-model.sh base.en
@ -207,7 +204,7 @@ For detailed usage instructions, run: `./main -h`
 Note that the [main](examples/main) example currently runs only with 16-bit WAV files, so make sure to convert your input before running the tool.
 For example, you can use `ffmpeg` like this:

-```bash
+```java
 ffmpeg -i input.mp3 -ar 16000 -ac 1 -c:a pcm_s16le output.wav
 ```

@ -239,9 +236,9 @@ make large-v3

 ## Memory usage

-| Model  | Disk    | Mem     |
-| ------ | ------- | ------- |
-| tiny   | 75 MiB  | ~273 MB |
+| Model  | Disk    | Mem      |
+| ---    | ---     | ---      |
+| tiny   |  75 MiB | ~273 MB |
 | base   | 142 MiB | ~388 MB |
 | small  | 466 MiB | ~852 MB |
 | medium | 1.5 GiB | ~2.1 GB |
@ -278,7 +275,7 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in

  - To ensure `coremltools` operates correctly, please confirm that [Xcode](https://developer.apple.com/xcode/) is installed and execute `xcode-select --install` to install the command-line tools.
  - Python 3.10 is recommended.
-  - [OPTIONAL] It is recommended to utilize a Python version management system, such as [Miniconda](https://docs.conda.io/en/latest/miniconda.html) for this step:
+  - [OPTIONAL] It is recommended to utilize a Python version management system, such as [Miniconda](https://docs.conda.io/en/latest/miniconda.html)  for this step:
    - To create an environment, use: `conda create -n py310-whisper python=3.10 -y`
    - To activate the environment, use: `conda activate py310-whisper`

@ -304,8 +301,8 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in

 - Run the examples as usual. For example:

-  ```text
-  $ ./main -m models/ggml-base.en.bin -f samples/jfk.wav
+  ```bash
+  ./main -m models/ggml-base.en.bin -f samples/jfk.wav

  ...

@ -333,8 +330,7 @@ This can result in significant speedup in encoder performance. Here are the inst
 - First, set up a Python virtual environment and install the Python dependencies. Python 3.10 is recommended.

  Windows:
-
-  ```powershell
+  ```
  cd models
  python -m venv openvino_conv_env
  openvino_conv_env\Scripts\activate
@ -343,8 +339,7 @@ This can result in significant speedup in encoder performance. Here are the inst
  ```

  Linux and macOS:
-
-  ```bash
+  ```
  cd models
  python3 -m venv openvino_conv_env
  source openvino_conv_env/bin/activate
@ -358,7 +353,7 @@ This can result in significant speedup in encoder performance. Here are the inst
  python convert-whisper-to-openvino.py --model base.en
  ```

-  This will produce ggml-base.en-encoder-openvino.xml/.bin IR model files. It's recommended to relocate these to the same folder as `ggml` models, as that
+  This will produce ggml-base.en-encoder-openvino.xml/.bin IR model files. It's recommended to relocate these to the same folder as ggml models, as that
  is the default location that the OpenVINO extension will search at runtime.

 - Build `whisper.cpp` with OpenVINO support:
@ -368,28 +363,24 @@ This can result in significant speedup in encoder performance. Here are the inst
  After downloading & extracting the package onto your development system, set up the required environment by sourcing the setupvars script. For example:

  Linux:
-
  ```bash
  source /path/to/l_openvino_toolkit_ubuntu22_2023.0.0.10926.b4452d56304_x86_64/setupvars.sh
  ```

  Windows (cmd):
-
-  ```powershell
+  ```
  C:\Path\To\w_openvino_toolkit_windows_2023.0.0.10926.b4452d56304_x86_64\setupvars.bat
  ```

  And then build the project using cmake:
-
  ```bash
  cmake -B build -DWHISPER_OPENVINO=1
  cmake --build build -j --config Release
  ```

 - Run the examples as usual. For example:
-
-  ```text
-  $ ./main -m models/ggml-base.en.bin -f samples/jfk.wav
+  ```bash
+  ./main -m models/ggml-base.en.bin -f samples/jfk.wav

  ...

@ -440,6 +431,7 @@ cmake -B build -DWHISPER_CLBLAST=ON
 cmake --build build -j --config Release
 ```

+
 Run all the examples as usual.

 ## BLAS CPU support via OpenBLAS
@ -454,38 +446,6 @@ make clean
 WHISPER_OPENBLAS=1 make -j
 ```

-## Docker
-
-### Prerequisites
-
- Docker must be installed and running on your system.
- Create a folder to store big models & intermediate files (ex. /whisper/models)
-
-### Images
-
-We have two Docker images available for this project:
-
-1. `ghcr.io/ggerganov/whisper.cpp:main`: This image includes the main executable file as well as `curl` and `ffmpeg`. (platforms: `linux/amd64`, `linux/arm64`)
-2. `ghcr.io/ggerganov/whisper.cpp:main-cuda`: Same as `main` but compiled with CUDA support. (platforms: `linux/amd64`)
-
-### Usage
-
-```shell
-# download model and persist it in a local folder
-docker run -it --rm \
-  -v path/to/models:/models \
-  whisper.cpp:main "./models/download-ggml-model.sh base /models"
-# transcribe an audio file
-docker run -it --rm \
-  -v path/to/models:/models \
-  -v path/to/audios:/audios \
-  whisper.cpp:main "./main -m /models/ggml-base.bin -f /audios/jfk.wav"
-# transcribe an audio file in samples folder
-docker run -it --rm \
-  -v path/to/models:/models \
-  whisper.cpp:main "./main -m /models/ggml-base.bin -f ./samples/jfk.wav"
-```
-
 ## Limitations

 - Inference only
@ -498,7 +458,7 @@ in about half a minute on a MacBook M1 Pro, using `medium.en` model:
 <details>
  <summary>Expand to see the result</summary>

-```text
+```text
 $ ./main -m models/ggml-medium.en.bin -f samples/gb1.wav -t 8

 whisper_init_from_file: loading model from 'models/ggml-medium.en.bin'
@ -570,7 +530,6 @@ whisper_print_timings:   encode time = 18665.10 ms /     9 runs ( 2073.90 ms per
 whisper_print_timings:   decode time = 13090.93 ms /   549 runs (   23.85 ms per run)
 whisper_print_timings:    total time = 32733.52 ms
 ```
-
 </details>

 ## Real-time audio input example
@ -579,7 +538,7 @@ This is a naive example of performing real-time inference on audio from your mic
 The [stream](examples/stream) tool samples the audio every half a second and runs the transcription continuously.
 More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).

-```bash
+```bash
 make stream
 ./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
 ```
@ -591,7 +550,7 @@ https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a
 Adding the `--print-colors` argument will print the transcribed text using an experimental color coding strategy
 to highlight words with high or low confidence:

-```bash
+```bash
 ./main -m models/ggml-base.en.bin -f samples/gb0.wav --print-colors
 ```

@ -601,8 +560,8 @@ to highlight words with high or low confidence:

 For example, to limit the line length to a maximum of 16 characters, simply add `-ml 16`:

-```text
-$ ./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 16
+```text
+$ ./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 16

 whisper_model_load: loading model from './models/ggml-base.en.bin'
 ...
@ -625,8 +584,8 @@ main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 pr

 The `--max-len` argument can be used to obtain word-level timestamps. Simply use `-ml 1`:

-```text
-$ ./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 1
+```text
+$ ./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 1

 whisper_model_load: loading model from './models/ggml-base.en.bin'
 ...
@ -696,7 +655,7 @@ This requires to have `ffmpeg` installed.

 Here are a few *"typical"* examples:

-```bash
+```bash
 ./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -owts
 source ./samples/jfk.wav.wts
 ffplay ./samples/jfk.wav.mp4
@ -706,7 +665,7 @@ https://user-images.githubusercontent.com/1991296/199337465-dbee4b5e-9aeb-48a3-b

 ---

-```bash
+```bash
 ./main -m ./models/ggml-base.en.bin -f ./samples/mm0.wav -owts
 source ./samples/mm0.wav.wts
 ffplay ./samples/mm0.wav.mp4
@ -716,7 +675,7 @@ https://user-images.githubusercontent.com/1991296/199337504-cc8fd233-0cb7-4920-9

 ---

-```bash
+```bash
 ./main -m ./models/ggml-base.en.bin -f ./samples/gb0.wav -owts
 source ./samples/gb0.wav.wts
 ffplay ./samples/gb0.wav.mp4
@ -730,7 +689,7 @@ https://user-images.githubusercontent.com/1991296/199337538-b7b0c7a3-2753-4a88-a

 Use the [extra/bench-wts.sh](https://github.com/ggerganov/whisper.cpp/blob/master/extra/bench-wts.sh) script to generate a video in the following format:

-```bash
+```bash
 ./extra/bench-wts.sh samples/jfk.wav
 ffplay ./samples/jfk.wav.all.mp4
 ```
@ -759,7 +718,8 @@ It is written in python with the intention of being easy to modify and extend fo

 It outputs a csv file with the results of the benchmarking.

-## `ggml` format
+
+## ggml format

 The original models are converted to a custom binary format. This allows to pack everything needed into a single file:

@ -774,50 +734,49 @@ or manually from here:
 - https://huggingface.co/ggerganov/whisper.cpp
 - https://ggml.ggerganov.com

-For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or [models/README.md](models/README.md).
+For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or the README
+in [models](models).

 ## [Bindings](https://github.com/ggerganov/whisper.cpp/discussions/categories/bindings)

- [x] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggerganov/whisper.cpp/discussions/310)
- [x] JavaScript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
+- [X] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggerganov/whisper.cpp/discussions/310)
+- [X] JavaScript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
  - React Native (iOS / Android): [whisper.rn](https://github.com/mybigday/whisper.rn)
- [x] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
- [x] Java:
+- [X] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
+- [X] Java:
  - [GiviMAD/whisper-jni](https://github.com/GiviMAD/whisper-jni)
- [x] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggerganov/whisper.cpp/discussions/507)
- [x] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
+- [X] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggerganov/whisper.cpp/discussions/507)
+- [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
  - [exPHAT/SwiftWhisper](https://github.com/exPHAT/SwiftWhisper)
- [x] .NET: | [#422](https://github.com/ggerganov/whisper.cpp/discussions/422)
+- [X] .NET: | [#422](https://github.com/ggerganov/whisper.cpp/discussions/422)
  - [sandrohanea/whisper.net](https://github.com/sandrohanea/whisper.net)
  - [NickDarvey/whisper](https://github.com/NickDarvey/whisper)
- [x] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
+- [X] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
  - [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
  - [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
- [x] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
- [x] Unity: [macoron/whisper.unity](https://github.com/Macoron/whisper.unity)
+- [X] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
+- [X] Unity: [macoron/whisper.unity](https://github.com/Macoron/whisper.unity)

 ## Examples

 There are various examples of using the library for different projects in the [examples](examples) folder.
 Some of the examples are even ported to run in the browser using WebAssembly. Check them out!

-| Example                                             | Web                                   | Description                                                                                                                     |
-| --------------------------------------------------- | ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------- |
-| [main](examples/main)                               | [whisper.wasm](examples/whisper.wasm) | Tool for translating and transcribing audio using Whisper                                                                       |
-| [bench](examples/bench)                             | [bench.wasm](examples/bench.wasm)     | Benchmark the performance of Whisper on your machine                                                                            |
-| [stream](examples/stream)                           | [stream.wasm](examples/stream.wasm)   | Real-time transcription of raw microphone capture                                                                               |
-| [command](examples/command)                         | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic                                                         |
-| [wchess](examples/wchess)                           | [wchess.wasm](examples/wchess)        | Voice-controlled chess                                                                                                          |
-| [talk](examples/talk)                               | [talk.wasm](examples/talk.wasm)       | Talk with a GPT-2 bot                                                                                                           |
-| [talk-llama](examples/talk-llama)                   |                                       | Talk with a LLaMA bot                                                                                                           |
-| [whisper.objc](examples/whisper.objc)               |                                       | iOS mobile application using whisper.cpp                                                                                        |
-| [whisper.swiftui](examples/whisper.swiftui)         |                                       | SwiftUI iOS / macOS application using whisper.cpp                                                                               |
-| [whisper.android](examples/whisper.android)         |                                       | Android mobile application using whisper.cpp                                                                                    |
-| [whisper.nvim](examples/whisper.nvim)               |                                       | Speech-to-text plugin for Neovim                                                                                                |
-| [generate-karaoke.sh](examples/generate-karaoke.sh) |                                       | Helper script to easily [generate a karaoke video](https://youtu.be/uj7hVta4blM) of raw audio capture                           |
-| [livestream.sh](examples/livestream.sh)             |                                       | [Livestream audio transcription](https://github.com/ggerganov/whisper.cpp/issues/185)                                           |
-| [yt-wsp.sh](examples/yt-wsp.sh)                     |                                       | Download + transcribe and/or translate any VOD [(original)](https://gist.github.com/DaniruKun/96f763ec1a037cc92fe1a059b643b818) |
-| [server](examples/server)                           |                                       | HTTP transcription server with OAI-like API                                                                                     |
+| Example | Web | Description |
+| ---     | --- | ---         |
+| [main](examples/main) | [whisper.wasm](examples/whisper.wasm) | Tool for translating and transcribing audio using Whisper |
+| [bench](examples/bench) | [bench.wasm](examples/bench.wasm) | Benchmark the performance of Whisper on your machine |
+| [stream](examples/stream) | [stream.wasm](examples/stream.wasm) | Real-time transcription of raw microphone capture |
+| [command](examples/command) | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic |
+| [talk](examples/talk) | [talk.wasm](examples/talk.wasm) | Talk with a GPT-2 bot |
+| [talk-llama](examples/talk-llama) | | Talk with a LLaMA bot |
+| [whisper.objc](examples/whisper.objc) | | iOS mobile application using whisper.cpp |
+| [whisper.swiftui](examples/whisper.swiftui) | | SwiftUI iOS / macOS application using whisper.cpp |
+| [whisper.android](examples/whisper.android) | | Android mobile application using whisper.cpp |
+| [whisper.nvim](examples/whisper.nvim) | | Speech-to-text plugin for Neovim |
+| [generate-karaoke.sh](examples/generate-karaoke.sh) | | Helper script to easily [generate a karaoke video](https://youtu.be/uj7hVta4blM) of raw audio capture |
+| [livestream.sh](examples/livestream.sh) | | [Livestream audio transcription](https://github.com/ggerganov/whisper.cpp/issues/185) |
+| [yt-wsp.sh](examples/yt-wsp.sh) | | Download + transcribe and/or translate any VOD [(original)](https://gist.github.com/DaniruKun/96f763ec1a037cc92fe1a059b643b818) |

 ## [Discussions](https://github.com/ggerganov/whisper.cpp/discussions)

--- a/bindings/go/Makefile
+++ b/bindings/go/Makefile
@ -1,26 +1,9 @@
-ifndef UNAME_S
-UNAME_S := $(shell uname -s)
-endif
-
-ifndef UNAME_P
-UNAME_P := $(shell uname -p)
-endif
-
-ifndef UNAME_M
-UNAME_M := $(shell uname -m)
-endif
-
-GGML_METAL_PATH_RESOURCES := $(abspath ../..)
 BUILD_DIR := build
 MODELS_DIR := models
 EXAMPLES_DIR := $(wildcard examples/*)
 INCLUDE_PATH := $(abspath ../..)
 LIBRARY_PATH := $(abspath ../..)

-ifeq ($(UNAME_S),Darwin)
-	EXT_LDFLAGS := -framework Foundation -framework Metal -framework MetalKit
-endif
-
 all: clean whisper examples

 whisper: mkdir
@ -28,13 +11,8 @@ whisper: mkdir
 	@${MAKE} -C ../.. libwhisper.a

 test: model-small whisper modtidy
-ifeq ($(UNAME_S),Darwin)
-	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} GGML_METAL_PATH_RESOURCES=${GGML_METAL_PATH_RESOURCES} go test -ldflags "-extldflags '$(EXT_LDFLAGS)'" -v .
-	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} GGML_METAL_PATH_RESOURCES=${GGML_METAL_PATH_RESOURCES} go test -ldflags "-extldflags '$(EXT_LDFLAGS)'" -v ./pkg/whisper/...
-else
 	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} go test -v .
 	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} go test -v ./pkg/whisper/...
-endif

 examples: $(EXAMPLES_DIR)

@ -43,11 +21,7 @@ model-small: mkdir examples/go-model-download

 $(EXAMPLES_DIR): mkdir whisper modtidy
 	@echo Build example $(notdir $@)
-ifeq ($(UNAME_S),Darwin)
-	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} GGML_METAL_PATH_RESOURCES=${GGML_METAL_PATH_RESOURCES} go build ${BUILD_FLAGS} -ldflags "-extldflags '$(EXT_LDFLAGS)'" -o ${BUILD_DIR}/$(notdir $@) ./$@
-else
 	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} go build ${BUILD_FLAGS} -o ${BUILD_DIR}/$(notdir $@) ./$@
-endif

 mkdir:
 	@echo Mkdir ${BUILD_DIR}
--- a/bindings/go/params.go
+++ b/bindings/go/params.go
@ -123,11 +123,6 @@ func (p *Params) SetAudioCtx(n int) {
 	p.audio_ctx = C.int(n)
 }

-// Set initial prompt
-func (p *Params) SetInitialPrompt(prompt string) {
-	p.initial_prompt = C.CString(prompt)
-}
-
 ///////////////////////////////////////////////////////////////////////////////
 // PRIVATE METHODS

@ -152,7 +147,6 @@ func (p *Params) String() string {
 	str += fmt.Sprintf(" offset_ms=%d", p.offset_ms)
 	str += fmt.Sprintf(" duration_ms=%d", p.duration_ms)
 	str += fmt.Sprintf(" audio_ctx=%d", p.audio_ctx)
-	str += fmt.Sprintf(" initial_prompt=%s", C.GoString(p.initial_prompt))
 	if p.translate {
 		str += " translate"
 	}
--- a/bindings/go/pkg/whisper/context.go
+++ b/bindings/go/pkg/whisper/context.go
@ -130,11 +130,6 @@ func (context *context) SetAudioCtx(n uint) {
 	context.params.SetAudioCtx(int(n))
 }

-// Set initial prompt
-func (context *context) SetInitialPrompt(prompt string) {
-	context.params.SetInitialPrompt(prompt)
-}
-
 // ResetTimings resets the mode timings. Should be called before processing
 func (context *context) ResetTimings() {
 	context.model.ctx.Whisper_reset_timings()
--- a/bindings/go/pkg/whisper/interface.go
+++ b/bindings/go/pkg/whisper/interface.go
@ -38,18 +38,17 @@ type Context interface {
 	IsMultilingual() bool     // Return true if the model is multilingual.
 	Language() string         // Get language

-	SetOffset(time.Duration)        // Set offset
-	SetDuration(time.Duration)      // Set duration
-	SetThreads(uint)                // Set number of threads to use
-	SetSpeedup(bool)                // Set speedup flag
-	SetSplitOnWord(bool)            // Set split on word flag
-	SetTokenThreshold(float32)      // Set timestamp token probability threshold
-	SetTokenSumThreshold(float32)   // Set timestamp token sum probability threshold
-	SetMaxSegmentLength(uint)       // Set max segment length in characters
-	SetTokenTimestamps(bool)        // Set token timestamps flag
-	SetMaxTokensPerSegment(uint)    // Set max tokens per segment (0 = no limit)
-	SetAudioCtx(uint)               // Set audio encoder context
-	SetInitialPrompt(prompt string) // Set initial prompt
+	SetOffset(time.Duration)      // Set offset
+	SetDuration(time.Duration)    // Set duration
+	SetThreads(uint)              // Set number of threads to use
+	SetSpeedup(bool)              // Set speedup flag
+	SetSplitOnWord(bool)          // Set split on word flag
+	SetTokenThreshold(float32)    // Set timestamp token probability threshold
+	SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
+	SetMaxSegmentLength(uint)     // Set max segment length in characters
+	SetTokenTimestamps(bool)      // Set token timestamps flag
+	SetMaxTokensPerSegment(uint)  // Set max tokens per segment (0 = no limit)
+	SetAudioCtx(uint)             // Set audio encoder context

 	// Process mono audio data and return any errors.
 	// If defined, newly generated segments are passed to the
--- a/bindings/ios
+++ b/bindings/ios
--- a/bindings/javascript/README.md
+++ b/bindings/javascript/README.md
@ -41,7 +41,7 @@ make publish-npm

 ## Sample run

-```text
+```text
 $ node --experimental-wasm-threads --experimental-wasm-simd ../tests/test-whisper.js

 whisper_model_load: loading model from 'whisper.bin'
@ -63,7 +63,7 @@ whisper_model_load: ggml ctx size =  140.60 MB
 whisper_model_load: memory size   =   22.83 MB
 whisper_model_load: model size    =  140.54 MB

-system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 1 | BLAS = 0 |
+system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 1 | BLAS = 0 | 

 operator(): processing 176000 samples, 11.0 sec, 8 threads, 1 processors, lang = en, task = transcribe ...

--- a/bindings/javascript/package.json
+++ b/bindings/javascript/package.json
@ -1,6 +1,6 @@
 {
  "name": "whisper.cpp",
-  "version": "1.5.4",
+  "version": "1.5.0",
  "description": "Whisper speech recognition",
  "main": "whisper.js",
  "scripts": {
--- a/bindings/javascript/whisper.js
+++ b/bindings/javascript/whisper.js
--- a/bindings/ruby/ext/ggml-backend-impl.h
+++ b/bindings/ruby/ext/ggml-backend-impl.h
@ -70,7 +70,7 @@ extern "C" {
        void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

        // compute graph without a plan
-        bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

        // check if the backend supports an operation
        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
--- a/bindings/ruby/ext/ggml-backend.c
+++ b/bindings/ruby/ext/ggml-backend.c
@ -156,8 +156,8 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_
    backend->iface.graph_plan_compute(backend, plan);
 }

-bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
-    return backend->iface.graph_compute(backend, cgraph);
+void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+    backend->iface.graph_compute(backend, cgraph);
 }

 bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
--- a/bindings/ruby/ext/ggml-backend.h
+++ b/bindings/ruby/ext/ggml-backend.h
@ -52,7 +52,7 @@ extern "C" {

    GGML_API void ggml_backend_graph_plan_free   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
    GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API bool ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API void ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
    GGML_API bool ggml_backend_supports_op       (ggml_backend_t backend, const struct ggml_tensor * op);

    // tensor copy between different backends
--- a/coreml/whisper-encoder.mm
+++ b/coreml/whisper-encoder.mm
@ -24,9 +24,9 @@ struct whisper_coreml_context * whisper_coreml_init(const char * path_model) {

    // select which device to run the Core ML model on
    MLModelConfiguration *config = [[MLModelConfiguration alloc] init];
-    // config.computeUnits = MLComputeUnitsCPUAndGPU;
+    config.computeUnits = MLComputeUnitsCPUAndGPU;
    //config.computeUnits = MLComputeUnitsCPUAndNeuralEngine;
-    config.computeUnits = MLComputeUnitsAll;
+    //config.computeUnits = MLComputeUnitsAll;

    const void * data = CFBridgingRetain([[whisper_encoder_impl alloc] initWithContentsOfURL:url_model configuration:config error:nil]);

--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -14,10 +14,6 @@ if (WHISPER_SDL2)
    message(STATUS "SDL2_LIBRARIES = ${SDL2_LIBRARIES}")
 endif()

-if (WHISPER_CLBLAST)
-    find_package(CLBlast REQUIRED)
-endif()
-
 # common

 set(TARGET common)
--- a/examples/addon.node/addon.cpp
+++ b/examples/addon.node/addon.cpp
@ -154,7 +154,7 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {

    // whisper init

-    struct whisper_context_params cparams = whisper_context_default_params();
+    struct whisper_context_params cparams;
    cparams.use_gpu = params.use_gpu;
    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);

--- a/examples/bench/bench.cpp
+++ b/examples/bench/bench.cpp
@ -58,7 +58,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
 int whisper_bench_full(const whisper_params & params) {
    // whisper init

-    struct whisper_context_params cparams = whisper_context_default_params();
+    struct whisper_context_params cparams;
    cparams.use_gpu = params.use_gpu;

    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
--- a/examples/command/command.cpp
+++ b/examples/command/command.cpp
@ -693,7 +693,7 @@ int main(int argc, char ** argv) {

    // whisper init

-    struct whisper_context_params cparams = whisper_context_default_params();
+    struct whisper_context_params cparams;
    cparams.use_gpu = params.use_gpu;

    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
--- a/examples/common-ggml.cpp
+++ b/examples/common-ggml.cpp
@ -62,9 +62,6 @@ bool ggml_common_quantize_0(
        case GGML_FTYPE_ALL_F32:
        case GGML_FTYPE_MOSTLY_F16:
        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
-        case GGML_FTYPE_MOSTLY_IQ2_XXS:
-        case GGML_FTYPE_MOSTLY_IQ2_XS:
-        case GGML_FTYPE_MOSTLY_IQ3_XXS:
                {
                    fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
                    return false;
@ -185,7 +182,7 @@ bool ggml_common_quantize_0(
                case GGML_TYPE_Q5_K:
                case GGML_TYPE_Q6_K:
                    {
-                        cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], hist_cur.data(), nullptr);
+                        cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements, hist_cur.data());
                    } break;
                case GGML_TYPE_F32:
                case GGML_TYPE_F16:
@ -194,9 +191,6 @@ bool ggml_common_quantize_0(
                case GGML_TYPE_I32:
                case GGML_TYPE_Q8_1:
                case GGML_TYPE_Q8_K:
-                case GGML_TYPE_IQ2_XXS:
-                case GGML_TYPE_IQ2_XS:
-                case GGML_TYPE_IQ3_XXS:
                case GGML_TYPE_COUNT:
                    {
                        fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -615,21 +615,6 @@ gpt_vocab::id gpt_sample_top_k_top_p_repeat(

 }

-bool is_wav_buffer(const std::string buf) {
-    // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
-    // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
-    if (buf.size() < 12 || buf.substr(0, 4) != "RIFF" || buf.substr(8, 4) != "WAVE") {
-        return false;
-    }
-
-    uint32_t chunk_size = *reinterpret_cast<const uint32_t*>(buf.data() + 4);
-    if (chunk_size + 8 != buf.size()) {
-        return false;
-    }
-
-    return true;
-}
-
 bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
    drwav wav;
    std::vector<uint8_t> wav_data; // used for pipe input from stdin
@ -654,12 +639,6 @@ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector

        fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
    }
-    else if (is_wav_buffer(fname)) {
-        if (drwav_init_memory(&wav, fname.c_str(), fname.size(), nullptr) == false) {
-            fprintf(stderr, "error: failed to open WAV file from fname buffer\n");
-            return false;
-        }
-    }
    else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
        fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
        return false;
--- a/examples/common.h
+++ b/examples/common.h
@ -135,11 +135,7 @@ gpt_vocab::id gpt_sample_top_k_top_p_repeat(
 // Audio utils
 //

-// Check if a buffer is a WAV audio file
-bool is_wav_buffer(const std::string buf);
-
 // Read WAV audio file and store the PCM data into pcmf32
-// fname can be a buffer of WAV data instead of a filename
 // The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
 // If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
 bool read_wav(
--- a/examples/helpers.js
+++ b/examples/helpers.js
@ -22,7 +22,6 @@ var printTextarea = (function() {
 async function clearCache() {
    if (confirm('Are you sure you want to clear the cache?\nAll the models will be downloaded again.')) {
        indexedDB.deleteDatabase(dbName);
-        location.reload();
    }
 }

--- a/examples/lsp/lsp.cpp
+++ b/examples/lsp/lsp.cpp
@ -435,7 +435,7 @@ int main(int argc, char ** argv) {
    }

    // whisper init
-    struct whisper_context_params cparams = whisper_context_default_params();
+    struct whisper_context_params cparams;
    cparams.use_gpu = params.use_gpu;
    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
    // init audio
--- a/examples/main/README.md
+++ b/examples/main/README.md
@ -17,37 +17,28 @@ options:
  -d  N,     --duration N        [0      ] duration of audio to process in milliseconds
  -mc N,     --max-context N     [-1     ] maximum number of text context tokens to store
  -ml N,     --max-len N         [0      ] maximum segment length in characters
-  -sow,      --split-on-word     [false  ] split on word rather than on token
  -bo N,     --best-of N         [5      ] number of best candidates to keep
-  -bs N,     --beam-size N       [5      ] beam size for beam search
+  -bs N,     --beam-size N       [-1     ] beam size for beam search
  -wt N,     --word-thold N      [0.01   ] word timestamp probability threshold
  -et N,     --entropy-thold N   [2.40   ] entropy threshold for decoder fail
  -lpt N,    --logprob-thold N   [-1.00  ] log probability threshold for decoder fail
-  -debug,    --debug-mode        [false  ] enable debug mode (eg. dump log_mel)
+  -su,       --speed-up          [false  ] speed up audio by x2 (reduced accuracy)
  -tr,       --translate         [false  ] translate from source language to english
  -di,       --diarize           [false  ] stereo audio diarization
-  -tdrz,     --tinydiarize       [false  ] enable tinydiarize (requires a tdrz model)
  -nf,       --no-fallback       [false  ] do not use temperature fallback while decoding
  -otxt,     --output-txt        [false  ] output result in a text file
  -ovtt,     --output-vtt        [false  ] output result in a vtt file
  -osrt,     --output-srt        [false  ] output result in a srt file
-  -olrc,     --output-lrc        [false  ] output result in a lrc file
  -owts,     --output-words      [false  ] output script for generating karaoke video
-  -fp,       --font-path         [/System/Library/Fonts/Supplemental/Courier New Bold.ttf] path to a monospace font for karaoke video
  -ocsv,     --output-csv        [false  ] output result in a CSV file
  -oj,       --output-json       [false  ] output result in a JSON file
-  -ojf,      --output-json-full  [false  ] include more information in the JSON file
  -of FNAME, --output-file FNAME [       ] output file path (without file extension)
  -ps,       --print-special     [false  ] print special tokens
  -pc,       --print-colors      [false  ] print colors
  -pp,       --print-progress    [false  ] print progress
-  -nt,       --no-timestamps     [false  ] do not print timestamps
+  -nt,       --no-timestamps     [true   ] do not print timestamps
  -l LANG,   --language LANG     [en     ] spoken language ('auto' for auto-detect)
-  -dl,       --detect-language   [false  ] exit after automatically detecting language
             --prompt PROMPT     [       ] initial prompt
  -m FNAME,  --model FNAME       [models/ggml-base.en.bin] model path
  -f FNAME,  --file FNAME        [       ] input WAV file path
-  -oved D,   --ov-e-device DNAME [CPU    ] the OpenVINO device used for encode inference
-  -ls,       --log-score         [false  ] log best decoder scores of tokens
-  -ng,       --no-gpu            [false  ] disable GPU
 ```
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -64,7 +64,6 @@ struct whisper_params {
    int32_t max_len      =  0;
    int32_t best_of      = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of;
    int32_t beam_size    = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size;
-    int32_t audio_ctx   = 0;

    float word_thold    =  0.01f;
    float entropy_thold =  2.40f;
@ -86,7 +85,6 @@ struct whisper_params {
    bool output_jsn      = false;
    bool output_jsn_full = false;
    bool output_lrc      = false;
-    bool no_prints       = false;
    bool print_special   = false;
    bool print_colors    = false;
    bool print_progress  = false;
@ -137,7 +135,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-ml"   || arg == "--max-len")         { params.max_len         = std::stoi(argv[++i]); }
        else if (arg == "-bo"   || arg == "--best-of")         { params.best_of         = std::stoi(argv[++i]); }
        else if (arg == "-bs"   || arg == "--beam-size")       { params.beam_size       = std::stoi(argv[++i]); }
-        else if (arg == "-ac"   || arg == "--audio-context")   { params.audio_ctx       = std::stoi(argv[++i]); }
        else if (arg == "-wt"   || arg == "--word-thold")      { params.word_thold      = std::stof(argv[++i]); }
        else if (arg == "-et"   || arg == "--entropy-thold")   { params.entropy_thold   = std::stof(argv[++i]); }
        else if (arg == "-lpt"  || arg == "--logprob-thold")   { params.logprob_thold   = std::stof(argv[++i]); }
@ -158,7 +155,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-oj"   || arg == "--output-json")     { params.output_jsn      = true; }
        else if (arg == "-ojf"  || arg == "--output-json-full"){ params.output_jsn_full = params.output_jsn = true; }
        else if (arg == "-of"   || arg == "--output-file")     { params.fname_out.emplace_back(argv[++i]); }
-        else if (arg == "-np"   || arg == "--no-prints")       { params.no_prints       = true; }
        else if (arg == "-ps"   || arg == "--print-special")   { params.print_special   = true; }
        else if (arg == "-pc"   || arg == "--print-colors")    { params.print_colors    = true; }
        else if (arg == "-pp"   || arg == "--print-progress")  { params.print_progress  = true; }
@ -197,7 +193,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -sow,      --split-on-word     [%-7s] split on word rather than on token\n",             params.split_on_word ? "true" : "false");
    fprintf(stderr, "  -bo N,     --best-of N         [%-7d] number of best candidates to keep\n",              params.best_of);
    fprintf(stderr, "  -bs N,     --beam-size N       [%-7d] beam size for beam search\n",                      params.beam_size);
-    fprintf(stderr, "  -ac N,     --audio-ctx N       [%-7d] audio context size (0 - all)\n",                   params.audio_ctx);
    fprintf(stderr, "  -wt N,     --word-thold N      [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
    fprintf(stderr, "  -et N,     --entropy-thold N   [%-7.2f] entropy threshold for decoder fail\n",           params.entropy_thold);
    fprintf(stderr, "  -lpt N,    --logprob-thold N   [%-7.2f] log probability threshold for decoder fail\n",   params.logprob_thold);
@ -217,7 +212,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -oj,       --output-json       [%-7s] output result in a JSON file\n",                   params.output_jsn ? "true" : "false");
    fprintf(stderr, "  -ojf,      --output-json-full  [%-7s] include more information in the JSON file\n",      params.output_jsn_full ? "true" : "false");
    fprintf(stderr, "  -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n",      "");
-    fprintf(stderr, "  -np,       --no-prints         [%-7s] do not print anything other than the results\n",   params.no_prints ? "true" : "false");
    fprintf(stderr, "  -ps,       --print-special     [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
    fprintf(stderr, "  -pc,       --print-colors      [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
    fprintf(stderr, "  -pp,       --print-progress    [%-7s] print progress\n",                                 params.print_progress ? "true" : "false");
@ -858,9 +852,6 @@ bool output_lrc(struct whisper_context * ctx, const char * fname, const whisper_
    return true;
 }

-
-void cb_log_disable(enum ggml_log_level , const char * , void * ) { }
-
 int main(int argc, char ** argv) {
    whisper_params params;

@ -887,13 +878,9 @@ int main(int argc, char ** argv) {
        exit(0);
    }

-    if (params.no_prints) {
-        whisper_log_set(cb_log_disable, NULL);
-    }
-
    // whisper init

-    struct whisper_context_params cparams = whisper_context_default_params();
+    struct whisper_context_params cparams;
    cparams.use_gpu = params.use_gpu;

    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
@ -918,25 +905,26 @@ int main(int argc, char ** argv) {
            continue;
        }

-        if (!whisper_is_multilingual(ctx)) {
-            if (params.language != "en" || params.translate) {
-                params.language = "en";
-                params.translate = false;
-                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
-            }
-        }
-        if (params.detect_language) {
-            params.language = "auto";
-        }
-
-        if (!params.no_prints) {
-            // print system information
+        // print system information
+        {
            fprintf(stderr, "\n");
            fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
                    params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
+        }

-            // print some info about the processing
+        // print some info about the processing
+        {
            fprintf(stderr, "\n");
+            if (!whisper_is_multilingual(ctx)) {
+                if (params.language != "en" || params.translate) {
+                    params.language = "en";
+                    params.translate = false;
+                    fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
+                }
+            }
+            if (params.detect_language) {
+                params.language = "auto";
+            }
            fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, %d beams + best of %d, lang = %s, task = %s, %stimestamps = %d ...\n",
                    __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
                    params.n_threads, params.n_processors, params.beam_size, params.best_of,
@ -970,7 +958,6 @@ int main(int argc, char ** argv) {
            wparams.thold_pt         = params.word_thold;
            wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
            wparams.split_on_word    = params.split_on_word;
-            wparams.audio_ctx        = params.audio_ctx;

            wparams.speed_up         = params.speed_up;
            wparams.debug_mode       = params.debug_mode;
@ -986,8 +973,6 @@ int main(int argc, char ** argv) {
            wparams.entropy_thold    = params.entropy_thold;
            wparams.logprob_thold    = params.logprob_thold;

-            wparams.no_timestamps    = params.no_timestamps;
-
            whisper_print_user_data user_data = { &params, &pcmf32s, 0 };

            // this callback is called on each new segment
--- a/examples/python/test_whisper_processor.py
+++ b/examples/python/test_whisper_processor.py
@ -1,7 +0,0 @@
-import whisper_processor
-
-try:
-    result = whisper_processor.process_audio("./audio/wake_word_detected16k.wav", "base.en")
-    print(result)
-except Exception as e:
-    print(f"Error: {e}")
--- a/examples/python/whisper_processor.py
+++ b/examples/python/whisper_processor.py
@ -1,54 +0,0 @@
-import subprocess
-import sys
-import os
-
-def process_audio(wav_file, model_name="base.en"):
-    """
-    Processes an audio file using a specified model and returns the processed string.
-
-    :param wav_file: Path to the WAV file
-    :param model_name: Name of the model to use
-    :return: Processed string output from the audio processing
-    :raises: Exception if an error occurs during processing
-    """
-
-    model = f"./models/ggml-{model_name}.bin"
-
-    # Check if the file exists
-    if not os.path.exists(model):
-        raise FileNotFoundError(f"Model file not found: {model} \n\nDownload a model with this command:\n\n> bash ./models/download-ggml-model.sh {model_name}\n\n")
-
-    if not os.path.exists(wav_file):
-        raise FileNotFoundError(f"WAV file not found: {wav_file}")
-
-    full_command = f"./main -m {model} -f {wav_file} -np -nt"
-
-    # Execute the command
-    process = subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-
-    # Get the output and error (if any)
-    output, error = process.communicate()
-
-    if error:
-        raise Exception(f"Error processing audio: {error.decode('utf-8')}")
-
-    # Process and return the output string
-    decoded_str = output.decode('utf-8').strip()
-    processed_str = decoded_str.replace('[BLANK_AUDIO]', '').strip()
-
-    return processed_str
-
-def main():
-    if len(sys.argv) >= 2:
-        wav_file = sys.argv[1]
-        model_name = sys.argv[2] if len(sys.argv) == 3 else "base.en"
-        try:
-            result = process_audio(wav_file, model_name)
-            print(result)
-        except Exception as e:
-            print(f"Error: {e}")
-    else:
-        print("Usage: python whisper_processor.py <wav_file> [<model_name>]")
-
-if __name__ == "__main__":
-    main()
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@ -4,7 +4,3 @@ add_executable(${TARGET} server.cpp httplib.h json.hpp)
 include(DefaultTargetOptions)

 target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})
-
-if (WIN32)
-    target_link_libraries(${TARGET} PRIVATE ws2_32)
-endif()
--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -2,10 +2,6 @@

 Simple http server. WAV Files are passed to the inference model via http requests.

-https://github.com/ggerganov/whisper.cpp/assets/1991296/e983ee53-8741-4eb5-9048-afe5e4594b8f
-
-## Usage
-
 ```
 ./server -h

@ -33,7 +29,6 @@ options:
  -nf,       --no-fallback       [false  ] do not use temperature fallback while decoding
  -ps,       --print-special     [false  ] print special tokens
  -pc,       --print-colors      [false  ] print colors
-  -pr,       --print-realtime    [false  ] print output in realtime
  -pp,       --print-progress    [false  ] print progress
  -nt,       --no-timestamps     [false  ] do not print timestamps
  -l LANG,   --language LANG     [en     ] spoken language ('auto' for auto-detect)
@ -43,12 +38,8 @@ options:
  -oved D,   --ov-e-device DNAME [CPU    ] the OpenVINO device used for encode inference
  --host HOST,                   [127.0.0.1] Hostname/ip-adress for the server
  --port PORT,                   [8080   ] Port number for the server
-  --convert,                     [false  ] Convert audio to WAV, requires ffmpeg on the server
 ```

-> [!WARNING]
-> **Do not run the server example with administrative privileges and ensure it's operated in a sandbox environment, especially since it involves risky operations like accepting user file uploads and using ffmpeg for format conversions. Always validate and sanitize inputs to guard against potential security threats.**
-
 ## request examples

 **/inference**
@ -56,9 +47,8 @@ options:
 curl 127.0.0.1:8080/inference \
 -H "Content-Type: multipart/form-data" \
 -F file="@<file-path>" \
-F temperature="0.0" \
-F temperature_inc="0.2" \
-F response_format="json"
+-F temperature="0.2" \
+-F response-format="json"
 ```

 **/load**
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -11,14 +11,13 @@
 #include <thread>
 #include <vector>
 #include <cstring>
-#include <sstream>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

 using namespace httplib;
-using json = nlohmann::ordered_json;
+using json = nlohmann::json;

 namespace {

@ -40,33 +39,28 @@ struct server_params
 {
    std::string hostname = "127.0.0.1";
    std::string public_path = "examples/server/public";
-    std::string request_path = "";

    int32_t port          = 8080;
    int32_t read_timeout  = 600;
    int32_t write_timeout = 600;
-
-    bool ffmpeg_converter = false;
 };

 struct whisper_params {
-    int32_t n_threads     = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t n_processors  = 1;
-    int32_t offset_t_ms   = 0;
-    int32_t offset_n      = 0;
-    int32_t duration_ms   = 0;
-    int32_t progress_step = 5;
-    int32_t max_context   = -1;
-    int32_t max_len       = 0;
-    int32_t best_of       = 2;
-    int32_t beam_size     = -1;
-    int32_t audio_ctx     = 0;
+    int32_t n_threads    = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_processors =  1;
+    int32_t offset_t_ms  =  0;
+    int32_t offset_n     =  0;
+    int32_t duration_ms  =  0;
+    int32_t progress_step =  5;
+    int32_t max_context  = -1;
+    int32_t max_len      =  0;
+    int32_t best_of      =  2;
+    int32_t beam_size    = -1;

-    float word_thold      =  0.01f;
-    float entropy_thold   =  2.40f;
-    float logprob_thold   = -1.00f;
-    float temperature     =  0.00f;
-    float temperature_inc =  0.20f;
+    float word_thold    =  0.01f;
+    float entropy_thold =  2.40f;
+    float logprob_thold = -1.00f;
+    float userdef_temp  =  0.20f;

    bool speed_up        = false;
    bool debug_mode      = false;
@ -78,7 +72,6 @@ struct whisper_params {
    bool no_fallback     = false;
    bool print_special   = false;
    bool print_colors    = false;
-    bool print_realtime  = false;
    bool print_progress  = false;
    bool no_timestamps   = false;
    bool use_gpu         = true;
@ -123,7 +116,8 @@ bool is_file_exist(const char *fileName)
    return infile.good();
 }

-void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params, const server_params& sparams) {
+void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params,
+                         const server_params& sparams) {
    fprintf(stderr, "\n");
    fprintf(stderr, "usage: %s [options] \n", argv[0]);
    fprintf(stderr, "\n");
@ -139,7 +133,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -sow,      --split-on-word     [%-7s] split on word rather than on token\n",             params.split_on_word ? "true" : "false");
    fprintf(stderr, "  -bo N,     --best-of N         [%-7d] number of best candidates to keep\n",              params.best_of);
    fprintf(stderr, "  -bs N,     --beam-size N       [%-7d] beam size for beam search\n",                      params.beam_size);
-    fprintf(stderr, "  -ac N,     --audio-ctx N       [%-7d] audio context size (0 - all)\n",                   params.audio_ctx);
    fprintf(stderr, "  -wt N,     --word-thold N      [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
    fprintf(stderr, "  -et N,     --entropy-thold N   [%-7.2f] entropy threshold for decoder fail\n",           params.entropy_thold);
    fprintf(stderr, "  -lpt N,    --logprob-thold N   [%-7.2f] log probability threshold for decoder fail\n",   params.logprob_thold);
@ -151,7 +144,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -nf,       --no-fallback       [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
    fprintf(stderr, "  -ps,       --print-special     [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
    fprintf(stderr, "  -pc,       --print-colors      [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
-    fprintf(stderr, "  -pr,       --print-realtime    [%-7s] print output in realtime\n",                       params.print_realtime ? "true" : "false");
    fprintf(stderr, "  -pp,       --print-progress    [%-7s] print progress\n",                                 params.print_progress ? "true" : "false");
    fprintf(stderr, "  -nt,       --no-timestamps     [%-7s] do not print timestamps\n",                        params.no_timestamps ? "true" : "false");
    fprintf(stderr, "  -l LANG,   --language LANG     [%-7s] spoken language ('auto' for auto-detect)\n",       params.language.c_str());
@ -163,8 +155,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  --host HOST,                   [%-7s] Hostname/ip-adress for the server\n", sparams.hostname.c_str());
    fprintf(stderr, "  --port PORT,                   [%-7d] Port number for the server\n", sparams.port);
    fprintf(stderr, "  --public PATH,                 [%-7s] Path to the public folder\n", sparams.public_path.c_str());
-    fprintf(stderr, "  --request-path PATH,           [%-7s] Request path for all requests\n", sparams.request_path.c_str());
-    fprintf(stderr, "  --convert,                     [%-7s] Convert audio to WAV, requires ffmpeg on the server", sparams.ffmpeg_converter ? "true" : "false");
    fprintf(stderr, "\n");
 }

@ -185,7 +175,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
        else if (arg == "-ml"   || arg == "--max-len")         { params.max_len         = std::stoi(argv[++i]); }
        else if (arg == "-bo"   || arg == "--best-of")         { params.best_of         = std::stoi(argv[++i]); }
        else if (arg == "-bs"   || arg == "--beam-size")       { params.beam_size       = std::stoi(argv[++i]); }
-        else if (arg == "-ac"   || arg == "--audio-context")   { params.audio_ctx       = std::stoi(argv[++i]); }
        else if (arg == "-wt"   || arg == "--word-thold")      { params.word_thold      = std::stof(argv[++i]); }
        else if (arg == "-et"   || arg == "--entropy-thold")   { params.entropy_thold   = std::stof(argv[++i]); }
        else if (arg == "-lpt"  || arg == "--logprob-thold")   { params.logprob_thold   = std::stof(argv[++i]); }
@ -199,7 +188,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
        else if (arg == "-fp"   || arg == "--font-path")       { params.font_path       = argv[++i]; }
        else if (arg == "-ps"   || arg == "--print-special")   { params.print_special   = true; }
        else if (arg == "-pc"   || arg == "--print-colors")    { params.print_colors    = true; }
-        else if (arg == "-pr"   || arg == "--print-realtime")  { params.print_realtime  = true; }
        else if (arg == "-pp"   || arg == "--print-progress")  { params.print_progress  = true; }
        else if (arg == "-nt"   || arg == "--no-timestamps")   { params.no_timestamps   = true; }
        else if (arg == "-l"    || arg == "--language")        { params.language        = argv[++i]; }
@ -212,8 +200,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
        else if (                  arg == "--port")            { sparams.port        = std::stoi(argv[++i]); }
        else if (                  arg == "--host")            { sparams.hostname    = argv[++i]; }
        else if (                  arg == "--public")          { sparams.public_path = argv[++i]; }
-        else if (                  arg == "--request-path")    { sparams.request_path = argv[++i]; }
-        else if (                  arg == "--convert")         { sparams.ffmpeg_converter     = true; }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params, sparams);
@ -231,45 +217,6 @@ struct whisper_print_user_data {
    int progress_prev;
 };

-void check_ffmpeg_availibility() {
-    int result = system("ffmpeg -version");
-
-    if (result == 0) {
-        std::cout << "ffmpeg is available." << std::endl;
-    } else {
-        // ffmpeg is not available
-        std::cout << "ffmpeg is not found. Please ensure that ffmpeg is installed ";
-        std::cout << "and that its executable is included in your system's PATH. ";
-        exit(0);
-    }
-}
-
-bool convert_to_wav(const std::string & temp_filename, std::string & error_resp) {
-    std::ostringstream cmd_stream;
-    std::string converted_filename_temp = temp_filename + "_temp.wav";
-    cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
-    std::string cmd = cmd_stream.str();
-
-    int status = std::system(cmd.c_str());
-    if (status != 0) {
-        error_resp = "{\"error\":\"FFmpeg conversion failed.\"}";
-        return false;
-    }
-
-    // Remove the original file
-    if (remove(temp_filename.c_str()) != 0) {
-        error_resp = "{\"error\":\"Failed to remove the original file.\"}";
-        return false;
-    }
-
-    // Rename the temporary file to match the original filename
-    if (rename(converted_filename_temp.c_str(), temp_filename.c_str()) != 0) {
-        error_resp = "{\"error\":\"Failed to rename the temporary file.\"}";
-        return false;
-    }
-    return true;
-}
-
 std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
    std::string speaker = "";
    const int64_t n_samples = pcmf32s[0].size();
@ -399,106 +346,36 @@ std::string output_str(struct whisper_context * ctx, const whisper_params & para
    return result.str();
 }

-bool parse_str_to_bool(const std::string & s) {
-    if (s == "true" || s == "1" || s == "yes" || s == "y") {
-        return true;
-    }
-    return false;
-}
-
 void get_req_parameters(const Request & req, whisper_params & params)
 {
-    if (req.has_file("offset_t"))
+    // user model configuration
+    if (req.has_file("offset-t"))
    {
-        params.offset_t_ms = std::stoi(req.get_file_value("offset_t").content);
+        params.offset_t_ms = std::stoi(req.get_file_value("offset-t").content);
    }
-    if (req.has_file("offset_n"))
+    if (req.has_file("offset-n"))
    {
-        params.offset_n = std::stoi(req.get_file_value("offset_n").content);
+        params.offset_n = std::stoi(req.get_file_value("offset-n").content);
    }
    if (req.has_file("duration"))
    {
        params.duration_ms = std::stoi(req.get_file_value("duration").content);
    }
-    if (req.has_file("max_context"))
+    if (req.has_file("max-context"))
    {
-        params.max_context = std::stoi(req.get_file_value("max_context").content);
-    }
-    if (req.has_file("max_len"))
-    {
-        params.max_len = std::stoi(req.get_file_value("max_len").content);
-    }
-    if (req.has_file("best_of"))
-    {
-        params.best_of = std::stoi(req.get_file_value("best_of").content);
-    }
-    if (req.has_file("beam_size"))
-    {
-        params.beam_size = std::stoi(req.get_file_value("beam_size").content);
-    }
-    if (req.has_file("audio_ctx"))
-    {
-        params.audio_ctx = std::stof(req.get_file_value("audio_ctx").content);
-    }
-    if (req.has_file("word_thold"))
-    {
-        params.word_thold = std::stof(req.get_file_value("word_thold").content);
-    }
-    if (req.has_file("entropy_thold"))
-    {
-        params.entropy_thold = std::stof(req.get_file_value("entropy_thold").content);
-    }
-    if (req.has_file("logprob_thold"))
-    {
-        params.logprob_thold = std::stof(req.get_file_value("logprob_thold").content);
-    }
-    if (req.has_file("debug_mode"))
-    {
-        params.debug_mode = parse_str_to_bool(req.get_file_value("debug_mode").content);
-    }
-    if (req.has_file("translate"))
-    {
-        params.translate = parse_str_to_bool(req.get_file_value("translate").content);
-    }
-    if (req.has_file("diarize"))
-    {
-        params.diarize = parse_str_to_bool(req.get_file_value("diarize").content);
-    }
-    if (req.has_file("tinydiarize"))
-    {
-        params.tinydiarize = parse_str_to_bool(req.get_file_value("tinydiarize").content);
-    }
-    if (req.has_file("split_on_word"))
-    {
-        params.split_on_word = parse_str_to_bool(req.get_file_value("split_on_word").content);
-    }
-    if (req.has_file("no_timestamps"))
-    {
-        params.no_timestamps = parse_str_to_bool(req.get_file_value("no_timestamps").content);
-    }
-    if (req.has_file("language"))
-    {
-        params.language = req.get_file_value("language").content;
-    }
-    if (req.has_file("detect_language"))
-    {
-        params.detect_language = parse_str_to_bool(req.get_file_value("detect_language").content);
+        params.max_context = std::stoi(req.get_file_value("max-context").content);
    }
    if (req.has_file("prompt"))
    {
        params.prompt = req.get_file_value("prompt").content;
    }
-    if (req.has_file("response_format"))
+    if (req.has_file("response-format"))
    {
-        params.response_format = req.get_file_value("response_format").content;
+        params.response_format = req.get_file_value("response-format").content;
    }
-    if (req.has_file("temperature"))
+    if (req.has_file("temerature"))
    {
-        params.temperature = std::stof(req.get_file_value("temperature").content);
-    }
-    if (req.has_file("temperature_inc"))
-    {
-        params.temperature_inc = std::stof(req.get_file_value("temperature_inc").content);
+        params.userdef_temp = std::stof(req.get_file_value("temperature").content);
    }
 }

@ -527,11 +404,8 @@ int main(int argc, char ** argv) {
        exit(0);
    }

-    if (sparams.ffmpeg_converter) {
-        check_ffmpeg_availibility();
-    }
    // whisper init
-    struct whisper_context_params cparams = whisper_context_default_params();
+    struct whisper_context_params cparams;
    cparams.use_gpu = params.use_gpu;

    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
@ -545,96 +419,18 @@ int main(int argc, char ** argv) {
    whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);

    Server svr;
-    svr.set_default_headers({{"Server", "whisper.cpp"},
-                             {"Access-Control-Allow-Origin", "*"},
-                             {"Access-Control-Allow-Headers", "content-type, authorization"}});

-    std::string const default_content = R"(
-    <html>
-    <head>
-        <title>Whisper.cpp Server</title>
-        <meta charset="utf-8">
-        <meta name="viewport" content="width=device-width">
-        <style>
-        body {
-            font-family: sans-serif;
-        }
-        form {
-            display: flex;
-            flex-direction: column;
-            align-items: flex-start;
-        }
-        label {
-            margin-bottom: 0.5rem;
-        }
-        input, select {
-            margin-bottom: 1rem;
-        }
-        button {
-            margin-top: 1rem;
-        }
-        </style>
-    </head>
-    <body>
-        <h1>Whisper.cpp Server</h1>
-
-        <h2>/inference</h2>
-        <pre>
-    curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/inference \
-    -H "Content-Type: multipart/form-data" \
-    -F file="@&lt;file-path&gt;" \
-    -F temperature="0.0" \
-    -F temperature_inc="0.2" \
-    -F response_format="json"
-        </pre>
-
-        <h2>/load</h2>
-        <pre>
-    curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/load \
-    -H "Content-Type: multipart/form-data" \
-    -F model="&lt;path-to-model-file&gt;"
-        </pre>
-
-        <div>
-            <h2>Try it out</h2>
-            <form action="/inference" method="POST" enctype="multipart/form-data">
-                <label for="file">Choose an audio file:</label>
-                <input type="file" id="file" name="file" accept="audio/*" required><br>
-
-                <label for="temperature">Temperature:</label>
-                <input type="number" id="temperature" name="temperature" value="0.0" step="0.01" placeholder="e.g., 0.0"><br>
-
-                <label for="response_format">Response Format:</label>
-                <select id="response_format" name="response_format">
-                    <option value="verbose_json">Verbose JSON</option>
-                    <option value="json">JSON</option>
-                    <option value="text">Text</option>
-                    <option value="srt">SRT</option>
-                    <option value="vtt">VTT</option>
-                </select><br>
-
-                <button type="submit">Submit</button>
-            </form>
-        </div>
-    </body>
-    </html>
-    )";
-
-    // store default params so we can reset after each inference request
-    whisper_params default_params = params;
+    std::string const default_content = "<html>hello</html>";

    // this is only called if no index.html is found in the public --path
-    svr.Get(sparams.request_path + "/", [&default_content](const Request &, Response &res){
+    svr.Get("/", [&default_content](const Request &, Response &res){
        res.set_content(default_content, "text/html");
        return false;
    });

-    svr.Options(sparams.request_path + "/inference", [&](const Request &req, Response &res){
-    });
-
-    svr.Post(sparams.request_path + "/inference", [&](const Request &req, Response &res){
-        // acquire whisper model mutex lock
-        std::lock_guard<std::mutex> lock(whisper_mutex);
+    svr.Post("/inference", [&](const Request &req, Response &res){
+        // acquire whisper model mutex lock
+        whisper_mutex.lock();

        // first check user requested fields of the request
        if (!req.has_file("file"))
@ -642,6 +438,7 @@ int main(int argc, char ** argv) {
            fprintf(stderr, "error: no 'file' field in the request\n");
            const std::string error_resp = "{\"error\":\"no 'file' field in the request\"}";
            res.set_content(error_resp, "application/json");
+            whisper_mutex.unlock();
            return;
        }
        auto audio_file = req.get_file_value("file");
@ -656,42 +453,20 @@ int main(int argc, char ** argv) {
        std::vector<float> pcmf32;               // mono-channel F32 PCM
        std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM

-        if (sparams.ffmpeg_converter) {
-            // if file is not wav, convert to wav
-            // write to temporary file
-            const std::string temp_filename = "whisper_server_temp_file.wav";
-            std::ofstream temp_file{temp_filename, std::ios::binary};
-            temp_file << audio_file.content;
-            temp_file.close();
+        // write file to temporary file
+        std::ofstream temp_file{filename, std::ios::binary};
+        temp_file << audio_file.content;

-            std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
-            const bool is_converted = convert_to_wav(temp_filename, error_resp);
-            if (!is_converted) {
-                res.set_content(error_resp, "application/json");
-                return;
-            }
-
-            // read wav content into pcmf32
-            if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize))
-            {
-                fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
-                const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
-                res.set_content(error_resp, "application/json");
-                std::remove(temp_filename.c_str());
-                return;
-            }
-            // remove temp file
-            std::remove(temp_filename.c_str());
-        } else {
-            if (!::read_wav(audio_file.content, pcmf32, pcmf32s, params.diarize))
-            {
-                fprintf(stderr, "error: failed to read WAV file\n");
-                const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
-                res.set_content(error_resp, "application/json");
-                return;
-            }
+        // read wav content into pcmf32
+        if (!::read_wav(filename, pcmf32, pcmf32s, params.diarize)) {
+            fprintf(stderr, "error: failed to read WAV file '%s'\n", filename.c_str());
+            const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
+            res.set_content(error_resp, "application/json");
+            whisper_mutex.unlock();
+            return;
        }
-
+        // remove temp file
+        std::remove(filename.c_str());

        printf("Successfully loaded %s\n", filename.c_str());

@ -728,6 +503,7 @@ int main(int argc, char ** argv) {

        // run the inference
        {
+
            printf("Running whisper.cpp inference on %s\n", filename.c_str());
            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

@ -746,9 +522,7 @@ int main(int argc, char ** argv) {
            wparams.duration_ms      = params.duration_ms;

            wparams.thold_pt         = params.word_thold;
-            wparams.max_len          = params.max_len == 0 ? 60 : params.max_len;
            wparams.split_on_word    = params.split_on_word;
-            wparams.audio_ctx        = params.audio_ctx;

            wparams.speed_up         = params.speed_up;
            wparams.debug_mode       = params.debug_mode;
@ -760,18 +534,14 @@ int main(int argc, char ** argv) {
            wparams.greedy.best_of        = params.best_of;
            wparams.beam_search.beam_size = params.beam_size;

-            wparams.temperature      = params.temperature;
-            wparams.temperature_inc  = params.temperature_inc;
+            wparams.temperature_inc  = params.userdef_temp;
            wparams.entropy_thold    = params.entropy_thold;
            wparams.logprob_thold    = params.logprob_thold;

-            wparams.no_timestamps    = params.no_timestamps;
-            wparams.token_timestamps = !params.no_timestamps && params.response_format == vjson_format;
-
            whisper_print_user_data user_data = { &params, &pcmf32s, 0 };

            // this callback is called on each new segment
-            if (params.print_realtime) {
+            if (!wparams.print_realtime) {
                wparams.new_segment_callback           = whisper_print_segment_callback;
                wparams.new_segment_callback_user_data = &user_data;
            }
@ -810,6 +580,7 @@ int main(int argc, char ** argv) {
                fprintf(stderr, "%s: failed to process audio\n", argv[0]);
                const std::string error_resp = "{\"error\":\"failed to process audio\"}";
                res.set_content(error_resp, "application/json");
+                whisper_mutex.unlock();
                return;
            }
        }
@ -820,103 +591,6 @@ int main(int argc, char ** argv) {
            std::string results = output_str(ctx, params, pcmf32s);
            res.set_content(results.c_str(), "text/html");
        }
-        else if (params.response_format == srt_format)
-        {
-            std::stringstream ss;
-            const int n_segments = whisper_full_n_segments(ctx);
-            for (int i = 0; i < n_segments; ++i) {
-                const char * text = whisper_full_get_segment_text(ctx, i);
-                const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-                const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-                std::string speaker = "";
-
-                if (params.diarize && pcmf32s.size() == 2)
-                {
-                    speaker = estimate_diarization_speaker(pcmf32s, t0, t1);
-                }
-
-                ss << i + 1 + params.offset_n << "\n";
-                ss << to_timestamp(t0, true) << " --> " << to_timestamp(t1, true) << "\n";
-                ss << speaker << text << "\n\n";
-            }
-            res.set_content(ss.str(), "application/x-subrip");
-        } else if (params.response_format == vtt_format) {
-            std::stringstream ss;
-
-            ss << "WEBVTT\n\n";
-
-            const int n_segments = whisper_full_n_segments(ctx);
-            for (int i = 0; i < n_segments; ++i) {
-                const char * text = whisper_full_get_segment_text(ctx, i);
-                const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-                const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-                std::string speaker = "";
-
-                if (params.diarize && pcmf32s.size() == 2)
-                {
-                    speaker = estimate_diarization_speaker(pcmf32s, t0, t1, true);
-                    speaker.insert(0, "<v Speaker");
-                    speaker.append(">");
-                }
-
-                ss << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n";
-                ss << speaker << text << "\n\n";
-            }
-            res.set_content(ss.str(), "text/vtt");
-        } else if (params.response_format == vjson_format) {
-            /* try to match openai/whisper's Python format */
-            std::string results = output_str(ctx, params, pcmf32s);
-            json jres = json{
-                {"task", params.translate ? "translate" : "transcribe"},
-                {"language", whisper_lang_str_full(whisper_full_lang_id(ctx))},
-                {"duration", float(pcmf32.size())/WHISPER_SAMPLE_RATE},
-                {"text", results},
-                {"segments", json::array()}
-            };
-            const int n_segments = whisper_full_n_segments(ctx);
-            for (int i = 0; i < n_segments; ++i)
-            {
-                json segment = json{
-                    {"id", i},
-                    {"text", whisper_full_get_segment_text(ctx, i)},
-                };
-
-                if (!params.no_timestamps) {
-                    segment["start"] = whisper_full_get_segment_t0(ctx, i) * 0.01;
-                    segment["end"] = whisper_full_get_segment_t1(ctx, i) * 0.01;
-                }
-
-                float total_logprob = 0;
-                const int n_tokens = whisper_full_n_tokens(ctx, i);
-                for (int j = 0; j < n_tokens; ++j) {
-                    whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
-                    if (token.id >= whisper_token_eot(ctx)) {
-                        continue;
-                    }
-
-                    segment["tokens"].push_back(token.id);
-                    json word = json{{"word", whisper_full_get_token_text(ctx, i, j)}};
-                    if (!params.no_timestamps) {
-                        word["start"] = token.t0 * 0.01;
-                        word["end"] = token.t1 * 0.01;
-                    }
-                    word["probability"] = token.p;
-                    total_logprob += token.plog;
-                    segment["words"].push_back(word);
-                }
-
-                segment["temperature"] = params.temperature;
-                segment["avg_logprob"] = total_logprob / n_tokens;
-
-                // TODO compression_ratio and no_speech_prob are not implemented yet
-                // segment["compression_ratio"] = 0;
-                // segment["no_speech_prob"] = 0;
-
-                jres["segments"].push_back(segment);
-            }
-            res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
-                            "application/json");
-        }
        // TODO add more output formats
        else
        {
@ -928,16 +602,17 @@ int main(int argc, char ** argv) {
                            "application/json");
        }

-        // reset params to thier defaults
-        params = default_params;
+        // release the whisper model mutex
+        whisper_mutex.unlock();
    });
-    svr.Post(sparams.request_path + "/load", [&](const Request &req, Response &res){
-        std::lock_guard<std::mutex> lock(whisper_mutex);
+    svr.Post("/load", [&](const Request &req, Response &res){
+        whisper_mutex.lock();
        if (!req.has_file("model"))
        {
            fprintf(stderr, "error: no 'model' field in the request\n");
            const std::string error_resp = "{\"error\":\"no 'model' field in the request\"}";
            res.set_content(error_resp, "application/json");
+            whisper_mutex.unlock();
            return;
        }
        std::string model = req.get_file_value("model").content;
@ -946,6 +621,7 @@ int main(int argc, char ** argv) {
            fprintf(stderr, "error: 'model': %s not found!\n", model.c_str());
            const std::string error_resp = "{\"error\":\"model not found!\"}";
            res.set_content(error_resp, "application/json");
+            whisper_mutex.unlock();
            return;
        }

@ -968,6 +644,7 @@ int main(int argc, char ** argv) {
        res.set_content(success, "application/text");

        // check if the model is in the file system
+        whisper_mutex.unlock();
    });

    svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
@ -984,11 +661,11 @@ int main(int argc, char ** argv) {
        res.status = 500;
    });

-    svr.set_error_handler([](const Request &req, Response &res) {
+    svr.set_error_handler([](const Request &, Response &res) {
        if (res.status == 400) {
            res.set_content("Invalid request", "text/plain");
        } else if (res.status != 500) {
-            res.set_content("File Not Found (" + req.path + ")", "text/plain");
+            res.set_content("File Not Found", "text/plain");
            res.status = 404;
        }
    });
--- a/examples/stream/README.md
+++ b/examples/stream/README.md
@ -4,7 +4,7 @@ This is a naive example of performing real-time inference on audio from your mic
 The `stream` tool samples the audio every half a second and runs the transcription continously.
 More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).

-```bash
+```bash
 ./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
 ```

@ -14,7 +14,7 @@ https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a

 Setting the `--step` argument to `0` enables the sliding window mode:

-```bash
+```bash
 ./stream -m ./models/ggml-small.en.bin -t 6 --step 0 --length 30000 -vth 0.6
 ```

@ -39,8 +39,8 @@ brew install sdl2
 make stream
 ```

-Ensure you are at the root of the repo when running `make stream`. Not within the `examples/stream` dir
-as the libraries needed like `common-sdl.h` are located within `examples`. Attempting to compile within
+Ensure you are at the root of the repo when running `make stream`.  Not within the `examples/stream` dir
+as the libraries needed like `common-sdl.h` are located within `examples`.  Attempting to compile within
 `examples/steam` means your compiler cannot find them and it gives an error it cannot find the file.

 ```bash
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@ -166,7 +166,7 @@ int main(int argc, char ** argv) {
        exit(0);
    }

-    struct whisper_context_params cparams = whisper_context_default_params();
+    struct whisper_context_params cparams;
    cparams.use_gpu = params.use_gpu;

    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
--- a/examples/talk-llama/CMakeLists.txt
+++ b/examples/talk-llama/CMakeLists.txt
@ -1,18 +1,25 @@
 if (WHISPER_SDL2)
    # talk-llama
    set(TARGET talk-llama)
-    add_executable(${TARGET} talk-llama.cpp llama.cpp)
-    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
+    #add_executable(${TARGET} talk-llama.cpp llama.cpp)
+    #target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
+    #target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})

-    if (WHISPER_CLBLAST)
-        set(CLBLAST_LIBNAME clblast)
-    endif ()
-    target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CLBLAST_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})
+    # TODO: temporary workaround - the ggml/whisper sources are compiled directly
+    #       into this target because ggml symbols are not yet exported for MSVC
+    add_executable(${TARGET}
+        talk-llama.cpp
+        llama.cpp
+        ../common.cpp
+        ../common-sdl.cpp
+        ../../ggml.c
+        ../../ggml-alloc.c
+        ../../ggml-backend.c
+        ../../ggml-quants.c
+        ../../whisper.cpp)

-    if(WIN32)
-        # It requires Windows 8.1 or later for PrefetchVirtualMemory
-        target_compile_definitions(${TARGET} PRIVATE -D_WIN32_WINNT=0x0602)
-    endif()
+    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
+    target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})

    include(DefaultTargetOptions)
 endif ()
--- a/examples/talk-llama/llama.cpp
+++ b/examples/talk-llama/llama.cpp
--- a/examples/talk-llama/llama.h
+++ b/examples/talk-llama/llama.h
@ -2,8 +2,12 @@
 #define LLAMA_H

 #include "ggml.h"
-#include "ggml-backend.h"
-
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+#else
+#define LLAMA_MAX_DEVICES 1
+#endif // GGML_USE_CUBLAS
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
@ -35,11 +39,15 @@

 #define LLAMA_MAX_RNG_STATE (64*1024)

-#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'

 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 4
+#define LLAMA_SESSION_VERSION 2
+
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
+#define LLAMA_SUPPORTS_GPU_OFFLOAD
+#endif

 #ifdef __cplusplus
 extern "C" {
@ -61,7 +69,6 @@ extern "C" {
    enum llama_vocab_type {
        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
-        LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece
    };

    enum llama_token_type {
@ -95,11 +102,6 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_Q5_K_S        = 16, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q5_K_M        = 17, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q6_K          = 18, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ2_XXS       = 19, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ2_XS        = 20, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q2_K_S        = 21, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q3_K_XS       = 22, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_IQ3_XXS       = 23, // except 1d tensors

        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
    };
@ -112,12 +114,6 @@ extern "C" {
        LLAMA_ROPE_SCALING_MAX_VALUE   = LLAMA_ROPE_SCALING_YARN,
    };

-    enum llama_split_mode {
-        LLAMA_SPLIT_NONE    = 0, // single GPU
-        LLAMA_SPLIT_LAYER   = 1, // split layers and KV across GPUs
-        LLAMA_SPLIT_ROW     = 2, // split rows across GPUs
-    };
-
    typedef struct llama_token_data {
        llama_token id; // token id
        float logit;    // log-odds of the token
@ -130,7 +126,7 @@ extern "C" {
        bool sorted;
    } llama_token_data_array;

-    typedef bool (*llama_progress_callback)(float progress, void *ctx);
+    typedef void (*llama_progress_callback)(float progress, void *ctx);

    // Input data for llama_decode
    // A llama_batch object can contain input about one or many sequences
@ -162,46 +158,16 @@ extern "C" {
        llama_seq_id all_seq_id; // used if seq_id == NULL
    } llama_batch;

-    enum llama_model_kv_override_type {
-        LLAMA_KV_OVERRIDE_INT,
-        LLAMA_KV_OVERRIDE_FLOAT,
-        LLAMA_KV_OVERRIDE_BOOL,
-    };
-
-    struct llama_model_kv_override {
-        char key[128];
-        enum llama_model_kv_override_type tag;
-        union {
-            int64_t int_value;
-            double float_value;
-            bool bool_value;
-        };
-    };
-
    struct llama_model_params {
        int32_t n_gpu_layers; // number of layers to store in VRAM
-        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
+        int32_t main_gpu;     // the GPU that is used for scratch and small tensors
+        const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)

-        // main_gpu interpretation depends on split_mode:
-        // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
-        // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
-        // LLAMA_SPLIT_LAYER: ignored
-        int32_t main_gpu;
-
-        // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
-        const float * tensor_split;
-
-        // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
-        // If the provided progress_callback returns true, model loading continues.
-        // If it returns false, model loading is immediately aborted.
+        // called with a progress value between 0 and 1, pass NULL to disable
        llama_progress_callback progress_callback;
-
        // context pointer passed to the progress callback
        void * progress_callback_user_data;

-        // override key-value pairs of the model meta data
-        const struct llama_model_kv_override * kv_overrides;
-
        // Keep the booleans together to avoid misalignment during copy-by-value.
        bool vocab_only; // only load the vocabulary, no weights
        bool use_mmap;   // use mmap if possible
@ -214,39 +180,32 @@ extern "C" {
        uint32_t n_batch;           // prompt processing maximum batch size
        uint32_t n_threads;         // number of threads to use for generation
        uint32_t n_threads_batch;   // number of threads to use for batch processing
-        int32_t  rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+        int8_t   rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`

        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
        float    rope_freq_base;   // RoPE base frequency, 0 = from model
        float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
-        float    yarn_ext_factor;  // YaRN extrapolation mix factor, negative = from model
+        float    yarn_ext_factor;  // YaRN extrapolation mix factor, NaN = from model
        float    yarn_attn_factor; // YaRN magnitude scaling factor
        float    yarn_beta_fast;   // YaRN low correction dim
        float    yarn_beta_slow;   // YaRN high correction dim
        uint32_t yarn_orig_ctx;    // YaRN original context size

-        ggml_backend_sched_eval_callback cb_eval;
-        void * cb_eval_user_data;
-
-        enum ggml_type type_k; // data type for K cache
-        enum ggml_type type_v; // data type for V cache
-
        // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
-        bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-        bool embedding;   // embedding mode only
-        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool mul_mat_q;  // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
+        bool f16_kv;     // use fp16 for KV cache, fp32 otherwise
+        bool logits_all; // the llama_eval() call computes all logits, not just the last one
+        bool embedding;  // embedding mode only
    };

    // model quantization parameters
    typedef struct llama_model_quantize_params {
-        int32_t nthread;             // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        int nthread;                 // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
        enum llama_ftype ftype;      // quantize to this llama_ftype
        bool allow_requantize;       // allow quantizing non-f32/f16 tensors
        bool quantize_output_tensor; // quantize output.weight
        bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
        bool pure;                   // disable k-quant mixtures and quantize all tensors to the same type
-        void * imatrix;              // pointer to importance matrix data
    } llama_model_quantize_params;

    // grammar types
@ -325,48 +284,25 @@ extern "C" {

    LLAMA_API int64_t llama_time_us(void);

-    LLAMA_API size_t llama_max_devices(void);
-
-    LLAMA_API bool llama_supports_mmap       (void);
-    LLAMA_API bool llama_supports_mlock      (void);
-    LLAMA_API bool llama_supports_gpu_offload(void);
-
-    LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
-    LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");
+    LLAMA_API int  llama_max_devices    (void);
+    LLAMA_API bool llama_mmap_supported (void);
+    LLAMA_API bool llama_mlock_supported(void);

    LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);

-    LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
-    LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
+    LLAMA_API int llama_n_ctx      (const struct llama_context * ctx);

    LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);

-    LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
-    LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
-    LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
+    LLAMA_API int llama_n_vocab    (const struct llama_model * model);
+    LLAMA_API int llama_n_ctx_train(const struct llama_model * model);
+    LLAMA_API int llama_n_embd     (const struct llama_model * model);

    // Get the model's RoPE frequency scaling factor
    LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);

-    // Functions to access the model's GGUF metadata scalar values
-    // - The functions return the length of the string on success, or -1 on failure
-    // - The output string is always null-terminated and cleared on failure
-    // - GGUF array values are not supported by these functions
-
-    // Get metadata value as a string by key name
-    LLAMA_API int32_t llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
-
-    // Get the number of metadata key/value pairs
-    LLAMA_API int32_t llama_model_meta_count(const struct llama_model * model);
-
-    // Get metadata key name by index
-    LLAMA_API int32_t llama_model_meta_key_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
-
-    // Get metadata value as a string by index
-    LLAMA_API int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int32_t i, char * buf, size_t buf_size);
-
    // Get a string describing the model type
-    LLAMA_API int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);
+    LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);

    // Returns the total size of all the tensors in the model in bytes
    LLAMA_API uint64_t llama_model_size(const struct llama_model * model);
@ -378,7 +314,7 @@ extern "C" {
    LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);

    // Returns 0 on success
-    LLAMA_API uint32_t llama_model_quantize(
+    LLAMA_API int llama_model_quantize(
            const char * fname_inp,
            const char * fname_out,
            const llama_model_quantize_params * params);
@ -389,79 +325,28 @@ extern "C" {
    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
    // will be applied on top of the previous one
    // Returns 0 on success
-    LLAMA_API DEPRECATED(int32_t llama_apply_lora_from_file(
+    LLAMA_API DEPRECATED(int llama_apply_lora_from_file(
            struct llama_context * ctx,
                      const char * path_lora,
                           float   scale,
                      const char * path_base_model,
-                         int32_t   n_threads),
+                             int   n_threads),
            "use llama_model_apply_lora_from_file instead");

-    LLAMA_API int32_t llama_model_apply_lora_from_file(
+    LLAMA_API int llama_model_apply_lora_from_file(
            const struct llama_model * model,
                      const char * path_lora,
                           float   scale,
                      const char * path_base_model,
-                         int32_t   n_threads);
+                             int   n_threads);

    //
    // KV cache
    //

-    // Information associated with an individual cell in the KV cache view.
-    struct llama_kv_cache_view_cell {
-        // The position for this cell. Takes KV cache shifts into account.
-        // May be negative if the cell is not populated.
-        llama_pos pos;
-    };
-
-    // An updateable view of the KV cache.
-    struct llama_kv_cache_view {
-        // Number of KV cache cells. This will be the same as the context size.
-        int32_t n_cells;
-
-        // Maximum number of sequences that can exist in a cell. It's not an error
-        // if there are more sequences in a cell than this value, however they will
-        // not be visible in the view cells_sequences.
-        int32_t n_max_seq;
-
-        // Number of tokens in the cache. For example, if there are two populated
-        // cells, the first with 1 sequence id in it and the second with 2 sequence
-        // ids then you'll have 3 tokens.
-        int32_t token_count;
-
-        // Number of populated cache cells.
-        int32_t used_cells;
-
-        // Maximum contiguous empty slots in the cache.
-        int32_t max_contiguous;
-
-        // Index to the start of the max_contiguous slot range. Can be negative
-        // when cache is full.
-        int32_t max_contiguous_idx;
-
-        // Information for an individual cell.
-        struct llama_kv_cache_view_cell * cells;
-
-        // The sequences for each cell. There will be n_max_seq items per cell.
-        llama_seq_id * cells_sequences;
-    };
-
-    // Create an empty KV cache view. (use only for debugging purposes)
-    LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
-
-    // Free a KV cache view. (use only for debugging purposes)
-    LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
-
-    // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
-    LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
-
-    // Returns the number of tokens in the KV cache (slow, use only for debug)
-    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
-
-    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
+    // Returns the number of tokens in the KV cache
+    LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
+            "avoid using this, it will be removed in the future, instead - count the tokens in user code");

    // Clear the KV cache
    LLAMA_API void llama_kv_cache_clear(
@ -504,17 +389,6 @@ extern "C" {
                       llama_pos   p1,
                       llama_pos   delta);

-    // Integer division of the positions by factor of `d > 1`
-    // If the KV cache is RoPEd, the KV data is updated accordingly
-    // p0 < 0 : [0,  p1]
-    // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_div(
-            struct llama_context * ctx,
-                    llama_seq_id   seq_id,
-                       llama_pos   p0,
-                       llama_pos   p1,
-                             int   d);
-
    //
    // State / sessions
    //
@ -563,7 +437,7 @@ extern "C" {
            struct llama_context * ctx,
                     llama_token * tokens,
                         int32_t   n_tokens,
-                         int32_t   n_past),
+                             int   n_past),
            "use llama_decode() instead");

    // Same as llama_eval, but use float matrix input directly.
@ -572,7 +446,7 @@ extern "C" {
            struct llama_context * ctx,
                           float * embd,
                         int32_t   n_tokens,
-                         int32_t   n_past),
+                             int   n_past),
            "use llama_decode() instead");

    // Return batch for single sequence of tokens starting at pos_0
@ -604,7 +478,7 @@ extern "C" {
    //   0 - success
    //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
    // < 0 - error
-    LLAMA_API int32_t llama_decode(
+    LLAMA_API int llama_decode(
            struct llama_context * ctx,
              struct llama_batch   batch);

@ -643,12 +517,6 @@ extern "C" {
    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line

-    // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int32_t         llama_add_bos_token(const struct llama_model * model);
-
-    // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int32_t         llama_add_eos_token(const struct llama_model * model);
-
    // codellama infill tokens
    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
    LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
@ -665,12 +533,12 @@ extern "C" {
    /// @return Returns a negative number on failure - the number of tokens that would have been returned
    /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext.
    ///                Does not insert a leading space.
-    LLAMA_API int32_t llama_tokenize(
+    LLAMA_API int llama_tokenize(
        const struct llama_model * model,
                      const char * text,
-                         int32_t   text_len,
+                             int   text_len,
                     llama_token * tokens,
-                         int32_t   n_max_tokens,
+                             int   n_max_tokens,
                            bool   add_bos,
                            bool   special);

@ -678,11 +546,11 @@ extern "C" {
    // Uses the vocabulary in the provided context.
    // Does not write null terminator to the buffer.
    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
-    LLAMA_API int32_t llama_token_to_piece(
+    LLAMA_API int llama_token_to_piece(
              const struct llama_model * model,
                           llama_token   token,
                                  char * buf,
-                               int32_t   length);
+                                  int    length);

    //
    // Grammar
@ -716,21 +584,14 @@ extern "C" {
                           float   penalty_present);

    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
-    /// @param logits Logits extracted from the original generation context.
-    /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
-    /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-    LLAMA_API void llama_sample_apply_guidance(
-              struct llama_context * ctx,
-                             float * logits,
-                             float * logits_guidance,
-                             float   scale);
-
-    LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+    /// @param guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+    /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+    LLAMA_API void llama_sample_classifier_free_guidance(
              struct llama_context * ctx,
            llama_token_data_array * candidates,
              struct llama_context * guidance_ctx,
-                             float   scale),
-              "use llama_sample_apply_guidance() instead");
+                             float   scale);

    /// @details Sorts candidate tokens by their logits in descending order and calculates probabilities based on logits.
    LLAMA_API void llama_sample_softmax(
@ -741,7 +602,7 @@ extern "C" {
    LLAMA_API void llama_sample_top_k(
            struct llama_context * ctx,
          llama_token_data_array * candidates,
-                         int32_t   k,
+                             int   k,
                          size_t   min_keep);

    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
@ -772,14 +633,6 @@ extern "C" {
                           float   p,
                          size_t   min_keep);

-    /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
-    LLAMA_API void llama_sample_entropy(
-            struct llama_context * ctx,
-          llama_token_data_array * candidates_p,
-                           float   min_temp,
-                           float   max_temp,
-                           float   exponent_val);
-
    LLAMA_API void llama_sample_temp(
            struct llama_context * ctx,
          llama_token_data_array * candidates,
@ -808,7 +661,7 @@ extern "C" {
          llama_token_data_array * candidates,
                           float   tau,
                           float   eta,
-                         int32_t   m,
+                             int   m,
                           float * mu);

    /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
@ -881,8 +734,8 @@ extern "C" {
        llama_beam_search_callback_fn_t   callback,
                                   void * callback_data,
                                 size_t   n_beams,
-                                int32_t   n_past,
-                                int32_t   n_predict);
+                                    int   n_past,
+                                    int   n_predict);

    // Performance information
    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
--- a/examples/talk-llama/speak
+++ b/examples/talk-llama/speak
@ -9,14 +9,6 @@
 #
 #espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 "$2"

-# piper
-#
-# https://github.com/rhasspy/piper
-#
-# Tested with Linux:
-#
-#echo "$2" | piper --model ~/en_US-lessac-medium.onnx --output-raw | aplay -q -r 22050 -f S16_LE -t raw -
-
 # for Mac
 say "$2"

--- a/examples/talk-llama/talk-llama.cpp
+++ b/examples/talk-llama/talk-llama.cpp
@ -14,7 +14,6 @@
 #include <thread>
 #include <vector>
 #include <regex>
-#include <sstream>

 std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
    auto * model = llama_get_model(ctx);
@ -68,9 +67,6 @@ struct whisper_params {
    bool use_gpu        = true;

    std::string person      = "Georgi";
-    std::string bot_name    = "LLaMA";
-    std::string wake_cmd    = "";
-    std::string heard_ok    = "";
    std::string language    = "en";
    std::string model_wsp   = "models/ggml-base.en.bin";
    std::string model_llama = "models/ggml-llama-7B.bin";
@ -105,10 +101,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-vp"  || arg == "--verbose-prompt") { params.verbose_prompt = true; }
        else if (arg == "-ng"  || arg == "--no-gpu")         { params.use_gpu        = false; }
        else if (arg == "-p"   || arg == "--person")         { params.person         = argv[++i]; }
-        else if (arg == "-bn"   || arg == "--bot-name")      { params.bot_name       = argv[++i]; }
-        else if (arg == "--session")                         { params.path_session   = argv[++i]; }
-        else if (arg == "-w"   || arg == "--wake-command")   { params.wake_cmd       = argv[++i]; }
-        else if (arg == "-ho"  || arg == "--heard-ok")       { params.heard_ok       = argv[++i]; }
+        else if (arg == "--session")                         { params.path_session   = argv[++i];}
        else if (arg == "-l"   || arg == "--language")       { params.language       = argv[++i]; }
        else if (arg == "-mw"  || arg == "--model-whisper")  { params.model_wsp      = argv[++i]; }
        else if (arg == "-ml"  || arg == "--model-llama")    { params.model_llama    = argv[++i]; }
@ -153,9 +146,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -vp,      --verbose-prompt [%-7s] print prompt at start\n",                       params.verbose_prompt ? "true" : "false");
    fprintf(stderr, "  -ng,      --no-gpu         [%-7s] disable GPU\n",                                 params.use_gpu ? "false" : "true");
    fprintf(stderr, "  -p NAME,  --person NAME    [%-7s] person name (for prompt selection)\n",          params.person.c_str());
-    fprintf(stderr, "  -bn NAME, --bot-name NAME  [%-7s] bot name (to display)\n",                       params.bot_name.c_str());
-    fprintf(stderr, "  -w TEXT,  --wake-command T [%-7s] wake-up command to listen for\n",               params.wake_cmd.c_str());
-    fprintf(stderr, "  -ho TEXT, --heard-ok TEXT  [%-7s] said by TTS before generating reply\n",         params.heard_ok.c_str());
    fprintf(stderr, "  -l LANG,  --language LANG  [%-7s] spoken language\n",                             params.language.c_str());
    fprintf(stderr, "  -mw FILE, --model-whisper  [%-7s] whisper model file\n",                          params.model_wsp.c_str());
    fprintf(stderr, "  -ml FILE, --model-llama    [%-7s] llama model file\n",                            params.model_llama.c_str());
@ -234,18 +224,6 @@ std::string transcribe(
    return result;
 }

-std::vector<std::string> get_words(const std::string &txt) {
-    std::vector<std::string> words;
-
-    std::istringstream iss(txt);
-    std::string word;
-    while (iss >> word) {
-        words.push_back(word);
-    }
-
-    return words;
-}
-
 const std::string k_prompt_whisper = R"(A conversation with a person called {1}.)";

 const std::string k_prompt_llama = R"(Text transcript of a never ending dialog, where {0} interacts with an AI assistant named {1}.
@ -281,7 +259,7 @@ int main(int argc, char ** argv) {

    // whisper init

-    struct whisper_context_params cparams = whisper_context_default_params();
+    struct whisper_context_params cparams;
    cparams.use_gpu = params.use_gpu;

    struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams);
@ -304,6 +282,7 @@ int main(int argc, char ** argv) {
    // tune these to your liking
    lcparams.n_ctx      = 2048;
    lcparams.seed       = 1;
+    lcparams.f16_kv     = true;
    lcparams.n_threads  = params.n_threads;

    struct llama_context * ctx_llama = llama_new_context_with_model(model_llama, lcparams);
@ -345,11 +324,12 @@ int main(int argc, char ** argv) {
    float prob0 = 0.0f;

    const std::string chat_symb = ":";
+    const std::string bot_name  = "LLaMA";

    std::vector<float> pcmf32_cur;
    std::vector<float> pcmf32_prompt;

-    const std::string prompt_whisper = ::replace(k_prompt_whisper, "{1}", params.bot_name);
+    const std::string prompt_whisper = ::replace(k_prompt_whisper, "{1}", bot_name);

    // construct the initial prompt for LLaMA inference
    std::string prompt_llama = params.prompt.empty() ? k_prompt_llama : params.prompt;
@ -358,7 +338,7 @@ int main(int argc, char ** argv) {
    prompt_llama.insert(0, 1, ' ');

    prompt_llama = ::replace(prompt_llama, "{0}", params.person);
-    prompt_llama = ::replace(prompt_llama, "{1}", params.bot_name);
+    prompt_llama = ::replace(prompt_llama, "{1}", bot_name);

    {
        // get time string
@ -460,16 +440,6 @@ int main(int argc, char ** argv) {
    bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < (embd_inp.size() * 3 / 4);

    printf("%s : done! start speaking in the microphone\n", __func__);
-
-    // show wake command if enabled
-    const std::string wake_cmd = params.wake_cmd;
-    const int wake_cmd_length = get_words(wake_cmd).size();
-    const bool use_wake_cmd = wake_cmd_length > 0;
-
-    if (use_wake_cmd) {
-        printf("%s : the wake-up command is: '%s%s%s'\n", __func__, "\033[1m", wake_cmd.c_str(), "\033[0m");
-    }
-
    printf("\n");
    printf("%s%s", params.person.c_str(), chat_symb.c_str());
    fflush(stdout);
@ -515,41 +485,10 @@ int main(int argc, char ** argv) {

                audio.get(params.voice_ms, pcmf32_cur);

-                std::string all_heard;
-
-                if (!force_speak) {
-                    all_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prompt_whisper, prob0, t_ms));
-                }
-
-                const auto words = get_words(all_heard);
-
-                std::string wake_cmd_heard;
                std::string text_heard;

-                for (int i = 0; i < (int) words.size(); ++i) {
-                    if (i < wake_cmd_length) {
-                        wake_cmd_heard += words[i] + " ";
-                    } else {
-                        text_heard += words[i] + " ";
-                    }
-                }
-
-                // check if audio starts with the wake-up command if enabled
-                if (use_wake_cmd) {
-                    const float sim = similarity(wake_cmd_heard, wake_cmd);
-
-                    if ((sim < 0.7f) || (text_heard.empty())) {
-                        audio.clear();
-                        continue;
-                    }
-                }
-
-                // optionally give audio feedback that the current text is being processed
-                if (!params.heard_ok.empty()) {
-                    int ret = system((params.speak + " " + std::to_string(voice_id) + " '" + params.heard_ok + "'").c_str());
-                    if (ret != 0) {
-                        fprintf(stderr, "%s: failed to speak\n", __func__);
-                    }
+                if (!force_speak) {
+                    text_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prompt_whisper, prob0, t_ms));
                }

                // remove text between brackets using regex
@ -586,7 +525,7 @@ int main(int argc, char ** argv) {
                force_speak = false;

                text_heard.insert(0, 1, ' ');
-                text_heard += "\n" + params.bot_name + chat_symb;
+                text_heard += "\n" + bot_name + chat_symb;
                fprintf(stdout, "%s%s%s", "\033[1m", text_heard.c_str(), "\033[0m");
                fflush(stdout);

@ -719,7 +658,6 @@ int main(int argc, char ** argv) {
                            text_to_speak += llama_token_to_piece(ctx_llama, id);

                            printf("%s", llama_token_to_piece(ctx_llama, id).c_str());
-                            fflush(stdout);
                        }
                    }

--- a/examples/talk-llama/unicode.h
+++ b/examples/talk-llama/unicode.h
@ -2,9 +2,8 @@

 #include <cassert>
 #include <stdexcept>
-#include <string>
-#include <unordered_map>
 #include <vector>
+#include <unordered_map>

 static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
 {0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},
--- a/examples/talk.wasm/gpt-2.cpp
+++ b/examples/talk.wasm/gpt-2.cpp
@ -155,33 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
        const int n_ctx   = hparams.n_ctx;
        const int n_vocab = hparams.n_vocab;

-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
+        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
+        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b

-        ctx_size += n_vocab*ggml_row_size(wtype, n_embd);         // wte
-        ctx_size +=   n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
-        ctx_size += n_vocab*ggml_row_size(wtype, n_embd);         // lm_head
+        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // wte
+        ctx_size +=   n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
+        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // lm_head

-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b

-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b

-        ctx_size += n_layer*(ggml_row_size(wtype,         3*n_embd*n_embd)); // c_attn_attn_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd));        // c_attn_attn_b
+        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype));         // c_attn_attn_w
+        ctx_size += n_layer*(       3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b

-        ctx_size += n_layer*(ggml_row_size(wtype,         n_embd*n_embd)); // c_attn_proj_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));        // c_attn_proj_b
+        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype));           // c_attn_proj_w
+        ctx_size += n_layer*(       n_embd*ggml_type_sizef(GGML_TYPE_F32));   // c_attn_proj_b

-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_fc_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_fc_b
+        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_fc_w
+        ctx_size += n_layer*(       4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b

-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_proj_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32,   n_embd));        // c_mlp_proj_b
+        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w
+        ctx_size += n_layer*(         n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b

-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

        ctx_size += (6 + 12*n_layer)*256; // object overhead

@ -524,7 +524,8 @@ bool gpt2_eval(
            struct ggml_tensor * KQ_scaled =
                ggml_scale(ctx0,
                        KQ,
-                        1.0f/sqrt(float(n_embd)/n_head));
+                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
+                        );

            // KQ_masked = mask_past(KQ_scaled)
            // [n_past + N, N, 12]
--- a/examples/talk/gpt-2.cpp
+++ b/examples/talk/gpt-2.cpp
@ -155,33 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
        const int n_ctx   = hparams.n_ctx;
        const int n_vocab = hparams.n_vocab;

-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
-        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b
+        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
+        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b

-        ctx_size += n_vocab*ggml_row_size(wtype, n_embd);         // wte
-        ctx_size +=   n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
-        ctx_size += n_vocab*ggml_row_size(wtype, n_embd);         // lm_head
+        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // wte
+        ctx_size +=   n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
+        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // lm_head

-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b

-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b

-        ctx_size += n_layer*(ggml_row_size(wtype,         3*n_embd*n_embd)); // c_attn_attn_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd));        // c_attn_attn_b
+        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype));         // c_attn_attn_w
+        ctx_size += n_layer*(       3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b

-        ctx_size += n_layer*(ggml_row_size(wtype,         n_embd*n_embd)); // c_attn_proj_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));        // c_attn_proj_b
+        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype));           // c_attn_proj_w
+        ctx_size += n_layer*(       n_embd*ggml_type_sizef(GGML_TYPE_F32));   // c_attn_proj_b

-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_fc_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_fc_b
+        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_fc_w
+        ctx_size += n_layer*(       4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b

-        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_proj_w
-        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32,   n_embd));        // c_mlp_proj_b
+        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w
+        ctx_size += n_layer*(         n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b

-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
-        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

        ctx_size += (6 + 12*n_layer)*256; // object overhead

@ -525,7 +525,8 @@ bool gpt2_eval(
            struct ggml_tensor * KQ_scaled =
                ggml_scale(ctx0,
                        KQ,
-                        1.0f/sqrt(float(n_embd)/n_head));
+                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
+                        );

            // KQ_masked = mask_past(KQ_scaled)
            // [n_past + N, N, 12]
--- a/examples/talk/talk.cpp
+++ b/examples/talk/talk.cpp
@ -184,7 +184,7 @@ int main(int argc, char ** argv) {
    }

    // whisper init
-    struct whisper_context_params cparams = whisper_context_default_params();
+    struct whisper_context_params cparams;
    cparams.use_gpu = params.use_gpu;

    struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams);
--- a/examples/wchess/README.md
+++ b/examples/wchess/README.md
@ -1,45 +0,0 @@
-# wchess
-
-Voice-controlled chess using Whisper
-
-Online demo: https://whisper.ggerganov.com/wchess/
-
-https://github.com/ggerganov/whisper.cpp/assets/1991296/c2b2f03c-9684-49f3-8106-357d2d4e67fa
-
-## Command-line tool
-
-```bash
-mkdir build && cd build
-cmake -DWHISPER_SDL2=1 ..
-make -j
-
-./bin/wchess -m ../models/ggml-base.en.bin
-
-Move: start
-
-a b c d e f g h
-r n b q k b n r 8
-p p p p p p p p 7
-. * . * . * . * 6
-* . * . * . * . 5
-. * . * . * . * 4
-* . * . * . * . 3
-P P P P P P P P 2
-R N B Q K B N R 1
-
-White's turn
-[(l)isten/(p)ause/(q)uit]: 
-```
-
-## TODO
-
- Fix bugs in the chess moves logic
- Improve web-browser audio capture - sometimes it does not record the voice properly
- Add support for more languages by making the generated grammar string multilingual
- Explore ways to improve the dynamic grammar to be narrower
-
-PRs welcome!
-
-## Thanks
-
- [chessboardjs](https://chessboardjs.com) for the neat chessboard JS library used in this demo
--- a/examples/wchess/libwchess/CMakeLists.txt
+++ b/examples/wchess/libwchess/CMakeLists.txt
@ -1,19 +1,19 @@
-add_library(wchess-core STATIC
+add_library(libwchess
    WChess.cpp
    WChess.h
    Chessboard.cpp
    Chessboard.h
 )

-target_link_libraries(wchess-core
+target_link_libraries(libwchess
    PUBLIC
    whisper
    common
 )

-target_include_directories(wchess-core
+target_include_directories(libwchess
    PUBLIC
    "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"
 )

-# add_executable(test-chessboard test-chessboard.cpp Chessboard.cpp)
+add_executable(test-chessboard test-chessboard.cpp Chessboard.cpp)
--- a/examples/wchess/libwchess/Chessboard.cpp
+++ b/examples/wchess/libwchess/Chessboard.cpp
--- a/examples/wchess/libwchess/Chessboard.h
+++ b/examples/wchess/libwchess/Chessboard.h
@ -1,33 +1,59 @@
 #pragma once
 #include <string>
-#include <set>
-#include <memory>
+#include <array>
+#include <vector>

-// just basic validation
-// fixme: missing en passant, castling, promotion, etc.
-struct State;
-class Piece;
 class Chessboard {
 public:
    Chessboard();
-    ~Chessboard();
-    std::string process(const std::string& command);
+    std::string process(const std::string& t);
    std::string stringifyBoard();
-    const std::string& grammar() { return m_grammar; }
-    const std::string& prompt() { return m_prompt; }
-    void setPrompt(const std::string& prompt);
+    std::string getRules(const std::string & prompt) const;
+    using Move = std::pair<int, int>;
 private:
-    bool parseCommand(const std::string& command, Piece*& piece, char& pos_to);
-    bool move(Piece& piece, char pos);
-    void flagUpdates(char pos_from, char pos_to);
-    void updatePins(Piece& piece);
-    void detectChecks();
-    void setGrammar();
+    bool move(const Move& move);

-    std::unique_ptr<State> m_state;
-    std::set<char> m_allowedInCheck;
-    bool m_inCheck = false;
+    struct Piece {
+        enum Types {
+            Pawn,
+            Knight,
+            Bishop,
+            Rook,
+            Queen,
+            King,
+            Taken,
+        };
+
+        enum Colors {
+            White,
+            Black,
+        };
+
+        Types type;
+        Colors color;
+        int pos;
+    };
+
+    using PieceSet = std::array<Piece, 16>;
+
+    PieceSet blackPieces;
+    PieceSet whitePieces;
    int m_moveCounter = 0;
-    std::string m_grammar;
-    std::string m_prompt;
+
+    using Board = std::array<Piece*, 64>;
+    Board board;
+
+    std::vector<Move> whiteMoves;
+    std::vector<Move> blackMoves;
+
+    bool validateMove(const Piece& piece, int pos);
+    void getValidMoves(const Piece& piece, std::vector<Move>& moves);
+    // just basic validation
+    // fixme: missing en passant, castling, promotion, etc.
+    bool validatePawnMove(Piece::Colors color, int from_rank, int from_file, int to_rank, int to_file);
+    bool validateKnightMove(Piece::Colors color, int from_rank, int from_file, int to_rank, int to_file);
+    bool validateBishopMove(Piece::Colors color, int from_rank, int from_file, int to_rank, int to_file);
+    bool validateRookMove(Piece::Colors color, int from_rank, int from_file, int to_rank, int to_file);
+    bool validateQueenMove(Piece::Colors color, int from_rank, int from_file, int to_rank, int to_file);
+    bool validateKingMove(Piece::Colors color, int from_rank, int from_file, int to_rank, int to_file);
 };
--- a/examples/wchess/libwchess/WChess.cpp
+++ b/examples/wchess/libwchess/WChess.cpp
@ -17,136 +17,163 @@ WChess::WChess(whisper_context * ctx,

 WChess::~WChess() = default;

-void WChess::set_move(const std::string& moves, float prob) const {
-    if (m_cb.set_move) (*m_cb.set_move)(moves, prob);
+void WChess::set_status(const std::string& msg) const {
+    if (m_cb.set_status) (*m_cb.set_status)(msg);
 }

-void WChess::set_grammar(const std::string& grammar) const {
-    if (m_cb.set_grammar) (*m_cb.set_grammar)(grammar);
+void WChess::set_moves(const std::string& moves) const {
+    if (m_cb.set_moves) (*m_cb.set_moves)(moves);
 }

-bool WChess::get_audio(std::vector<float>& pcmf32) const {
-    if (m_cb.get_audio) return (*m_cb.get_audio)(pcmf32);
+bool WChess::check_running() const {
+    if (m_cb.check_running) return (*m_cb.check_running)();
    return false;
 }

+void WChess::clear_audio() const {
+    if (m_cb.clear_audio) (*m_cb.clear_audio)();
+}
+
+void WChess::get_audio(int ms, std::vector<float>& pcmf32) const {
+    if (m_cb.get_audio) (*m_cb.get_audio)(ms, pcmf32);
+}
+
 std::string WChess::stringify_board() const {
    return m_board->stringifyBoard();
 }

-std::string WChess::get_grammar() const {
-    return m_board->grammar();
-}
-
 void WChess::run() {
+    set_status("loading data ...");
+
    bool have_prompt  = true;
    bool ask_prompt   = !have_prompt;

+    float logprob_min0 = 0.0f;
    float logprob_min  = 0.0f;

+    float logprob_sum0 = 0.0f;
    float logprob_sum  = 0.0f;

+    int n_tokens0 = 0;
    int n_tokens  = 0;

    std::vector<float> pcmf32_cur;
    std::vector<float> pcmf32_prompt;

-    const std::string k_prompt = have_prompt ? "" : "rook to d4, f3";
-    int64_t t_ms = 0;
+    const std::string k_prompt = have_prompt ? "" : "checkmate";

-    if (ask_prompt) {
-        fprintf(stdout, "\n");
-        fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
-        fprintf(stdout, "\n");
-
-        ask_prompt = false;
-    }
-
-    while (get_audio(pcmf32_cur)) {
-        if (!pcmf32_cur.empty()) {
-            // fprintf(stdout, "%s: Processing ...\n", __func__);
-
-            if (!have_prompt) {
-                const auto txt = ::trim(transcribe(pcmf32_cur, logprob_min, logprob_sum, n_tokens, t_ms));
-
-                fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms);
-
-                const float sim = similarity(txt, k_prompt);
-
-                if (txt.length() < 0.8*k_prompt.length() || txt.length() > 1.2*k_prompt.length() || sim < 0.8f) {
-                    fprintf(stdout, "%s: WARNING: prompt not recognized, try again\n", __func__);
-                    ask_prompt = true;
-                } else {
-                    fprintf(stdout, "\n");
-                    fprintf(stdout, "%s: The prompt has been recognized!\n", __func__);
-                    fprintf(stdout, "%s: Waiting for voice commands ...\n", __func__);
-                    fprintf(stdout, "\n");
-
-                    // save the audio for the prompt
-                    pcmf32_prompt = pcmf32_cur;
-                    have_prompt = true;
-                    m_board->setPrompt(k_prompt);
-                }
-            } else {
-                if (!pcmf32_prompt.empty()) pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end());
-                constexpr size_t MIN_SIZE = 1.2 * WHISPER_SAMPLE_RATE;
-                if (MIN_SIZE > pcmf32_cur.size()) pcmf32_cur.insert(pcmf32_cur.begin(), MIN_SIZE - pcmf32_cur.size(), 0.0f);
-
-                // fprintf(stdout, "%s: grammar rules:\n'%s'\n", __func__, m_board->grammar().c_str());
-
-                auto grammar_parsed = grammar_parser::parse(m_board->grammar().c_str());
-                auto grammar_rules  = grammar_parsed.c_rules();
-
-                m_wparams.grammar_rules   = grammar_rules.data();
-                m_wparams.n_grammar_rules = grammar_rules.size();
-
-                m_wparams.i_start_rule    = grammar_parsed.symbol_ids.at("move");
-                auto txt = ::trim(transcribe(pcmf32_cur, logprob_min, logprob_sum, n_tokens, t_ms));
-
-                const float p = 100.0f * std::exp(logprob_min);
-
-                fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str());
-
-                // find the prompt in the text
-                float best_sim = 0.0f;
-                size_t best_len = 0;
-                for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
-                    const auto prompt = txt.substr(0, n);
-
-                    const float sim = similarity(prompt, k_prompt);
-
-                    //fprintf(stderr, "%s: prompt = '%s', sim = %f\n", __func__, prompt.c_str(), sim);
-
-                    if (sim > best_sim) {
-                        best_sim = sim;
-                        best_len = n;
-                    }
-                }
-
-                fprintf(stdout, "%s:   DEBUG: txt = '%s', prob = %.2f%%\n", __func__, txt.c_str(), p);
-                std::string command = ::trim(txt.substr(best_len));
-
-                fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
-                fprintf(stdout, "\n");
-
-                if (!command.empty()) {
-                    set_move(m_board->process(command), p);
-                    set_grammar(m_board->grammar());
-                }
-                if (m_board->grammar().empty()) {
-                    fprintf(stdout, "%s: No more moves possible\n", __func__);
-                    break;
-                }
-            }
-        }
+    while (check_running()) {
+        // delay
+        std::this_thread::sleep_for(std::chrono::milliseconds(100));

        if (ask_prompt) {
            fprintf(stdout, "\n");
            fprintf(stdout, "%s: Say the following phrase: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
            fprintf(stdout, "\n");

+            {
+                char txt[1024];
+                snprintf(txt, sizeof(txt), "Say the following phrase: '%s'", k_prompt.c_str());
+                set_status(txt);
+            }
+
            ask_prompt = false;
        }
+
+        int64_t t_ms = 0;
+
+        {
+            get_audio(m_settings.vad_ms, pcmf32_cur);
+
+            if (!pcmf32_cur.empty()) {
+                fprintf(stdout, "%s: Processing ...\n", __func__);
+                set_status("Processing ...");
+
+                if (!have_prompt) {
+                    const auto txt = ::trim(transcribe(pcmf32_cur, logprob_min, logprob_sum, n_tokens, t_ms));
+
+                    fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", txt.c_str(), "\033[0m", (int) t_ms);
+
+                    const float sim = similarity(txt, k_prompt);
+
+                    if (txt.length() < 0.8*k_prompt.length() || txt.length() > 1.2*k_prompt.length() || sim < 0.8f) {
+                        fprintf(stdout, "%s: WARNING: prompt not recognized, try again\n", __func__);
+                        ask_prompt = true;
+                    } else {
+                        fprintf(stdout, "\n");
+                        fprintf(stdout, "%s: The prompt has been recognized!\n", __func__);
+                        fprintf(stdout, "%s: Waiting for voice commands ...\n", __func__);
+                        fprintf(stdout, "\n");
+
+                        {
+                            char txt[1024];
+                            snprintf(txt, sizeof(txt), "Success! Waiting for voice commands ...");
+                            set_status(txt);
+                        }
+
+                        // save the audio for the prompt
+                        pcmf32_prompt = pcmf32_cur;
+                        have_prompt = true;
+                    }
+                } else {
+                    if (!pcmf32_prompt.empty()) pcmf32_cur.insert(pcmf32_cur.begin(), pcmf32_prompt.begin(), pcmf32_prompt.end());
+                    static const size_t MIN_SIZE = 1.2 * WHISPER_SAMPLE_RATE;
+                    if (MIN_SIZE > pcmf32_cur.size()) pcmf32_cur.insert(pcmf32_cur.begin(), MIN_SIZE - pcmf32_cur.size(), 0.0f);
+
+                    std::string rules = m_board->getRules(k_prompt);
+                    fprintf(stdout, "%s: grammar rules:\n'%s'\n", __func__, rules.c_str());
+
+                    auto grammar_parsed = grammar_parser::parse(rules.c_str());
+                    auto grammar_rules = grammar_parsed.c_rules();
+
+                    m_wparams.grammar_rules   = grammar_rules.data();
+                    m_wparams.n_grammar_rules = grammar_rules.size();
+
+                    m_wparams.i_start_rule    = grammar_parsed.symbol_ids.at("move");
+                    auto txt = ::trim(transcribe(pcmf32_cur, logprob_min, logprob_sum, n_tokens, t_ms));
+
+                    const float p = 100.0f * std::exp(logprob_min);
+
+                    fprintf(stdout, "%s: heard '%s'\n", __func__, txt.c_str());
+
+                    // find how many leading characters of txt best match the activation prompt (try lengths within +/-20% of the prompt)
+                    float best_sim = 0.0f;
+                    size_t best_len = 0;
+                    for (int n = 0.8*k_prompt.size(); n <= 1.2*k_prompt.size(); ++n) {
+                        const auto prompt = txt.substr(0, n);
+
+                        const float sim = similarity(prompt, k_prompt);
+
+                        //fprintf(stderr, "%s: prompt = '%s', sim = %f\n", __func__, prompt.c_str(), sim);
+
+                        if (sim > best_sim) {
+                            best_sim = sim;
+                            best_len = n;
+                        }
+                    }
+
+                    fprintf(stdout, "%s:   DEBUG: txt = '%s', prob = %.2f%%\n", __func__, txt.c_str(), p);
+                    std::string command = ::trim(best_len < txt.size() ? txt.substr(best_len) : std::string()); // best_len may exceed txt.size() for short transcriptions; substr(pos) throws std::out_of_range when pos > size()
+
+                    fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
+                    fprintf(stdout, "\n");
+
+                    {
+                        char txt[1024];
+                        snprintf(txt, sizeof(txt), "Command '%s', (t = %d ms)", command.c_str(), (int) t_ms);
+                        set_status(txt);
+                    }
+                    if (!command.empty()) {
+                        auto move = m_board->process(command);
+                        if (!move.empty()) {
+                            set_moves(std::move(move));
+                        }
+                    }
+                }
+
+                clear_audio();
+            }
+        }
    }
 }

--- a/examples/wchess/libwchess/WChess.h
+++ b/examples/wchess/libwchess/WChess.h
@ -8,16 +8,18 @@ class Chessboard;

 class WChess {
 public:
+    using SetStatusCb = void (*)(const std::string &); // status text sink; run() passes strings such as "Processing ..." to set_status() (presumably forwarded here - confirm in WChess.cpp)
    using CheckRunningCb = bool (*)();
-    using GetAudioCb = bool (*)(std::vector<float> &);
-    using SetMovesCb = void (*)(const std::string &, float);
-    using SetGrammarCb = void (*)(const std::string &);
+    using GetAudioCb = void (*)(int, std::vector<float> &); // (ms, out samples); run() calls get_audio(m_settings.vad_ms, pcmf32_cur)
+    using SetMovesCb = void (*)(const std::string &); // run() calls set_moves() with the non-empty result of m_board->process(command)
    using ClearAudioCb = void (*)();

    struct callbacks {
+        SetStatusCb set_status = nullptr; // optional in practice: the command-line app leaves it unset (NOTE(review): confirm the wrapper checks for nullptr)
+        CheckRunningCb check_running = nullptr; // evaluated as the run() loop condition
        GetAudioCb get_audio = nullptr;
-        SetMovesCb set_move = nullptr;
-        SetGrammarCb set_grammar = nullptr;
+        SetMovesCb set_moves = nullptr; // replaces set_move/set_grammar; no probability argument any more
+        ClearAudioCb clear_audio = nullptr; // run() calls clear_audio() after each non-empty audio chunk has been handled
    };

    struct settings {
@ -38,16 +40,13 @@ public:
    ~WChess();

    void run();
-
    std::string stringify_board() const;
-
-    std::string get_grammar() const;
-
 private:
-    bool get_audio(std::vector<float>& pcmf32) const;
-    void set_move(const std::string& moves, float prob) const;
-    void set_grammar(const std::string& grammar) const;
-
+    void get_audio(int ms, std::vector<float>& pcmf32) const;
+    void set_status(const std::string& msg) const;
+    void set_moves(const std::string& moves) const;
+    bool check_running() const;
+    void clear_audio() const;
    std::string transcribe(
                    const std::vector<float> & pcmf32,
                    float & logprob_min,
--- a/examples/wchess/libwchess/test-chessboard.cpp
+++ b/examples/wchess/libwchess/test-chessboard.cpp
@ -11,107 +11,78 @@


 int main() {
-    {
-        Chessboard chess;
-
-        ASSERT(chess.process("pawn to d4") == "d2-d4");
-        ASSERT(chess.process("e5") == "e7-e5");
-        ASSERT(chess.process("c1 h6") == "c1-h6");
-        ASSERT(chess.process("queen h4") == "d8-h4");
-        ASSERT(chess.process("bishop to g5") == "h6-g5");
-        ASSERT(chess.process("bishop to b4") == "f8-b4");
-        ASSERT(chess.process("c4") == "");
-        ASSERT(chess.process("knight c3") == "b1-c3");
-        ASSERT(chess.process("knight c6") == "b8-c6");
-        ASSERT(chess.process("f3") == "");
-    }

    {
+        // pawns
        Chessboard chess;

-        ASSERT(chess.process("d4") == "d2-d4");
-        ASSERT(chess.process("e5") == "e7-e5");
-        ASSERT(chess.process("e4") == "e2-e4");
-        ASSERT(chess.process("queen h4") == "d8-h4");
-        ASSERT(chess.process("queen h5") == "d1-h5");
-        ASSERT(chess.process("f5") == "");
-        ASSERT(chess.process("g6") == "g7-g6");
-        ASSERT(chess.process("knight e2") == "g1-e2");
-        ASSERT(chess.process("f5") == "f7-f5");
-        ASSERT(chess.process("knight g3") == "e2-g3");
-        ASSERT(chess.process("g5") == "");
-        ASSERT(chess.process("king e7") == "e8-e7");
-        ASSERT(chess.process("f4") == "f2-f4");
-        ASSERT(chess.process("g5") == "g6-g5");
-    }
-
-    {
-        Chessboard chess;
-
-        ASSERT(chess.process("e4") == "e2-e4");
-        ASSERT(chess.process("c5") == "c7-c5");
-        ASSERT(chess.process("e5") == "e4-e5");
-        ASSERT(chess.process("c4") == "c5-c4");
-        ASSERT(chess.process("e6") == "e5-e6");
-        ASSERT(chess.process("c3") == "c4-c3");
-        ASSERT(chess.process("e7") == "");
-        ASSERT(chess.process("f7") == "e6-f7");
-        ASSERT(chess.process("d2") == "");
-        ASSERT(chess.process("king to f7") == "e8-f7");
-        ASSERT(chess.process("f4") == "f2-f4");
-        ASSERT(chess.process("d2") == "c3-d2");
-        ASSERT(chess.process("f5") == "");
-        ASSERT(chess.process("king to e2") == "e1-e2");
-        ASSERT(chess.process("king to g6") == "f7-g6");
-        ASSERT(chess.process("f5") == "f4-f5");
-        ASSERT(chess.process("e6") == "");
-        ASSERT(chess.process("king to h5") == "g6-h5");
-        ASSERT(chess.process("g4") == "g2-g4");
-        ASSERT(chess.process("king to g5") == "h5-g5");
+        ASSERT(chess.process("pawn to d4, e5, e3, pawn to d5") == "d2-d4 e7-e5 e2-e3 d7-d5");
+        ASSERT(chess.process("pawn to d4") == ""); // wrong
+        ASSERT(chess.process("pawn to c5") == ""); // wrong
+        ASSERT(chess.process("pawn to d5") == ""); // wrong
+        ASSERT(chess.process("pawn to d3") == ""); // wrong
+        ASSERT(chess.process("pawn to f5") == ""); // wrong, white's turn
        ASSERT(chess.process("h4") == "h2-h4");
-        ASSERT(chess.process("king to h5") == "");
-        ASSERT(chess.process("king to g6") == "");
-        ASSERT(chess.process("king to h6") == "g5-h6");
-        ASSERT(chess.process("bishop to d2") == "c1-d2");
-        ASSERT(chess.process("king to g5") == "");
-        ASSERT(chess.process("g5") == "g7-g5");
+        ASSERT(chess.process("d4") == "e5-d4");
+        ASSERT(chess.process("e4") == "e3-e4");
+        ASSERT(chess.process("d4") == ""); // wrong
+        ASSERT(chess.process("e4") == "d5-e4");
    }

    {
+        // rook
        Chessboard chess;
-        ASSERT(chess.process("f4") == "f2-f4");
-        ASSERT(chess.process("e5") == "e7-e5");
-        ASSERT(chess.process("g4") == "g2-g4");
-        ASSERT(chess.process("queen to h4") == "d8-h4#");
-        ASSERT(chess.process("knight f3") == "");
-        ASSERT(chess.grammar().empty());
+
+        ASSERT(chess.process("rook to a3") == ""); // wrong
+        ASSERT(chess.process("a4, h5, rook to a3, rook to h6") == "a2-a4 h7-h5 a1-a3 h8-h6");
+        ASSERT(chess.process("rook to d3, rook to e6") == "a3-d3 h6-e6");
+        ASSERT(chess.process("rook to d4, rook to e5") == "d3-d4 e6-e5");
+        ASSERT(chess.process("rook to a4") == ""); // wrong
+        ASSERT(chess.process("rook to d8") == ""); // wrong
+        ASSERT(chess.process("rook to d3") == "d4-d3");
+        ASSERT(chess.process("rook to e2") == "e5-e2");
    }

    {
+        // knight
        Chessboard chess;
-        ASSERT(chess.process("f4") == "f2-f4");
-        ASSERT(chess.process("e5") == "e7-e5");
-        ASSERT(chess.process("g4") == "g2-g4");
-        ASSERT(chess.process("d5") == "d7-d5");
-        ASSERT(chess.process("g1 f3") == "g1-f3");
-        ASSERT(chess.process("queen to h4") == "d8-h4");
-        ASSERT(!chess.grammar().empty());
+
+        ASSERT(chess.process("knight to c3, knight to c6") == "b1-c3 b8-c6");
+        ASSERT(chess.process("knight to c3") == ""); // wrong
+        ASSERT(chess.process("knight to a2") == ""); // wrong
+        ASSERT(chess.process("knight to b4") == ""); // wrong, white's turn
+        ASSERT(chess.process("knight to b5") == "c3-b5");
+        ASSERT(chess.process("knight to a5") == "c6-a5");
+        ASSERT(chess.process("knight to c7") == "b5-c7");
    }

    {
+        // bishop
        Chessboard chess;
-        ASSERT(chess.process("knight c3") == "b1-c3");
-        ASSERT(chess.process("knight c6") == "b8-c6");
-        ASSERT(chess.process("knight b5") == "c3-b5");
-        ASSERT(chess.process("knight f6") == "g8-f6");
-        ASSERT(chess.process("knight d6") == "b5-d6");
-        ASSERT(chess.process("knight d4") == "");
-        ASSERT(chess.process("d6") == "c7-d6");
-        ASSERT(chess.process("e4") == "e2-e4");
-        ASSERT(chess.process("knight d4") == "c6-d4");
-        ASSERT(chess.process("d3") == "d2-d3");
-        ASSERT(chess.process("knight e4") == "f6-e4");
-        ASSERT(chess.process("king to e2") == "");
-        ASSERT(chess.process("king to d2") == "");
+
+        ASSERT(chess.process("b3, b6, bishop to b2, bishop to b7") == "b2-b3 b7-b6 c1-b2 c8-b7");
+        ASSERT(chess.process("bishop to a1") == ""); // wrong
+        ASSERT(chess.process("bishop to h8") == ""); // wrong
+        ASSERT(chess.process("bishop to a6") == ""); // wrong, white's turn
+        ASSERT(chess.process("bishop to g7") == "b2-g7");
+    }
+
+    {
+        // queen
+        Chessboard chess;
+        ASSERT(chess.process("queen to d8") == ""); // wrong
+        ASSERT(chess.process("queen to f1") == ""); // wrong
+        ASSERT(chess.process("queen to h5") == ""); // wrong
+        ASSERT(chess.process("e3, d5, queen to h5, queen to d6") == "e2-e3 d7-d5 d1-h5 d8-d6");
+        ASSERT(chess.process("queen to c5") == ""); // wrong, white's turn
+        ASSERT(chess.process("queen to f7") == "h5-f7");
+    }
+
+    {
+        // king
+        Chessboard chess;
+        ASSERT(chess.process("d3, d6, king to d2, king to d7, king to c3, king to c6, king to c4") == "d2-d3 d7-d6 e1-d2 e8-d7 d2-c3 d7-c6 c3-c4");
+        ASSERT(chess.process("bishop to e6") == "c8-e6");
+        ASSERT(chess.process("king to b3") == "c4-b3"); // !! king stays on the e6 bishop's diagonal (still in check); accepted because check detection is not implemented
    }
 }
--- a/examples/wchess/wchess.cmd/CMakeLists.txt
+++ b/examples/wchess/wchess.cmd/CMakeLists.txt
@ -4,5 +4,5 @@ if (WHISPER_SDL2)

    include(DefaultTargetOptions)

-    target_link_libraries(${TARGET} PRIVATE wchess-core common-sdl ${CMAKE_THREAD_LIBS_INIT})
-endif ()
+    target_link_libraries(${TARGET} PRIVATE libwchess common-sdl ${CMAKE_THREAD_LIBS_INIT})
+endif ()
--- a/examples/wchess/wchess.cmd/wchess.cmd.cpp
+++ b/examples/wchess/wchess.cmd/wchess.cmd.cpp
@ -7,7 +7,6 @@

 #include "WChess.h"
 #include "common-sdl.h"
-#include <iostream>

 #include <memory>
 #include <thread>
@ -110,61 +109,17 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
 }

 std::unique_ptr<WChess> g_wchess;
-int g_moveCount = 0;
-void set_move(const std::string & move, float) {
-    if (!move.empty()) {
-        g_moveCount++;
-        fprintf(stdout, "Move: %s\n\n", move.c_str());
-    }
-    else fprintf(stdout, "Move rejected\n\n");
-    fprintf(stdout, "%s\n", g_wchess->stringify_board().c_str());
-    fprintf(stdout, "%s\n", g_moveCount ? "White's turn" : "Black's turn");
+void set_moves(const std::string & moves) { // WChess::SetMovesCb for the command-line app (registered as cb.set_moves in main)
+    if (!moves.empty()) fprintf(stdout, "%s", g_wchess->stringify_board().c_str()); // only the board is printed, not the move text; requires g_wchess to be set
 }

 audio_async g_audio(30*1000);
-bool g_listening = false;
-std::vector<float> g_pcmf32;
-
-bool read_input() {
-    std::string input;
-    while (true) {
-        fprintf(stdout, "[(l)isten/(p)ause/(q)uit]: ");
-        std::cin >> input;
-        fprintf(stdout, "\n");
-        if (input[0] == 'q') {
-            fprintf(stdout, "Quitting\n");
-            return false;
-        }
-        if (input[0] == 'l') {
-            if (!g_listening) {
-                fprintf(stdout, "Listening\n");
-                g_listening = true;
-                g_pcmf32.clear();
-                g_audio.resume();
-                g_audio.clear();
-            }
-            else fprintf(stdout, "Still listening\n");
-            return true;
-        }
-        else {
-            if (g_listening) {
-                g_listening = false;
-                g_audio.get(0, g_pcmf32);
-                g_audio.pause();
-                fprintf(stdout, "Processing\n");
-            }
-            else fprintf(stdout, "Not listening\n");
-            return true;
-        }
-    }
-    return true;
+void get_audio(int ms, std::vector<float> & pcmf32_cur) { // WChess::GetAudioCb: hand the request straight to the global capture object
+    g_audio.get(ms, pcmf32_cur); // presumably fills pcmf32_cur with the last `ms` milliseconds of captured audio - confirm in common-sdl.h
 }

-bool get_audio(std::vector<float> & pcmf32_cur) {
-    if (!read_input()) return false;
-    if (!g_pcmf32.empty()) pcmf32_cur = std::move(g_pcmf32);
-    else pcmf32_cur.clear();
-    return true;
+void clear_audio() { // WChess::ClearAudioCb: registered as cb.clear_audio in main
+    g_audio.clear(); // forwards to audio_async::clear() on the global capture object
 }

 int main(int argc, char ** argv) {
@ -182,14 +137,10 @@ int main(int argc, char ** argv) {

    // whisper init

-    struct whisper_context_params cparams = whisper_context_default_params();
+    struct whisper_context_params cparams = whisper_context_default_params(); // start from library defaults; a bare declaration leaves any field not assigned below indeterminate
    cparams.use_gpu = params.use_gpu;

    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
-    if (!ctx) {
-        fprintf(stderr, "%s: whisper_init_from_file_with_params() failed!\n", __func__);
-        return 1;
-    }

    // init audio

@ -198,35 +149,42 @@ int main(int argc, char ** argv) {
        return 1;
    }

-    struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
-    wparams.offset_ms        = 0;
-    wparams.translate        = false;
-    wparams.no_context       = true;
-    wparams.single_segment   = true;
-    wparams.print_realtime   = false;
+    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH);
+
    wparams.print_progress   = false;
-    wparams.print_timestamps = true;
-    wparams.print_special    = false;
-    wparams.no_timestamps    = true;
+    wparams.print_special    = params.print_special;
+    wparams.print_realtime   = false;
+    wparams.print_timestamps = !params.no_timestamps;
+    wparams.translate        = params.translate;
+    wparams.no_context       = true;
+    wparams.no_timestamps    = params.no_timestamps;
+    wparams.single_segment   = true;
+    wparams.max_tokens       = params.max_tokens;
+    wparams.language         = params.language.c_str();
+    wparams.n_threads        = params.n_threads;

-    wparams.max_tokens       = 32;
-    wparams.audio_ctx        = 768; // partial encoder context for better performance
+    wparams.audio_ctx = params.audio_ctx;
+    wparams.speed_up  = params.speed_up;

-    wparams.temperature     = 0.0f;
-    wparams.temperature_inc = 2.0f;
-    wparams.greedy.best_of  = 1;
+    wparams.temperature     = 0.4f;
+    wparams.temperature_inc = 1.0f;
+    wparams.greedy.best_of  = 5;

-    wparams.beam_search.beam_size = 1;
-
-    wparams.language         = "en";
-
-    wparams.grammar_penalty = 100.0;
+    wparams.beam_search.beam_size = 5;

    wparams.initial_prompt = params.context.data();

+    g_audio.resume();
+
+    // wait for 1 second to avoid any buffered noise
+    std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+    g_audio.clear();
+
    WChess::callbacks cb;
+    cb.check_running = sdl_poll_events;
    cb.get_audio = get_audio;
-    cb.set_move = set_move;
+    cb.set_moves = set_moves;
+    cb.clear_audio = clear_audio;

    WChess::settings s;
    s.vad_ms = 2000;
@ -237,9 +195,11 @@ int main(int argc, char ** argv) {
    s.print_energy = params.print_energy;

    g_wchess.reset(new WChess(ctx, wparams, cb, s));
-    set_move("start", 0);
+    set_moves("start");
    g_wchess->run();

+    g_audio.pause();
+
    whisper_print_timings(ctx);
    whisper_free(ctx);

--- a/examples/wchess/wchess.wasm/CMakeLists.txt
+++ b/examples/wchess/wchess.wasm/CMakeLists.txt
@ -8,7 +8,7 @@ include(DefaultTargetOptions)

 target_link_libraries(${TARGET} PRIVATE
    common
-    wchess-core
+    libwchess
    )

 unset(EXTRA_FLAGS)
--- a/examples/wchess/wchess.wasm/chessboardjs-1.0.0/js/chessboard-1.0.0/CHANGELOG.md
+++ b/examples/wchess/wchess.wasm/chessboardjs-1.0.0/js/chessboard-1.0.0/CHANGELOG.md
--- a/examples/wchess/wchess.wasm/chessboardjs-1.0.0/js/chessboard-1.0.0/LICENSE.md
+++ b/examples/wchess/wchess.wasm/chessboardjs-1.0.0/js/chessboard-1.0.0/LICENSE.md
--- a/examples/wchess/wchess.wasm/chessboardjs-1.0.0/js/chessboard-1.0.0/README.md
+++ b/examples/wchess/wchess.wasm/chessboardjs-1.0.0/js/chessboard-1.0.0/README.md
--- a/examples/wchess/wchess.wasm/chessboardjs-1.0.0/js/chessboard-1.0.0/package.json
+++ b/examples/wchess/wchess.wasm/chessboardjs-1.0.0/js/chessboard-1.0.0/package.json
--- a/examples/wchess/wchess.wasm/index-tmpl.html
+++ b/examples/wchess/wchess.wasm/index-tmpl.html
@ -1,11 +1,7 @@
 <!doctype html>
 <html lang="en-us">
    <head>
-        <title>wchess : voice-controlled chess using Whisper + WebAssembly</title>
-        <script src="https://cdnjs.cloudflare.com/ajax/libs/iframe-resizer/4.3.1/iframeResizer.contentWindow.min.js"></script>
-
-        <meta name="viewport" content="width=device-width, initial-scale=0.7, maximum-scale=1, minimum-scale=0.7, user-scalable=no"/>
-        <meta name="apple-mobile-web-app-capable" content="yes" />
+        <title>wchess : Voice assistant example using Whisper + WebAssembly</title>

        <style>
            #output {
@ -27,145 +23,59 @@
                overflow-wrap: normal;
                overflow-x: scroll;
            }
-            .button {
-                background-color: #000000;
-                color: #FFFFFF;
-                padding: 20px;
-                border-radius: 10px;
-                -moz-border-radius: 10px;
-                -webkit-border-radius: 10px;
-                margin:10px;
-                width:  100px;
-                height:  50px;
-                -webkit-touch-callout: none; /* Safari */
-                -webkit-user-select: none; /* Chrome */
-                -moz-user-select: none; /* Firefox */
-                -ms-user-select: none; /* Internet Explorer/Edge */
-                user-select: none;
-            }
-            button[disabled]{
-                background-color: #cccccc;
-                color: #666666;
-                padding: 20px;
-                border-radius: 10px;
-                -moz-border-radius: 10px;
-                -webkit-border-radius: 10px;
-                margin:10px;
-                width: 100px;
-            }
-            .center {
-                display: flex;
-                justify-content: center;
-                align-items: center;
-                width: 500px;
-            }
-            #description {
-                width: 500px;
-            }
        </style>
        <link rel="stylesheet" href="css/chessboard-1.0.0.min.css" integrity="sha384-q94+BZtLrkL1/ohfjR8c6L+A6qzNH9R2hBLwyoAfu3i/WCvQjzL2RQJ3uNHDISdU" crossorigin="anonymous">
    </head>
-    <body>
+    <body onload="loadWhisper()">
        <div id="main-container">
-            <div id="description">
-                <b>wchess : voice-controlled chess using Whisper + WebAssembly</b>
+            <b>wchess : Voice assistant example using Whisper + WebAssembly</b>

-                <br><br>
+            <br><br>

-                This is a demonstration of using Whisper to recognize voice commands in the browser.
+            You can find more about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/wchess">GitHub</a>.

-                <br><br>
+            <br><br>

-                Usage:<br>
+            <b>More examples:</b>
+                <a href="https://whisper.ggerganov.com/">main</a> |
+                <a href="https://whisper.ggerganov.com/bench">bench</a> |
+                <a href="https://whisper.ggerganov.com/stream">stream</a> |
+                <a href="https://whisper.ggerganov.com/command">command</a> |
+                <a href="https://whisper.ggerganov.com/talk">talk</a> |

-                <ul>
-                    <li>Select a Whisper model</li>
-                    <li>Accept the microphone permission request if prompted</li>
-                    <li>Hold the button and say a chess move (e.g. "Knight to c3")</li>
-                    <li>Release the button and wait for the move to be recognized</li>
-                    <li>Repeat</li>
-                </ul>
-
-                Examples:<br>
-
-                <ul>
-                    <li><b>"d4"</b></li>
-                    <li><b>"e2 e4"</b></li>
-                    <li><b>"Knight f3"</b></li>
-                    <li><b>"Bishop to b5"</b></li>
-                </ul>
-
-                Features:<br>
-
-                <ul>
-                    <li>Model quantization for reduced memory footprint (~42MB)</li>
-                    <li><a href="https://github.com/ggerganov/whisper.cpp/pull/1229">Grammar-based sampling</a> for improved recognition accuracy</li>
-                </ul>
-
-                <b>
-                Note that not all chess moves are supported. For example, castling and pawn promotion
-                currently do not work, but can be easily implemented. There could also be some bugs in
-                the move handling logic in general. The main reason for that is to keep the implementation
-                simple. The assumption is that a real application would already have a proper move
-                validation logic in place.<br><br>
-
-                The main purpose of this example is to demonstrate the capabilities of whisper.cpp and
-                its application in the browser for voice recognition locally on your device.
-                </b>
-
-                <br><br>
-
-                You can find more about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/wchess">GitHub</a>.
-
-                <br><br>
-
-                <b>More examples:</b>
-                    <a href="https://whisper.ggerganov.com/">main</a> |
-                    <a href="https://whisper.ggerganov.com/bench">bench</a> |
-                    <a href="https://whisper.ggerganov.com/stream">stream</a> |
-                    <a href="https://whisper.ggerganov.com/command">command</a> |
-                    <a href="https://whisper.ggerganov.com/talk">talk</a> |
-
-                <br><br>
-
-            </div>
+            <br><br>

            <hr>

            <div id="model-whisper">
                Whisper model: <span id="model-whisper-status"></span>
-                <button id="fetch-whisper-tiny-en" onclick="loadWhisper()">tiny.en (Q8_0, 42 MB)</button>
                <span id="fetch-whisper-progress"></span>
-                <br><br>
-                <button id="clear" onclick="clearCache()">Clear browser cache</button>
+                <button id="clear" onclick="clearCache()">Clear Cache</button>
                <!--
                    <input type="file" id="file" name="file" onchange="loadFile(event, 'whisper.bin')" />
                -->
            </div>

-            <div id="game">
-                <br>
-                <div id="chessboard" style="width: 500px"></div>
-                <script src="js/jquery-3.7.1.min.js"></script>
-                <script src="js/chessboard-1.0.0.min.js"></script>
-                <script>
-                    var board = Chessboard('chessboard', 'start')
-                    var move_count = 0;
-                </script>
+            <br>
+            <div id="myBoard" style="width: 400px"></div>
+            <script src="js/jquery-3.7.1.min.js"></script>
+            <script src="js/chessboard-1.0.0.min.js"></script>
+            <script>
+                var board = Chessboard('myBoard', 'start')
+            </script>

-                <br>
+            <br>

-                <div id="state">
-                    Status: <b><span id="state-status">select model</span></b>
+            <div id="input">
+                <button id="toggler" disabled>Hold</button>
+            </div>

-                    <div id="input" class="center">
-                        <button id="toggler" class="button" onselectstart="return false" style="display: none">Hold</button>
-                    </div>
+            <br>

-                    <pre id="state-grammar">[The grammar will be displayed here]</pre>
+            <div id="state">
+                Status: <b><span id="state-status">not started</span></b>

-                    <pre id="state-moves">[The moves will be displayed here]</pre>
-                </div>
+                <pre id="state-moves">[The moves will be displayed here]</pre>
            </div>

            <hr>
@ -183,6 +93,7 @@

            <ul>
                <li>To use a modern web browser (e.g. Chrome, Firefox)</li>
+                <li>To use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
                <li>Your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
            </ul>

@ -207,9 +118,6 @@

            // model name
            var model_whisper = null;
-            var model_file = null;
-
-            var module_ready = null;

            var Module = {
                print: printTextarea,
@ -224,29 +132,17 @@
                },
                postRun: function() {
                    printTextarea('js: Module initialized successfully!');
-                    module_ready = true;
-                    initInstance();
+                    instance = Module.init('whisper.bin'); // NOTE(review): 'whisper.bin' is fetched asynchronously by loadWhisper() (started from <body onload>) - confirm the file already exists when postRun fires
+
+                    if (instance) {
+                        printTextarea("js: whisper initialized, instance: " + instance);
+                    }
+                    else {
+                        printTextarea("js: failed to initialize whisper"); // no retry: unlike the removed initInstance(), nothing re-attempts init once the model has been stored
+                    }
                }
            };

-            function initInstance() {
-                if (!module_ready || !model_file || instance) return
-
-                instance = Module.init(model_file);
-
-                if (instance) {
-                    setStatus('Ready');
-                    printTextarea("js: whisper initialized, instance: " + instance);
-                }
-                else {
-                    printTextarea("js: failed to initialize whisper");
-                }
-            }
-
-            function setStatus(text) {
-                document.getElementById('state-status').innerHTML = text;
-            }
-
            //
            // fetch models
            //
@ -270,21 +166,35 @@

                document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';

-                model_file = fname;
-                initInstance();
+                if (model_whisper != null) {
+                    document.getElementById('toggler').disabled = false;
+                }
            }

            function loadWhisper() {
-                setStatus('Loading')
-                //let url     = 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q8_0.bin';
-                let url     = 'https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en-q8_0.bin';
-                let dst     = 'whisper.bin';
-                let size_mb = 42;
+                // let urls = {
+                //     'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
+                //     'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',

-                model_whisper = 'tiny.en-q8_0';
+                //     'tiny-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
+                //     'base-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
+                // };
+
+                // let sizes = {
+                //     'tiny.en': 75,
+                //     'base.en': 142,
+
+                //     'tiny-en-q5_1':   31,
+                //     'base-en-q5_1':   57,
+                // };
+
+                let url     = 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q8_0.bin'; // Q8_0-quantized tiny.en model
+                let dst     = 'whisper.bin';
+                let size_mb = 42; // size of the Q8_0 file, not the 75 MB of the unquantized tiny.en model
+
+                model_whisper = 'tiny.en-q8_0'; // name shown in the "loading"/"loaded" status text; keep it in sync with url

                document.getElementById('model-whisper-status').innerHTML = 'loading "' + model_whisper + '" ... ';
-                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';

                cbProgress = function(p) {
                    let el = document.getElementById('fetch-whisper-progress');
@ -297,30 +207,6 @@
                };

                loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
-
-                // init audio capture so that the user receives a permission request
-                {
-                    let context = new AudioContext({
-                        sampleRate: 16000,
-                        channelCount: 1,
-                        echoCancellation: false,
-                        autoGainControl:  true,
-                        noiseSuppression: true,
-                    });
-                    navigator.mediaDevices.getUserMedia({audio: true, video: false})
-                        .then(function(s) {
-                            stream = s;
-                            stream.getTracks().forEach(function(track) {
-                                track.stop();
-                            });
-                        })
-                        .catch(function(err) {
-                            printTextarea('js: error getting audio stream: ' + err);
-                        });
-                    context.close();
-                }
-
-                document.getElementById('toggler').style.display = 'block';
            }

            //
@ -339,9 +225,8 @@
            window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;

            function stopRecording() {
-                if (mediaRecorder) {
-                    mediaRecorder.stop();
-                }
+                Module.set_status("paused"); // forced status reported by Module.get_status()
+                if (mediaRecorder) { mediaRecorder.stop(); } // mediaRecorder is reset to null after decoding; skip when nothing is recording
            }

            function startRecording() {
@ -372,6 +257,10 @@

                            reader.onload = function(event) {
                                var buf = new Uint8Array(reader.result);
+
+                                if (!context) {
+                                    return;
+                                }
                                context.decodeAudioData(buf.buffer, function(audioBuffer) {
                                    var offlineContext = new OfflineAudioContext(audioBuffer.numberOfChannels, audioBuffer.length, audioBuffer.sampleRate);
                                    var source = offlineContext.createBufferSource();
@ -381,8 +270,11 @@

                                    offlineContext.startRendering().then(function(renderedBuffer) {
                                        let audio = renderedBuffer.getChannelData(0);
-                                        printTextarea('js: number of samples: ' + audio.length);
-                                        Module.set_audio(instance, audio);
+
+                                        if (instance) {
+                                            printTextarea('js: number of samples: ' + audio.length);
+                                            Module.set_audio(instance, audio);
+                                        }
                                    });

                                    mediaRecorder = null;
@ -411,34 +303,25 @@
            //

            var nLines = 0;
+            var intervalUpdate = null;
            var movesAll = '';

-            // document.body.addEventListener('keydown', function(event) {
-            //     if (event.keyCode === 32) {
-            //         document.getElementById('toggler').innerText = "";
-            //         onStart();
-            //     }
-            // }, true);
-
-            // document.body.addEventListener('keyup', function(event) {
-            //     if (event.keyCode === 32) {
-            //         document.getElementById('toggler').innerText = "Hold";
-            //         onStop();
-            //     }
-            // }, true);
-
-            document.getElementById('toggler').addEventListener("touchstart", function(event){
-                this.innerText = "";
-                onStart();
+            document.body.addEventListener('keydown', function(event) {
+                if (event.keyCode === 32 && !event.repeat) { // space bar; ignore auto-repeat events fired while the key is held
+                    document.getElementById('toggler').innerText = "Release";
+                    onStart(); // start recording once per physical key press
+                }
            }, true);

-            document.getElementById('toggler').addEventListener("touchend", function(event){
-                this.innerText = "Hold";
-                onStop();
-            }, true)
+            document.body.addEventListener('keyup', function(event) {
+                if (event.keyCode === 32) { // space bar released
+                    document.getElementById('toggler').innerText = "Hold";
+                    onStop(); // stop recording and start polling for recognized moves
+                }
+            }, true);

            document.getElementById('toggler').addEventListener('mousedown', function(event) {
-                this.innerText = "";
+                this.innerText = "Release";
                onStart();
            }, true);

@ -448,49 +331,43 @@
            }, true);

            function onStart() {
-                if (!instance) return;
-                setStatus('Listening');
+                if (!instance) { // nothing to record for while `instance` is unset
+                    return;
+                }

                startRecording();
            }

            function onStop() {
-                setStatus('Processing');
                printTextarea('js: stopping recording ...');
                stopRecording();
-            }

-            function setMove(move, prob) {
-                if (move != null && move.length > 1) {
-                    let gameOver =  move[move.length - 1] === '#';
-                    if (gameOver) {
-                        move = move.substring(0, move.length - 1);
-                        document.getElementById('toggler').disabled = true;
-                    }
-                    board.move(move);
+                var interval = setInterval(function() {
+                    var moves = Module.get_moves();

-                    movesAll += move + ', prob = ' + prob.toFixed(2) + '% <br>';
-                    nLines++;
+                    if (moves != null && moves.length > 1) {
+                        clearInterval(interval);

-                    // if more than 10 lines, remove the first line
-                    if (nLines > 10) {
-                        var i = movesAll.indexOf('<br>');
-                        if (i > 0) {
-                            movesAll = movesAll.substring(i + 4);
-                            nLines--;
+                        for (const move of moves.split(' ')) { // block-scoped loop variable; the bare `move` created an implicit global
+                            board.move(move);
                        }
-                    }
-                    ++move_count;
-                    setStatus(gameOver ? 'Done' : move_count % 2 ? 'Black\'s turn' : 'White\'s turn');
-                    document.getElementById('state-moves').innerHTML = movesAll;
-                }
-                else {
-                    setStatus('Failed. ' + (move_count % 2 ? 'Black\'s turn' : 'White\'s turn'));
-                }
-            }

-            function setGrammar(grammar) {
-                document.getElementById('state-grammar').innerHTML = grammar;
+                        movesAll += moves + '<br>';
+                        nLines++;
+
+                        // if more than 10 lines, remove the first line
+                        if (nLines > 10) {
+                            var i = movesAll.indexOf('<br>');
+                            if (i > 0) {
+                                movesAll = movesAll.substring(i + 4);
+                                nLines--;
+                            }
+                        }
+
+                        document.getElementById('state-status').innerHTML = Module.get_status();
+                        document.getElementById('state-moves').innerHTML = movesAll;
+                    }
+                }, 100);
            }

        </script>
--- a/examples/wchess/wchess.wasm/wchess.wasm.cpp
+++ b/examples/wchess/wchess.wasm/wchess.wasm.cpp
@ -1,7 +1,7 @@
 #include <WChess.h>
-#include <emscripten.h>
 #include <emscripten/bind.h>

+#include <atomic>
 #include <thread>

 constexpr int N_THREAD = 8;
@ -11,29 +11,36 @@ std::vector<struct whisper_context *> g_contexts(4, nullptr);
 std::mutex  g_mutex;
 std::thread g_worker;

-std::condition_variable g_cv;
+std::atomic<bool> g_running(false);
+
+std::string g_status        = "";
+std::string g_status_forced = "";
+std::string g_moves         = "";

-bool g_running(false);
 std::vector<float> g_pcmf32;

-void set_move(const std::string & move, float prob) {
-    MAIN_THREAD_EM_ASM({
-        setMove(UTF8ToString($0), $1)
-    }, move.c_str(), prob);
+void set_status(const std::string & status) { // stores `status` in g_status; get_status returns it unless a forced status is set
+    std::lock_guard<std::mutex> lock(g_mutex);
+    g_status = status;
 }

-void set_grammar(const std::string & grammar) {
-    MAIN_THREAD_EM_ASM({
-        setGrammar(UTF8ToString($0))
-    }, grammar.c_str());
+void set_moves(const std::string & moves) { // stores `moves` in g_moves; the get_moves binding hands it to JS
+    std::lock_guard<std::mutex> lock(g_mutex);
+    g_moves = moves;
 }

-bool get_audio(std::vector<float> & audio) {
-    std::unique_lock<std::mutex> lock(g_mutex);
-    g_cv.wait(lock, [] { return !g_running || !g_pcmf32.empty(); });
-    if (!g_running) return false;
-    audio = std::move(g_pcmf32);
-    return true;
+void get_audio(int /* ms */, std::vector<float> & audio) { // the ms argument is ignored
+    std::lock_guard<std::mutex> lock(g_mutex);
+    audio = g_pcmf32; // copy, not move: g_pcmf32 keeps its contents
+}
+
+bool check_running() { // reports the current value of the atomic g_running flag
+    return g_running;
+}
+
+void clear_audio() { // empties the shared g_pcmf32 buffer while holding g_mutex
+    std::lock_guard<std::mutex> lock(g_mutex);
+    g_pcmf32.clear();
+}

 void wchess_main(size_t i) {
@ -51,11 +58,11 @@ void wchess_main(size_t i) {
    wparams.no_timestamps    = true;

    wparams.max_tokens       = 32;
-    wparams.audio_ctx        = 1280; // partial encoder context for better performance
+    wparams.audio_ctx        = 768; // partial encoder context for better performance

-    wparams.temperature      = 0.0f;
-    wparams.temperature_inc  = 2.0f;
-    wparams.greedy.best_of   = 1;
+    wparams.temperature     = 0.0f;
+    wparams.temperature_inc = 2.0f;
+    wparams.greedy.best_of  = 1;

    wparams.beam_search.beam_size = 1;

@ -67,12 +74,13 @@ void wchess_main(size_t i) {
    printf("command: using %d threads\n", wparams.n_threads);

    WChess::callbacks cb;
+    cb.set_status = set_status;
+    cb.check_running = check_running;
    cb.get_audio = get_audio;
-    cb.set_move = set_move;
-    cb.set_grammar = set_grammar;
+    cb.set_moves = set_moves;
+    cb.clear_audio = clear_audio;

    WChess(g_contexts[i], wparams, cb, {}).run();
-
    if (i < g_contexts.size()) {
        whisper_free(g_contexts[i]);
        g_contexts[i] = nullptr;
@ -104,11 +112,9 @@ EMSCRIPTEN_BINDINGS(command) {
    }));

    emscripten::function("free", emscripten::optional_override([](size_t /* index */) {
-        {
-            std::unique_lock<std::mutex> lock(g_mutex);
+        if (g_running) {
            g_running = false;
        }
-        g_cv.notify_one();
    }));

    emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) {
@ -134,8 +140,34 @@ EMSCRIPTEN_BINDINGS(command) {
            emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n);
            memoryView.call<void>("set", audio);
        }
-        g_cv.notify_one();

        return 0;
    }));
+
+    emscripten::function("get_moves", emscripten::optional_override([]() { // JS-facing: returns and consumes the pending moves
+        std::string moves;
+
+        {
+            std::lock_guard<std::mutex> lock(g_mutex);
+            moves.swap(g_moves); // take the pending moves and leave g_moves guaranteed empty (a moved-from string is only "unspecified")
+        }
+
+        return moves;
+    }));
+
+    emscripten::function("get_status", emscripten::optional_override([]() { // JS-facing: returns the status string to display
+        std::string status;
+
+        {
+            std::lock_guard<std::mutex> lock(g_mutex);
+            status = g_status_forced.empty() ? g_status : g_status_forced; // a non-empty forced status takes precedence over g_status
+        }
+
+        return status;
+    }));
+
+    emscripten::function("set_status", emscripten::optional_override([](const std::string & status) { // JS-facing setter
+        std::lock_guard<std::mutex> lock(g_mutex);
+        g_status_forced = status; // written to g_status_forced, not g_status; get_status prefers it while non-empty
+    }));
 }
--- a/examples/whisper.android/README.md
+++ b/examples/whisper.android/README.md
@ -12,47 +12,3 @@ To use:
 (PS: Do not move this android project folder individually to other folders, because this android project folder depends on the files of the whole project.)

 <img width="300" alt="image" src="https://user-images.githubusercontent.com/1670775/221613663-a17bf770-27ef-45ab-9a46-a5f99ba65d2a.jpg">
-
-## CLBlast
-
-> [!NOTE]
-> - OpenCL does not have the same level of support as CUDA or Metal.
-> - Turning on CLBlast may degrade OpenCL performance if your device isn't already tuned. See [tuning.md](https://github.com/CNugteren/CLBlast/blob/162783a414969464ce3aa5adf5c2554afa5ee93e/doc/tuning.md#already-tuned-for-devices) for a list of devices that are already tuned and what to do if yours is missing.
-
-Build CLBlast.
-
-```
-# In path/to/CLBlast (we assume OpenCL-Headers relative location)
-$ANDROID_SDK_PATH/cmake/3.22.1/bin/cmake .. \
-    -DCMAKE_SYSTEM_NAME=Android \
-    -DCMAKE_SYSTEM_VERSION=33 \
-    -DCMAKE_ANDROID_ARCH_ABI=arm64-v8a \
-    -DCMAKE_ANDROID_NDK=$ANDROID_NDK_PATH \
-    -DCMAKE_ANDROID_STL_TYPE=c++_static \
-    -DOPENCL_ROOT=$(readlink -f ../../OpenCL-Headers) \
-    -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=BOTH \
-    -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
-
-# Build libclblast.so
-make -j4
-```
-
-Pull `libGLES_mali.so` to `libOpenCL.so`.
-
-```bash
-# In path/to/whisper.android
-mkdir lib/src/main/jniLibs/arm64-v8a
-adb pull /system/vendor/lib64/egl/libGLES_mali.so lib/src/main/jniLibs/arm64-v8a/libOpenCL.so
-```
-
-In gradle.properties, set `GGML_HOME` to the location of GGML, as well as
-required options for turning on CLBlast.
-
-```
-GGML_HOME=/path/to/ggml
-GGML_CLBLAST=ON
-CLBLAST_HOME=/path/to/CLBlast
-OPENCL_LIB=/path/to/libOpenCL.so
-OPENCL_ROOT=/path/to/OpenCL-Headers
-```
-
--- a/examples/whisper.android/lib/build.gradle
+++ b/examples/whisper.android/lib/build.gradle
@ -16,28 +16,6 @@ android {
        ndk {
            abiFilters 'arm64-v8a', 'armeabi-v7a', 'x86', 'x86_64'
        }
-        externalNativeBuild {
-            cmake {
-                // When set, builds whisper.android against the version located
-                // at GGML_HOME instead of the copy bundled with whisper.cpp.
-                if (
-                    project.hasProperty('GGML_HOME') &&
-                    project.findProperty('GGML_CLBLAST') == 'ON'
-                ) {
-                    // Turning on CLBlast requires GGML_HOME
-                    arguments "-DGGML_HOME=${project.property('GGML_HOME')}",
-                         "-DGGML_CLBLAST=ON",
-                         "-DOPENCL_LIB=${project.property('OPENCL_LIB')}",
-                         "-DCLBLAST_HOME=${project.property('CLBLAST_HOME')}",
-                         "-DOPENCL_ROOT=${project.property('OPENCL_ROOT')}",
-                         "-DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH",
-                         "-DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=BOTH"
-                } else if (project.hasProperty('GGML_HOME')) {
-                    arguments "-DGGML_HOME=${project.property('GGML_HOME')}"
-                }
-
-            }
-        }
    }

    buildTypes {
--- a/examples/whisper.android/lib/src/main/jni/whisper/CMakeLists.txt
+++ b/examples/whisper.android/lib/src/main/jni/whisper/CMakeLists.txt
@ -3,28 +3,17 @@ cmake_minimum_required(VERSION 3.10)
 project(whisper.cpp)

 set(CMAKE_CXX_STANDARD 11)
-set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../..)
-
-# Path to external GGML, otherwise uses the copy in whisper.cpp.
-option(GGML_HOME       "whisper: Path to external GGML source" OFF)
+set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../../)

 set(
        SOURCE_FILES
-        ${WHISPER_LIB_DIR}/whisper.cpp
-        ${CMAKE_SOURCE_DIR}/jni.c
-)
-
-if (NOT GGML_HOME)
-    set(
-        SOURCE_FILES
-        ${SOURCE_FILES}
        ${WHISPER_LIB_DIR}/ggml.c
        ${WHISPER_LIB_DIR}/ggml-alloc.c
        ${WHISPER_LIB_DIR}/ggml-backend.c
        ${WHISPER_LIB_DIR}/ggml-quants.c
-
-    )
-endif()
+        ${WHISPER_LIB_DIR}/whisper.cpp
+        ${CMAKE_SOURCE_DIR}/jni.c
+)

 find_library(LOG_LIB log)

@ -35,12 +24,12 @@ function(build_library target_name)
        ${SOURCE_FILES}
    )

+    target_link_libraries(${target_name} ${LOG_LIB} android)
+
    if (${target_name} STREQUAL "whisper_v8fp16_va")
        target_compile_options(${target_name} PRIVATE -march=armv8.2-a+fp16)
-        set(GGML_COMPILE_OPTIONS                      -march=armv8.2-a+fp16)
    elseif (${target_name} STREQUAL "whisper_vfpv4")
        target_compile_options(${target_name} PRIVATE -mfpu=neon-vfpv4)
-        set(GGML_COMPILE_OPTIONS                      -mfpu=neon-vfpv4)
    endif ()

    if (NOT ${CMAKE_BUILD_TYPE} STREQUAL "Debug")
@ -54,27 +43,14 @@ function(build_library target_name)
        target_link_options(${target_name} PRIVATE -flto)

    endif ()
-
-    if (GGML_HOME)
-        include(FetchContent)
-        FetchContent_Declare(ggml SOURCE_DIR ${GGML_HOME})
-        FetchContent_MakeAvailable(ggml)
-
-        target_compile_options(ggml PRIVATE ${GGML_COMPILE_OPTIONS})
-        target_link_libraries(${target_name} ${LOG_LIB} android ggml)
-    else()
-        target_link_libraries(${target_name} ${LOG_LIB} android)
-    endif()
-
-
 endfunction()

+build_library("whisper") # Default target
+
 if (${ANDROID_ABI} STREQUAL "arm64-v8a")
    build_library("whisper_v8fp16_va")
 elseif (${ANDROID_ABI} STREQUAL "armeabi-v7a")
    build_library("whisper_vfpv4")
 endif ()

-build_library("whisper") # Default target
-
 include_directories(${WHISPER_LIB_DIR})
--- a/examples/whisper.android/lib/src/main/jni/whisper/jni.c
+++ b/examples/whisper.android/lib/src/main/jni/whisper/jni.c
@ -228,7 +228,6 @@ Java_com_whispercpp_whisper_WhisperLib_00024Companion_benchMemcpy(JNIEnv *env, j
    UNUSED(thiz);
    const char *bench_ggml_memcpy = whisper_bench_memcpy_str(n_threads);
    jstring string = (*env)->NewStringUTF(env, bench_ggml_memcpy);
-    return string;
 }

 JNIEXPORT jstring JNICALL
@ -237,5 +236,4 @@ Java_com_whispercpp_whisper_WhisperLib_00024Companion_benchGgmlMulMat(JNIEnv *en
    UNUSED(thiz);
    const char *bench_ggml_mul_mat = whisper_bench_ggml_mul_mat_str(n_threads);
    jstring string = (*env)->NewStringUTF(env, bench_ggml_mul_mat);
-    return string;
 }
--- a/examples/whisper.objc/README.md
+++ b/examples/whisper.objc/README.md
@ -11,11 +11,11 @@ https://user-images.githubusercontent.com/1991296/204126266-ce4177c6-6eca-4bd9-b

 ## Usage

-```bash
+```java
 git clone https://github.com/ggerganov/whisper.cpp
 open whisper.cpp/examples/whisper.objc/whisper.objc.xcodeproj/

-# if you don't want to convert a Core ML model, you can skip this step by create dummy model
+// If you don't want to convert a Core ML model, you can skip this step by creating a dummy model
 mkdir models/ggml-base.en-encoder.mlmodelc
 ```

--- a/examples/whisper.objc/whisper.objc/ViewController.m
+++ b/examples/whisper.objc/whisper.objc/ViewController.m
@ -206,7 +206,6 @@ void AudioInputCallback(void * inUserData,
        params.offset_ms        = 0;
        params.no_context       = true;
        params.single_segment   = self->stateInp.isRealtime;
-        params.no_timestamps    = params.single_segment;

        CFTimeInterval startTime = CACurrentMediaTime();

--- a/examples/whisper.swiftui/.gitignore
+++ b/examples/whisper.swiftui/.gitignore
@ -1,2 +0,0 @@
-xcuserdata
-xcshareddata
--- a/examples/whisper.swiftui/whisper.cpp.swift/LibWhisper.swift
+++ b/examples/whisper.swiftui/whisper.cpp.swift/LibWhisper.swift
@ -8,15 +8,15 @@ enum WhisperError: Error {
 // Meet Whisper C++ constraint: Don't access from more than one thread at a time.
 actor WhisperContext {
    private var context: OpaquePointer
-
+    
    init(context: OpaquePointer) {
        self.context = context
    }
-
+    
    deinit {
        whisper_free(context)
    }
-
+    
    func fullTranscribe(samples: [Float]) {
        // Leave 2 processors free (i.e. the high-efficiency cores).
        let maxThreads = max(1, min(8, cpuCount() - 2))
@ -24,17 +24,17 @@ actor WhisperContext {
        var params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY)
        "en".withCString { en in
            // Adapted from whisper.objc
-            params.print_realtime   = true
-            params.print_progress   = false
+            params.print_realtime = true
+            params.print_progress = false
            params.print_timestamps = true
-            params.print_special    = false
-            params.translate        = false
-            params.language         = en
-            params.n_threads        = Int32(maxThreads)
-            params.offset_ms        = 0
-            params.no_context       = true
-            params.single_segment   = false
-
+            params.print_special = false
+            params.translate = false
+            params.language = en
+            params.n_threads = Int32(maxThreads)
+            params.offset_ms = 0
+            params.no_context = true
+            params.single_segment = false
+            
            whisper_reset_timings(context)
            print("About to run whisper_full")
            samples.withUnsafeBufferPointer { samples in
@ -46,7 +46,7 @@ actor WhisperContext {
            }
        }
    }
-
+    
    func getTranscription() -> String {
        var transcription = ""
        for i in 0..<whisper_full_n_segments(context) {
@ -54,7 +54,7 @@ actor WhisperContext {
        }
        return transcription
    }
-
+    
    static func createContext(path: String) throws -> WhisperContext {
        var params = whisper_context_default_params()
 #if targetEnvironment(simulator)
--- a/extra/bench.py
+++ b/extra/bench.py
@ -61,9 +61,7 @@ models = [
    "ggml-small.bin",
    "ggml-medium.en.bin",
    "ggml-medium.bin",
-    "ggml-large-v1.bin",
-    "ggml-large-v2.bin",
-    "ggml-large-v3.bin",
+    "ggml-large.bin",
 ]


--- a/extra/sync-ggml-am.sh
+++ b/extra/sync-ggml-am.sh
@ -1,178 +0,0 @@
-#!/bin/bash
-#
-# Synchronize ggml changes to whisper.cpp
-#
-# Usage:
-#
-#   $ cd /path/to/whisper.cpp
-#   $ ./extra/sync-ggml-am.sh -skip hash0,hash1,hash2...
-#
-
-set -e
-
-sd=$(dirname $0)
-cd $sd/../
-
-SRC_WHISPER=$(pwd)
-SRC_GGML=$(cd ../ggml; pwd)
-
-if [ ! -d $SRC_GGML ]; then
-    echo "ggml not found at $SRC_GGML"
-    exit 1
-fi
-
-lc=$(cat $SRC_WHISPER/extra/sync-ggml.last)
-echo "Syncing ggml changes since commit $lc"
-
-to_skip=""
-if [ "$1" == "-skip" ]; then
-    to_skip=$2
-fi
-
-cd $SRC_GGML
-
-git log --oneline $lc..HEAD
-git log --oneline $lc..HEAD --reverse | grep -v "(whisper/[0-9]*)" | cut -d' ' -f1 > $SRC_WHISPER/ggml-commits
-
-if [ ! -s $SRC_WHISPER/ggml-commits ]; then
-    rm -v $SRC_WHISPER/ggml-commits
-    echo "No new commits"
-    exit 0
-fi
-
-if [ -f $SRC_WHISPER/ggml-src.patch ]; then
-    rm -v $SRC_WHISPER/ggml-src.patch
-fi
-
-while read c; do
-    if [ -n "$to_skip" ]; then
-        if [[ $to_skip == *"$c"* ]]; then
-            echo "Skipping $c"
-            continue
-        fi
-    fi
-
-    git format-patch -k $c~1..$c --stdout -- \
-        include/ggml/ggml*.h \
-        src/ggml*.h \
-        src/ggml*.c \
-        src/ggml*.cpp \
-        src/ggml*.m \
-        src/ggml*.metal \
-        src/ggml*.cu \
-        examples/common.h \
-        examples/common.cpp \
-        examples/common-ggml.h \
-        examples/common-ggml.cpp \
-        examples/whisper/whisper.h \
-        examples/whisper/whisper.cpp \
-        examples/whisper/main.cpp \
-        examples/whisper/quantize.cpp \
-        >> $SRC_WHISPER/ggml-src.patch
-done < $SRC_WHISPER/ggml-commits
-
-rm -v $SRC_WHISPER/ggml-commits
-
-# delete files if empty
-if [ ! -s $SRC_WHISPER/ggml-src.patch ]; then
-    rm -v $SRC_WHISPER/ggml-src.patch
-fi
-
-cd $SRC_WHISPER
-
-if [ -f $SRC_WHISPER/ggml-src.patch ]; then
-    # replace PR numbers
-    #
-    # Subject: some text (#1234)
-    # Subject: some text (ggml/1234)
-    cat ggml-src.patch | sed -e 's/^Subject: \(.*\) (#\([0-9]*\))/Subject: \1 (ggml\/\2)/' > ggml-src.patch.tmp
-    mv ggml-src.patch.tmp ggml-src.patch
-
-    cat ggml-src.patch | sed -e 's/^\(.*\) (#\([0-9]*\))$/\1 (ggml\/\2)/' > ggml-src.patch.tmp
-    mv ggml-src.patch.tmp ggml-src.patch
-
-    # replace filenames:
-    #
-    # src/ggml.c                  -> ggml.c
-    # src/ggml-alloc.c            -> ggml-alloc.c
-    # src/ggml-backend-impl.h     -> ggml-backend-impl.h
-    # src/ggml-backend.c          -> ggml-backend.c
-    # src/ggml-cuda.cu            -> ggml-cuda.cu
-    # src/ggml-cuda.h             -> ggml-cuda.h
-    # src/ggml-impl.h             -> ggml-impl.h
-    # src/ggml-kompute.cpp        -> ggml-kompute.cpp
-    # src/ggml-kompute.h          -> ggml-kompute.h
-    # src/ggml-metal.h            -> ggml-metal.h
-    # src/ggml-metal.m            -> ggml-metal.m
-    # src/ggml-mpi.h              -> ggml-mpi.h
-    # src/ggml-mpi.c              -> ggml-mpi.c
-    # src/ggml-opencl.cpp         -> ggml-opencl.cpp
-    # src/ggml-opencl.h           -> ggml-opencl.h
-    # src/ggml-quants.c           -> ggml-quants.c
-    # src/ggml-quants.h           -> ggml-quants.h
-    # src/ggml-sycl.cpp           -> ggml-sycl.cpp
-    # src/ggml-sycl.h             -> ggml-sycl.h
-    # src/ggml-vulkan.cpp         -> ggml-vulkan.cpp
-    # src/ggml-vulkan.h           -> ggml-vulkan.h
-    # include/ggml/ggml.h         -> ggml.h
-    # include/ggml/ggml-alloc.h   -> ggml-alloc.h
-    # include/ggml/ggml-backend.h -> ggml-backend.h
-    #
-    # examples/common.h           -> examples/common.h
-    # examples/common.cpp         -> examples/common.cpp
-    # examples/common-ggml.h      -> examples/common-ggml.h
-    # examples/common-ggml.cpp    -> examples/common-ggml.cpp
-    #
-    # examples/whisper/whisper.h    -> whisper.h
-    # examples/whisper/whisper.cpp  -> whisper.cpp
-    # examples/whisper/main.cpp     -> examples/main/main.cpp
-    # examples/whisper/quantize.cpp -> examples/quantize/quantize.cpp
-
-    cat ggml-src.patch | sed \
-        -e 's/src\/ggml\.c/ggml.c/g' \
-        -e 's/src\/ggml-alloc\.c/ggml-alloc.c/g' \
-        -e 's/src\/ggml-backend-impl\.h/ggml-backend-impl.h/g' \
-        -e 's/src\/ggml-backend\.c/ggml-backend.c/g' \
-        -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \
-        -e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \
-        -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
-        -e 's/src\/ggml-kompute\.cpp/ggml-kompute.cpp/g' \
-        -e 's/src\/ggml-kompute\.h/ggml-kompute.h/g' \
-        -e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
-        -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
-        -e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
-        -e 's/src\/ggml-mpi\.c/ggml-mpi.c/g' \
-        -e 's/src\/ggml-opencl\.cpp/ggml-opencl.cpp/g' \
-        -e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
-        -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
-        -e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
-        -e 's/src\/ggml-sycl\.cpp/ggml-sycl.cpp/g' \
-        -e 's/src\/ggml-sycl\.h/ggml-sycl.h/g' \
-        -e 's/src\/ggml-vulkan\.cpp/ggml-vulkan.cpp/g' \
-        -e 's/src\/ggml-vulkan\.h/ggml-vulkan.h/g' \
-        -e 's/include\/ggml\/ggml\.h/ggml.h/g' \
-        -e 's/include\/ggml\/ggml-alloc\.h/ggml-alloc.h/g' \
-        -e 's/include\/ggml\/ggml-backend\.h/ggml-backend.h/g' \
-        -e 's/examples\/common\.h/examples\/common.h/g' \
-        -e 's/examples\/common\.cpp/examples\/common.cpp/g' \
-        -e 's/examples\/common-ggml\.h/examples\/common-ggml.h/g' \
-        -e 's/examples\/common-ggml\.cpp/examples\/common-ggml.cpp/g' \
-        -e 's/examples\/whisper\/whisper\.h/whisper.h/g' \
-        -e 's/examples\/whisper\/whisper\.cpp/whisper.cpp/g' \
-        -e 's/examples\/whisper\/main\.cpp/examples\/main\/main.cpp/g' \
-        -e 's/examples\/whisper\/quantize\.cpp/examples\/quantize\/quantize.cpp/g' \
-        > ggml-src.patch.tmp
-    mv ggml-src.patch.tmp ggml-src.patch
-
-    git am ggml-src.patch
-
-    rm -v $SRC_WHISPER/ggml-src.patch
-fi
-
-# update last commit
-cd $SRC_GGML
-git log -1 --format=%H > $SRC_WHISPER/extra/sync-ggml.last
-
-echo "Done"
-
-exit 0
--- a/extra/sync-ggml.last
+++ b/extra/sync-ggml.last
@ -1 +0,0 @@
-15438356acd7ad1b182c66272eb9625828f5ae7a
--- a/extra/sync-ggml.sh
+++ b/extra/sync-ggml.sh
@ -7,8 +7,6 @@ cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml-backend-impl.h
 cp -rpv ../ggml/src/ggml-backend.c      ./ggml-backend.c
 cp -rpv ../ggml/src/ggml-cuda.cu        ./ggml-cuda.cu
 cp -rpv ../ggml/src/ggml-cuda.h         ./ggml-cuda.h
-cp -rpv ../ggml/src/ggml-kompute.cpp    ./ggml-kompute.cpp
-cp -rpv ../ggml/src/ggml-kompute.h      ./ggml-kompute.h
 cp -rpv ../ggml/src/ggml-metal.h        ./ggml-metal.h
 cp -rpv ../ggml/src/ggml-metal.m        ./ggml-metal.m
 cp -rpv ../ggml/src/ggml-metal.metal    ./ggml-metal.metal
@ -18,10 +16,6 @@ cp -rpv ../ggml/src/ggml-opencl.cpp     ./ggml-opencl.cpp
 cp -rpv ../ggml/src/ggml-opencl.h       ./ggml-opencl.h
 cp -rpv ../ggml/src/ggml-quants.c       ./ggml-quants.c
 cp -rpv ../ggml/src/ggml-quants.h       ./ggml-quants.h
-cp -rpv ../ggml/src/ggml-sycl.cpp       ./ggml-sycl.cpp
-cp -rpv ../ggml/src/ggml-sycl.h         ./ggml-sycl.h
-cp -rpv ../ggml/src/ggml-vulkan.cpp     ./ggml-vulkan.cpp
-cp -rpv ../ggml/src/ggml-vulkan.h       ./ggml-vulkan.h

 cp -rpv ../ggml/include/ggml/ggml.h         ./ggml.h
 cp -rpv ../ggml/include/ggml/ggml-alloc.h   ./ggml-alloc.h
--- a/extra/sync-llama.sh
+++ b/extra/sync-llama.sh
@ -1,5 +0,0 @@
-#!/bin/bash
-
-cp -rpv ../llama.cpp/llama.h   ./examples/talk-llama/llama.h
-cp -rpv ../llama.cpp/llama.cpp ./examples/talk-llama/llama.cpp
-cp -rpv ../llama.cpp/unicode.h ./examples/talk-llama/unicode.h
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
--- a/ggml-alloc.h
+++ b/ggml-alloc.h
@ -6,62 +6,79 @@
 extern "C" {
 #endif

-typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
-typedef struct ggml_backend * ggml_backend_t;
+struct ggml_backend;
+struct ggml_backend_buffer;
+
+//
+// Legacy API
+//
+
+typedef struct ggml_allocr * ggml_allocr_t;
+
+// initialize allocator for use with CPU backend only
+GGML_API ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment);
+GGML_API ggml_allocr_t ggml_allocr_new_measure(size_t alignment);
+
+// initialize allocator for use with ggml-backend
+GGML_API ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
+GGML_API ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
+GGML_API ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend);
+
+GGML_API struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc);
+
+// tell the allocator to parse nodes following the order described in the list
+// you should call this if your graph is optimized to execute out-of-order
+GGML_API void   ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n);
+
+GGML_API void   ggml_allocr_free       (ggml_allocr_t alloc);
+GGML_API bool   ggml_allocr_is_measure (ggml_allocr_t alloc);
+GGML_API void   ggml_allocr_reset      (ggml_allocr_t alloc);
+GGML_API void   ggml_allocr_alloc      (ggml_allocr_t alloc, struct ggml_tensor * tensor);
+GGML_API size_t ggml_allocr_max_size   (ggml_allocr_t alloc);
+
+GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph);
+
+//
+// ggml-backend v2 API
+//
+
+// Separate tensor and graph allocator objects
+// This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
+// The original API is kept as a wrapper around the new API

 // Tensor allocator
 typedef struct ggml_tallocr * ggml_tallocr_t;

-GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer);
-GGML_API void           ggml_tallocr_free(ggml_tallocr_t talloc);
-GGML_API void           ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);
+GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
+GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
+GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
+GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
+GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
+
+GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
+
+GGML_API void   ggml_tallocr_free       (ggml_tallocr_t talloc);
+GGML_API bool   ggml_tallocr_is_measure (ggml_tallocr_t talloc);
+GGML_API void   ggml_tallocr_reset      (ggml_tallocr_t talloc);
+GGML_API void   ggml_tallocr_alloc      (ggml_tallocr_t talloc, struct ggml_tensor * tensor);
+GGML_API size_t ggml_tallocr_max_size   (ggml_tallocr_t talloc);
+

 // Graph allocator
-/*
-  Example usage:
-    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());
-
-    // optional: create a worst-case graph and reserve the buffers to avoid reallocations
-    ggml_gallocr_reserve(galloc, build_graph(max_batch));
-
-    // allocate the graph
-    struct ggml_cgraph * graph = build_graph(batch);
-    ggml_gallocr_alloc_graph(galloc, graph);
-
-    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
-
-    // evaluate the graph
-    ggml_backend_graph_compute(backend, graph);
-*/
-
-// special tensor flags for use with the graph allocator:
-//   ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
-//   ggml_set_output(): output tensors are never freed and never overwritten
-
 typedef struct ggml_gallocr * ggml_gallocr_t;

-GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
-GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
-GGML_API void           ggml_gallocr_free(ggml_gallocr_t galloc);
+GGML_API ggml_gallocr_t ggml_gallocr_new(void);
+GGML_API void   ggml_gallocr_free(ggml_gallocr_t galloc);

-// pre-allocate buffers from a measure graph - does not allocate or modify the graph
-// call with a worst-case graph to avoid buffer reallocations
-// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
-// returns false if the buffer allocation failed
-GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
-GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids);
+GGML_API void   ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n);
+GGML_API size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph);

-// automatic reallocation if the topology changes when using a single buffer
-// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
-GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
-
-GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
-
-// Utils
-// Create a buffer and allocate all the tensors in a ggml_context
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
+// Allocate tensors from the allocators given by the hash table
+GGML_API void   ggml_gallocr_alloc_graph_n(
+                    ggml_gallocr_t galloc,
+                    struct ggml_cgraph * graph,
+                    struct ggml_hash_set hash_set,
+                    ggml_tallocr_t * hash_node_talloc);

 #ifdef  __cplusplus
 }
--- a/ggml-backend-impl.h
+++ b/ggml-backend-impl.h
@ -12,63 +12,31 @@ extern "C" {
    // Backend buffer
    //

-    // buffer type
-    typedef void * ggml_backend_buffer_type_context_t;
-
-    struct ggml_backend_buffer_type_i {
-        const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
-        ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
-        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft); // allocation max size
-        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
-        // check if tensor data is in host memory
-        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
-        bool                  (*GGML_CALL is_host)         (ggml_backend_buffer_type_t buft);
-    };
-
-    struct ggml_backend_buffer_type {
-        struct ggml_backend_buffer_type_i  iface;
-        ggml_backend_buffer_type_context_t context;
-    };
-
-    // buffer
    typedef void * ggml_backend_buffer_context_t;

    struct ggml_backend_buffer_i {
-        const char * (*GGML_CALL get_name)   (ggml_backend_buffer_t buffer);
-        void         (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
-        void *       (*GGML_CALL get_base)   (ggml_backend_buffer_t buffer);
-        void         (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void         (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void         (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool         (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
-        void         (*GGML_CALL clear)      (ggml_backend_buffer_t buffer, uint8_t value);
-        void         (*GGML_CALL reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
+        void   (*free_buffer)   (ggml_backend_buffer_t buffer);
+        void * (*get_base)      (ggml_backend_buffer_t buffer); // get base pointer
+        size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
+        void   (*init_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
+        void   (*free_tensor)   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
    };

    struct ggml_backend_buffer {
-        struct ggml_backend_buffer_i  iface;
-        ggml_backend_buffer_type_t    buft;
+        struct ggml_backend_buffer_i iface;
+
+        ggml_backend_t                backend;
        ggml_backend_buffer_context_t context;
+
        size_t size;
-        enum ggml_backend_buffer_usage usage;
    };

-    GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
-                   ggml_backend_buffer_type_t      buft,
+    GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
+            struct ggml_backend                  * backend,
            struct ggml_backend_buffer_i           iface,
                   ggml_backend_buffer_context_t   context,
                   size_t                          size);

-    // do not use directly, use ggml_backend_tensor_copy instead
-    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
-
-    // buffer that contains a collection of buffers
-    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
-    GGML_CALL bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
-    GGML_CALL void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
-
    //
    // Backend
    //
@ -76,31 +44,36 @@ extern "C" {
    typedef void * ggml_backend_context_t;

    struct ggml_backend_i {
-        const char * (*GGML_CALL get_name)(ggml_backend_t backend);
+        const char * (*get_name)(ggml_backend_t backend);

-        void (*GGML_CALL free)(ggml_backend_t backend);
+        void (*free)(ggml_backend_t backend);

        // buffer allocation
-        ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
+        ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);

-        // (optional) asynchronous tensor data access
-        void (*GGML_CALL set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst);
+        // get buffer alignment
+        size_t (*get_alignment)(ggml_backend_t backend);

-        // (optional) complete all pending operations
-        void (*GGML_CALL synchronize)(ggml_backend_t backend);
+        // tensor data access
+        // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
+        void (*set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+        void (*synchronize)     (ggml_backend_t backend);
+
+        // (optional) copy tensor between different backends, allow for single-copy transfers
+        void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*cpy_tensor_to)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);

        // compute graph with a plan
-        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
-        void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        void                      (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

-        // compute graph without a plan (async)
-        bool (*GGML_CALL graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        // compute graph without a plan
+        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

        // check if the backend supports an operation
-        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
    };

    struct ggml_backend {
@ -109,14 +82,6 @@ extern "C" {
        ggml_backend_context_t context;
    };

-    //
-    // Backend registry
-    //
-
-    typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
-
-    GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
-
 #ifdef  __cplusplus
 }
 #endif
--- a/ggml-backend.c
+++ b/ggml-backend.c
--- a/ggml-backend.h
+++ b/ggml-backend.h
@ -7,62 +7,44 @@
 extern "C" {
 #endif

-    typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
-    typedef struct ggml_backend * ggml_backend_t;
-    typedef void * ggml_backend_graph_plan_t;
-
    //
    // Backend buffer
    //

-    // buffer type
-    GGML_API           const char *          ggml_backend_buft_name            (ggml_backend_buffer_type_t buft);
-    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer    (ggml_backend_buffer_type_t buft, size_t size);
-    GGML_API           size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
-    GGML_API           size_t                ggml_backend_buft_get_max_size    (ggml_backend_buffer_type_t buft);
-    GGML_API GGML_CALL size_t                ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
-    GGML_API           bool                  ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
-    GGML_API           bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);
+    struct ggml_backend_buffer;
+    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;

-    // buffer
-    enum ggml_backend_buffer_usage {
-        GGML_BACKEND_BUFFER_USAGE_ANY = 0,
-        GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
-    };
-
-    GGML_API           const char *               ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
-    GGML_API           void                       ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
-    GGML_API           void *                     ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
-    GGML_API           size_t                     ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
-    GGML_API GGML_CALL void                       ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API           size_t                     ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
-    GGML_API           size_t                     ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
-    GGML_API           size_t                     ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API           void                       ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
-    GGML_API           bool                       ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
-    GGML_API           void                       ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
-    GGML_API           ggml_backend_buffer_type_t ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
-    GGML_API           void                       ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
+    // backend buffer functions
+    GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
+    GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API void * ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
+    GGML_API size_t ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
+    GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void   ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void   ggml_backend_buffer_free_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);

    //
    // Backend
    //

+    struct ggml_backend;
+    typedef struct ggml_backend * ggml_backend_t;
+    typedef void * ggml_backend_graph_plan_t;
+
+    GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);

    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
    GGML_API void         ggml_backend_free(ggml_backend_t backend);

-    GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
-    GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
-    GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);
-    GGML_API size_t                     ggml_backend_get_max_size(ggml_backend_t backend);
+    GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);

-    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+    GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);

-    GGML_API GGML_CALL void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_set_async(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+
+    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);

    GGML_API void ggml_backend_synchronize(ggml_backend_t backend);

@ -70,12 +52,11 @@ extern "C" {

    GGML_API void ggml_backend_graph_plan_free   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
    GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API bool ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API void ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
    GGML_API bool ggml_backend_supports_op       (ggml_backend_t backend, const struct ggml_tensor * op);

    // tensor copy between different backends
    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
-    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy

    //
    // CPU backend
@ -83,32 +64,12 @@ extern "C" {

    GGML_API ggml_backend_t ggml_backend_cpu_init(void);

-    GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
-    GGML_API           void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
-    GGML_API           void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
+    GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
+    GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);

    // Create a backend buffer from an existing pointer
-    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+    GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);

-    GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
-
-#ifdef GGML_USE_CPU_HBM
-    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
-#endif
-
-    //
-    // Backend registry
-    //
-
-    // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
-
-    GGML_API size_t                     ggml_backend_reg_get_count(void);
-    GGML_API size_t                     ggml_backend_reg_find_by_name(const char * name);
-    GGML_API ggml_backend_t             ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
-    GGML_API const char *               ggml_backend_reg_get_name(size_t i);
-    GGML_API ggml_backend_t             ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
-    GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
-    GGML_API ggml_backend_buffer_t      ggml_backend_reg_alloc_buffer(size_t i, size_t size);

    //
    // Backend scheduler
@ -130,7 +91,11 @@ extern "C" {

        // in build_graph:
        build_graph(...) {
-            // manually assign nodes to a backend (optional, should not be needed in most cases)
+            // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer)
+            alloc_cpu = ggml_backend_sched_get_tallocr(sched, backend_cpu);
+            ggml_tallocr_alloc(alloc_cpu, tensor);
+
+            // manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
            struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
            ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
        }
@ -148,60 +113,23 @@ extern "C" {
    struct ggml_backend_sched;
    typedef struct ggml_backend_sched * ggml_backend_sched_t;

-    // when ask == true, the scheduler wants to know if the user wants to observe this node
-    // this allows the scheduler to batch nodes together in order to evaluate them in a single call
-    //
-    // when ask == false, the scheduler is passing the node tensor to the user for observation
-    // if the user returns false, the scheduler will cancel the graph compute
-    //
-    typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
-
    // Initialize a backend scheduler
-    GGML_API ggml_backend_sched_t  ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
-    GGML_API void                  ggml_backend_sched_free(ggml_backend_sched_t sched);
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends);
+
+    GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
+
    // Initialize backend buffers from a measure graph
-    GGML_API bool                  ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
-    // Get the number of splits of the last graph
-    GGML_API int                   ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
+    GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);

-    GGML_API size_t                ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API ggml_tallocr_t        ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);

-    GGML_API void                  ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
-    GGML_API ggml_backend_t        ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
-
-    // Allocate and compute graph on the backend scheduler
-    GGML_API bool                  ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
-
-    // Reset all assignments and allocators - must be called before changing the node backends
-    GGML_API void                  ggml_backend_sched_reset(ggml_backend_sched_t sched);
-
-    // Set a callback to be called for each resulting node during graph compute
-    GGML_API void                  ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
-
-    //
-    // Utils
-    //
-
-    struct ggml_backend_graph_copy {
-        ggml_backend_buffer_t buffer;
-        struct ggml_context * ctx_allocated;
-        struct ggml_context * ctx_unallocated;
-        struct ggml_cgraph * graph;
-    };
-
-    // Copy a graph to a different backend
-    GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
-    GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
-
-    typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
-
-    // Compare the output of two backends
-    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
-
-    // Tensor initialization
-    GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
-    GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);

+    // Allocate and compute a graph on the backend scheduler
+    GGML_API void ggml_backend_sched_graph_compute(
+            ggml_backend_sched_t sched,
+            struct ggml_cgraph * graph);

 #ifdef  __cplusplus
 }
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@ -18,34 +18,38 @@ extern "C" {
 #define GGML_CUDA_MAX_DEVICES       16

 // Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
-GGML_API GGML_CALL void   ggml_init_cublas(void);
+GGML_API void   ggml_init_cublas(void);

 // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
-GGML_API GGML_CALL bool   ggml_cublas_loaded(void);
+GGML_API bool   ggml_cublas_loaded(void);

-GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
-GGML_API GGML_CALL void   ggml_cuda_host_free(void * ptr);
+GGML_API void * ggml_cuda_host_malloc(size_t size);
+GGML_API void   ggml_cuda_host_free(void * ptr);

-GGML_API GGML_CALL bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-GGML_API GGML_CALL bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+GGML_API bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API void   ggml_cuda_set_tensor_split(const float * tensor_split);
+GGML_API void   ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_free_data(struct ggml_tensor * tensor);

-GGML_API GGML_CALL int    ggml_cuda_get_device_count(void);
-GGML_API GGML_CALL void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);
+GGML_API void   ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
+
+GGML_API void   ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
+GGML_API void   ggml_cuda_copy_to_device(struct ggml_tensor * tensor);
+
+GGML_API void   ggml_cuda_set_main_device(int main_device);
+GGML_API void   ggml_cuda_set_mul_mat_q(bool mul_mat_q);
+GGML_API void   ggml_cuda_set_scratch_size(size_t scratch_size);
+GGML_API void   ggml_cuda_free_scratch(void);
+GGML_API bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+
+GGML_API int    ggml_cuda_get_device_count(void);
+GGML_API void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);

 // backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
-
-GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
-
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
-// split tensor buffer that splits matrices by rows across multiple devices
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
-// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
-
-GGML_API GGML_CALL int  ggml_backend_cuda_get_device_count(void);
-GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
-GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
+GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use

 #ifdef  __cplusplus
 }
--- a/ggml-impl.h
+++ b/ggml-impl.h
@ -5,7 +5,6 @@
 // GGML internal header

 #include <assert.h>
-#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
 #include <stddef.h>
 #include <stdbool.h>
 #include <string.h> // memcpy
@ -19,7 +18,6 @@ extern "C" {
 // fall back to the _Static_assert C11 keyword.
 // if C99 - static_assert is noop
 // ref: https://stackoverflow.com/a/53923785/4039976
-#ifndef __cplusplus
 #ifndef static_assert
 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
 #define static_assert(cond, msg) _Static_assert(cond, msg)
@ -27,7 +25,6 @@ extern "C" {
 #define static_assert(cond, msg) struct global_scope_noop_trick
 #endif
 #endif
-#endif

 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
 #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
@ -230,14 +227,12 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
 #define GGML_HASHTABLE_FULL ((size_t)-1)
 #define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)

-struct ggml_hash_set ggml_hash_set_new(size_t size);
-
 bool   ggml_hash_contains      (const struct ggml_hash_set hash_set, struct ggml_tensor * key);

 // returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
 size_t ggml_hash_find          (const struct ggml_hash_set hash_set, struct ggml_tensor * key);

-// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
+// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
 size_t ggml_hash_insert        (      struct ggml_hash_set hash_set, struct ggml_tensor * key);

 // return index, asserts if table is full
--- a/ggml-kompute.cpp
+++ b/ggml-kompute.cpp
--- a/ggml-kompute.h
+++ b/ggml-kompute.h
@ -1,46 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#include <stdbool.h>
-#include <stddef.h>
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-struct ggml_vk_device {
-    int index;
-    int type; // same as VkPhysicalDeviceType
-    size_t heapSize;
-    const char * name;
-    const char * vendor;
-    int subgroupSize;
-    uint64_t bufferAlignment;
-    uint64_t maxAlloc;
-};
-
-struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
-bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
-bool ggml_vk_has_vulkan(void);
-bool ggml_vk_has_device(void);
-struct ggml_vk_device ggml_vk_current_device(void);
-
-//
-// backend API
-//
-
-// forward declaration
-typedef struct ggml_backend * ggml_backend_t;
-
-GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
-
-GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
-
-GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
-
-#ifdef __cplusplus
-}
-#endif
--- a/ggml-metal.h
+++ b/ggml-metal.h
@ -27,6 +27,7 @@

 // max memory buffers that can be mapped to the device
 #define GGML_METAL_MAX_BUFFERS 64
+#define GGML_METAL_MAX_COMMAND_BUFFERS 32

 struct ggml_tensor;
 struct ggml_cgraph;
@ -35,31 +36,70 @@ struct ggml_cgraph;
 extern "C" {
 #endif

+//
+// internal API
+// temporarily exposed to user-code
+//
+
+struct ggml_metal_context;
+
+void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+
+// number of command buffers to use
+struct ggml_metal_context * ggml_metal_init(int n_cb);
+void ggml_metal_free(struct ggml_metal_context * ctx);
+
+void * ggml_metal_host_malloc(size_t n);
+void   ggml_metal_host_free  (void * data);
+
+// set the number of command buffers to use
+void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
+
+// creates a mapping between a host memory buffer and a device memory buffer
+// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
+// - the mapping is used during computation to determine the arguments of the compute kernels
+// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
+// - max_size specifies the maximum size of a tensor and is used to create shared views such
+//   that it is guaranteed that the tensor will fit in at least one of the views
+//
+bool ggml_metal_add_buffer(
+        struct ggml_metal_context * ctx,
+                       const char * name,
+                             void * data,
+                           size_t   size,
+                           size_t   max_size);
+
+// set data from host memory into the device
+void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
+
+// get data from the device into host memory
+void ggml_metal_get_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
+
+// try to find operations that can be run concurrently in the graph
+// you should run it again if the topology of your graph changes
+void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
+
+// if the graph has been optimized for concurrent dispatch, return the length of the concur_list
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
+
+// output the concur_list for ggml_alloc
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
+
+// same as ggml_graph_compute but uses Metal
+// creates gf->n_threads command buffers in parallel
+void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+
 //
 // backend API
 // user-code should use only these functions
 //

-GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
-
 GGML_API ggml_backend_t ggml_backend_metal_init(void);

 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);

-GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
-
 GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);

-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
-
-// helper to check if the device supports a specific family
-// ideally, the user code should be doing these checks
-// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
-GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
-
-// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
-GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
-
 #ifdef __cplusplus
 }
 #endif
--- a/ggml-metal.m
+++ b/ggml-metal.m
--- a/ggml-metal.metal
+++ b/ggml-metal.metal
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@ -1,19 +1,20 @@
-#include "ggml.h"
 #include "ggml-opencl.h"
-#include "ggml-backend-impl.h"

 #include <array>
 #include <atomic>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-#include <limits>
 #include <sstream>
 #include <vector>
+#include <limits>

-#define CL_TARGET_OPENCL_VERSION 120
+#define CL_TARGET_OPENCL_VERSION 110
 #include <clblast.h>

+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "ggml.h"
+
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
@ -714,6 +715,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
        dst[row] = tmp[0];
    }
 }
+
 );


@ -783,7 +785,6 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
        dst[row] = tmp[0];
    }
 }
-
 );


@ -799,18 +800,6 @@ __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y
 }
 );

-std::string add_template = MULTILINE_QUOTE(
-__kernel void add_f32(__global float * x, const int x_offset, __global float * y, const int y_offset, __global float * dst, const int dst_offset, const int ky) {
-    const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
-
-    if (i >= get_global_size(0)) {
-        return;
-    }
-
-    dst[dst_offset + i] = x[x_offset + i] + y[y_offset + i%ky];
-}
-);
-
 #define CL_CHECK(err)                                               \
    do {                                                            \
        cl_int err_ = (err);                                        \
@ -890,7 +879,6 @@ static std::string generate_kernels() {
        }
        src << mul_kernel << '\n';
    }
-    src << add_template << '\n';

    return src.str();
 }
@ -906,7 +894,6 @@ static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl,
 static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl;
 static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl;
 static cl_kernel mul_f32_cl;
-static cl_kernel add_f32_cl;
 static bool fp16_support;

 static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
@ -944,12 +931,6 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co
 }

 void ggml_cl_init(void) {
-    static bool initialized = false;
-    if (initialized) {
-        return;
-    }
-    initialized = true;
-
    cl_int err;

    struct cl_device;
@ -1114,10 +1095,9 @@ void ggml_cl_init(void) {
    char *ext_buffer = (char *)alloca(ext_str_size + 1);
    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
    ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
-    // Disabled due to faulty outputs
    // Check if ext_buffer contains cl_khr_fp16
-    fp16_support = false;  // strstr(ext_buffer, "cl_khr_fp16") != NULL;
-    // fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
+    fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
+    fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");

    cl_context_properties properties[] = {
        (intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
@ -1165,8 +1145,6 @@ void ggml_cl_init(void) {

    // mul kernel
    CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
-
-    CL_CHECK((add_f32_cl = clCreateKernel(program, "add_f32", &err), err));
 }

 static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
@ -1475,70 +1453,6 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src
    ggml_cl_mul_f32(src0, src1, dst);
 }

-static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-    const int nb2  = dst->nb[2];
-    const int nb3  = dst->nb[3];
-    size_t x_size;
-    size_t d_size;
-
-    cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
-    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
-    cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst
-
-
-    for (int64_t i03 = 0; i03 < ne03; i03++) {
-        for (int64_t i02 = 0; i02 < ne02; i02++) {
-            cl_event ev;
-
-            // copy src0 to device
-            CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));
-
-            const int64_t i13 = i03%ne13;
-            const int64_t i12 = i02%ne12;
-            const int i1 = i13*ne12*ne11 + i12*ne11;
-
-            cl_int x_offset = 0;
-            cl_int y_offset = i1*ne10;
-            cl_int d_offset = 0;
-
-            size_t global = ne00 * ne01;
-            cl_int ky = ne10 * ne11;
-
-            CL_CHECK(clSetKernelArg(add_f32_cl, 0, sizeof(cl_mem), &d_X));
-            CL_CHECK(clSetKernelArg(add_f32_cl, 1, sizeof(cl_int), &x_offset));
-            CL_CHECK(clSetKernelArg(add_f32_cl, 2, sizeof(cl_mem), &d_Y));
-            CL_CHECK(clSetKernelArg(add_f32_cl, 3, sizeof(cl_int), &y_offset));
-            CL_CHECK(clSetKernelArg(add_f32_cl, 4, sizeof(cl_mem), &d_D));
-            CL_CHECK(clSetKernelArg(add_f32_cl, 5, sizeof(cl_int), &d_offset));
-            CL_CHECK(clSetKernelArg(add_f32_cl, 6, sizeof(cl_int), &ky));
-            CL_CHECK(clEnqueueNDRangeKernel(queue, add_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
-
-            CL_CHECK(clReleaseEvent(ev));
-            CL_CHECK(clFinish(queue));
-
-            // copy dst to host
-            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
-            CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
-        }
-    }
-    ggml_cl_pool_free(d_X, x_size);
-    ggml_cl_pool_free(d_D, d_size);
-}
-
-void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
-    ggml_cl_add_f32(src0, src1, dst);
-}
-
 static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    const int64_t ne00 = src0->ne[0];
    const int64_t ne01 = src0->ne[1];
@ -1571,8 +1485,8 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
    } else {
        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
    }
-    cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
-    cl_mem d_D =  dst->backend == GGML_BACKEND_GPU ? (cl_mem)  dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
+    cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
+    cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);

    size_t x_offset = 0;

@ -1589,9 +1503,7 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr

                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
                    // copy src1 to device
-                    if (src1->backend == GGML_BACKEND_CPU) {
-                        CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));
-                    }
+                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL));

                    CL_CHECK(clFinish(queue));

@ -1612,10 +1524,8 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
                    }

                    // copy dst to host
-                    if (dst->backend == GGML_BACKEND_CPU) {
-                        float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-                        CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
-                    }
+                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL));
                }
            }
        }
@ -1624,12 +1534,8 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
    if (src0->backend != GGML_BACKEND_GPU) {
        ggml_cl_pool_free(d_X, x_size);
    }
-    if (src1->backend != GGML_BACKEND_GPU) {
-        ggml_cl_pool_free(d_Y, y_size);
-    }
-    if (dst->backend != GGML_BACKEND_GPU) {
-        ggml_cl_pool_free(d_D, d_size);
-    }
+    ggml_cl_pool_free(d_Y, y_size);
+    ggml_cl_pool_free(d_D, d_size);
 }

 static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
@ -1694,8 +1600,6 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
                    CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL));
                }

-                // FIXME: convert on device
-
                for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) {
                    // convert src1 to fp16
                    // TODO: use multiple threads
@ -1741,13 +1645,11 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
                    }

                    // copy dst to host, then convert to float
-                    if (dst->backend == GGML_BACKEND_CPU) {
-                        CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
-                        float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-                        ggml_fp16_to_fp32_row(tmp, d, d_ne);
-                    } else {
-                        // FIXME: convert dst to fp32 on device
-                    }
+                    CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL));
+
+                    float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+
+                    ggml_fp16_to_fp32_row(tmp, d, d_ne);
                }
            }
        }
@ -1901,7 +1803,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
 }


-bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst) {
+bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
    const int64_t ne10 = src1->ne[0];

    const int64_t ne0 = dst->ne[0];
@ -1995,302 +1897,3 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
    tensor->extra = dst;
    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 }
-
-// ggml-backend
-
-// buffer
-
-struct ggml_backend_opencl_buffer_context {
-    ~ggml_backend_opencl_buffer_context() {
-        if (buffer) {
-            clReleaseMemObject(buffer);
-        }
-        for (auto * sub_buffer : sub_buffers) {
-            clReleaseMemObject(sub_buffer);
-        }
-    }
-
-    cl_mem buffer;
-    std::vector<cl_mem> sub_buffers;
-};
-
-static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000;
-
-static const char * ggml_backend_opencl_buffer_get_name(ggml_backend_buffer_t buffer) {
-    return "OpenCL";
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
-    delete ctx;
-}
-
-static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) {
-    return cl_ptr_base;
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
-    if (tensor->view_src != NULL && tensor->view_offs == 0) {
-        tensor->extra = tensor->view_src->extra;
-    } else {
-        ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
-        cl_buffer_region region = {(size_t)((char *)tensor->data - (char *)cl_ptr_base), ggml_nbytes(tensor)};
-        cl_int err;
-        cl_mem sub_buffer = clCreateSubBuffer(ctx->buffer, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, &region, &err);
-        CL_CHECK(err);
-        ctx->sub_buffers.push_back(sub_buffer);
-        tensor->extra = sub_buffer;
-    }
-    tensor->backend = GGML_BACKEND_GPU;
-}
-
-static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    cl_mem tensor_buffer = (cl_mem) tensor->extra;
-    CL_CHECK(clEnqueueWriteBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
-    CL_CHECK(clFinish(queue));
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
-    cl_mem tensor_buffer = (cl_mem) tensor->extra;
-    CL_CHECK(clEnqueueReadBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL));
-    CL_CHECK(clFinish(queue));
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
-    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
-    CL_CHECK(clEnqueueFillBuffer(queue, ctx->buffer, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL));
-    CL_CHECK(clFinish(queue));
-}
-
-static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) {
-    ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context;
-    for (auto * sub_buffer : ctx->sub_buffers) {
-        clReleaseMemObject(sub_buffer);
-    }
-    ctx->sub_buffers.clear();
-}
-
-static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = {
-    /* .get_name        = */ ggml_backend_opencl_buffer_get_name,
-    /* .free_buffer     = */ ggml_backend_opencl_buffer_free_buffer,
-    /* .get_base        = */ ggml_backend_opencl_buffer_get_base,
-    /* .init_tensor     = */ ggml_backend_opencl_buffer_init_tensor,
-    /* .set_tensor      = */ ggml_backend_opencl_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_opencl_buffer_get_tensor,
-    /* .cpy_tensor      = */ NULL,
-    /* .clear           = */ ggml_backend_opencl_buffer_clear,
-    /* .reset           = */ ggml_backend_opencl_buffer_reset,
-};
-
-// buffer type
-
-static const char * ggml_backend_opencl_buffer_type_name(ggml_backend_buffer_type_t buffer_type) {
-    return "OpenCL";
-
-    GGML_UNUSED(buffer_type);
-}
-
-static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) {
-    ggml_cl_init();
-
-    cl_int err;
-    cl_mem mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err);
-    if (err != CL_SUCCESS) {
-        fprintf(stderr, "%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0);
-        return nullptr;
-    }
-
-    ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context{mem, {}};
-
-    return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size);
-}
-
-static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) {
-    // FIXME: not thread safe, device may not be initialized yet
-    static cl_uint alignment = -1;
-    if (alignment == (cl_uint)-1) {
-        ggml_cl_init();
-        clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
-    }
-    return alignment;
-
-    GGML_UNUSED(buffer_type);
-}
-
-static size_t ggml_backend_opencl_buffer_type_get_max_size(ggml_backend_buffer_type_t buffer_type) {
-    static size_t max_size = -1;
-    if (max_size == (size_t)-1) {
-        ggml_cl_init();
-        clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_size, NULL);
-    }
-    return max_size;
-}
-
-static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) {
-    //return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend
-    return ggml_backend_is_cpu(backend);
-
-    GGML_UNUSED(buffer_type);
-}
-
-static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = {
-    /* .get_name         = */ ggml_backend_opencl_buffer_type_name,
-    /* .alloc_buffer     = */ ggml_backend_opencl_buffer_type_alloc_buffer,
-    /* .get_alignment    = */ ggml_backend_opencl_buffer_type_get_alignment,
-    /* .get_max_size     = */ ggml_backend_opencl_buffer_type_get_max_size,
-    /* .get_alloc_size   = */ NULL,
-    /* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
-    /* .is_host          = */ NULL,
-};
-
-
-ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() {
-    static ggml_backend_buffer_type buffer_type = {
-        /* .iface   = */ ggml_backend_opencl_buffer_type_interface,
-        /* .context = */ nullptr,
-    };
-
-    return &buffer_type;
-}
-
-#if 0
-// host buffer type
-
-static const char * ggml_backend_opencl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
-    return "CL_Host";
-
-    GGML_UNUSED(buft);
-}
-
-static const char * ggml_backend_opencl_host_buffer_name(ggml_backend_buffer_t buffer) {
-    return "CL_Host";
-
-    GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_opencl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_cl_host_free(buffer->context);
-}
-
-static ggml_backend_buffer_t ggml_backend_opencl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * ptr = ggml_cl_host_malloc(size);
-
-    if (ptr == nullptr) {
-        // fallback to cpu buffer
-        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
-    }
-
-    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
-    buffer->buft = buft;
-    buffer->iface.get_name = ggml_backend_opencl_host_buffer_name;
-    buffer->iface.free_buffer = ggml_backend_opencl_host_buffer_free_buffer;
-
-    return buffer;
-}
-
-ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() {
-    static struct ggml_backend_buffer_type ggml_backend_opencl_buffer_type_host = {
-        /* .iface    = */ {
-            /* .get_name         = */ ggml_backend_opencl_host_buffer_type_name,
-            /* .alloc_buffer     = */ ggml_backend_opencl_host_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
-            /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
-            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
-            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
-        },
-        /* .context  = */ nullptr,
-    };
-
-    return &ggml_backend_opencl_buffer_type_host;
-}
-
-// backend
-
-static const char * ggml_backend_opencl_name(ggml_backend_t backend) {
-    return "OpenCL";
-
-    GGML_UNUSED(backend);
-}
-
-static void ggml_backend_opencl_free(ggml_backend_t backend) {
-    GGML_UNUSED(backend);
-}
-
-static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(ggml_backend_t backend) {
-    return ggml_backend_opencl_buffer_type();
-
-    GGML_UNUSED(backend);
-}
-
-static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) {
-    for (int i = 0; i < graph->n_nodes; ++i) {
-        ggml_tensor * node = graph->nodes[i];
-        switch (node->op) {
-            case GGML_OP_MUL_MAT:
-                ggml_cl_mul_mat(node->src[0], node->src[1], node, nullptr, 0);
-                break;
-            case GGML_OP_MUL:
-                ggml_cl_mul(node->src[0], node->src[1], node);
-                break;
-            default:
-                GGML_ASSERT(false);
-        }
-    }
-
-    return true;
-
-    GGML_UNUSED(backend);
-}
-
-static bool ggml_backend_opencl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
-    switch (op->op) {
-        case GGML_OP_MUL_MAT:
-            return ggml_cl_can_mul_mat(op->src[0], op->src[1], op);
-        case GGML_OP_MUL:
-            // return ggml_can_repeat_rows(op->src[1], op->src[0]);
-            return true;
-        default:
-            return false;
-    }
-
-    GGML_UNUSED(backend);
-}
-
-static ggml_backend_i opencl_backend_i = {
-    /* .get_name                = */ ggml_backend_opencl_name,
-    /* .free                    = */ ggml_backend_opencl_free,
-    /* .get_default_buffer_type = */ ggml_backend_opencl_get_default_buffer_type,
-    /* .set_tensor_async        = */ NULL,
-    /* .get_tensor_async        = */ NULL,
-    /* .cpy_tensor_from_async   = */ NULL,
-    /* .cpy_tensor_to_async     = */ NULL,
-    /* .synchronize             = */ NULL,
-    /* .graph_plan_create       = */ NULL,
-    /* .graph_plan_free         = */ NULL,
-    /* .graph_plan_compute      = */ NULL,
-    /* .graph_compute           = */ ggml_backend_opencl_graph_compute,
-    /* .supports_op             = */ ggml_backend_opencl_supports_op,
-};
-
-ggml_backend_t ggml_backend_opencl_init() {
-    ggml_backend_t backend = new ggml_backend {
-        /* .interface = */ opencl_backend_i,
-        /* .context   = */ nullptr
-    };
-
-    return backend;
-}
-
-bool ggml_backend_is_opencl(ggml_backend_t backend) {
-    return backend && backend->iface.get_name == ggml_backend_opencl_name;
-}
-#endif
--- a/ggml-opencl.h
+++ b/ggml-opencl.h
@ -1,35 +1,24 @@
 #pragma once

 #include "ggml.h"
-#include "ggml-backend.h"

 #ifdef  __cplusplus
 extern "C" {
 #endif

-GGML_API void ggml_cl_init(void);
+void ggml_cl_init(void);

-GGML_API void   ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-GGML_API void   ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-GGML_API bool   ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
-GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-GGML_API void   ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
+void   ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+bool   ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+void   ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

-// GGML_API void * ggml_cl_host_malloc(size_t size);
-// GGML_API void   ggml_cl_host_free(void * ptr);
+void * ggml_cl_host_malloc(size_t size);
+void   ggml_cl_host_free(void * ptr);

-GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
+void ggml_cl_free_data(const struct ggml_tensor* tensor);

-GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
-
-// backend API
-
-// GGML_API ggml_backend_t ggml_backend_opencl_init(void);
-
-// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);
-
-GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
-// GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
+void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);

 #ifdef  __cplusplus
 }
--- a/ggml-quants.c
+++ b/ggml-quants.c
--- a/ggml-quants.h
+++ b/ggml-quants.h
@ -70,7 +70,7 @@ static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block s
 // 2-bit quantization
 // weight is represented as x = a * q + b
 // 16 blocks of 16 elements each
-// Effectively 2.625 bits per weight
+// Effectively 2.5625 bits per weight
 typedef struct {
    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
    uint8_t qs[QK_K/4];      // quants
@ -165,123 +165,60 @@ typedef struct {
 } block_q8_K;
 static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");

-// (Almost) "true" 2-bit quantization.
-// Due to the need to use blocks as per ggml design, it ends up using
-// 2.0625 bpw because of the 16-bit scale for each block of 256.
-typedef struct {
-    ggml_fp16_t d;
-    uint16_t qs[QK_K/8];
-} block_iq2_xxs;
-static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
-
-// 2.3125 bpw quants
-typedef struct {
-    ggml_fp16_t d;
-    uint16_t qs[QK_K/8];
-    uint8_t  scales[QK_K/32];
-} block_iq2_xs;
-static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
-
-// (Almost) "true" 3-bit quantization.
-// Due to the need to use blocks as per ggml design, it ends up using
-// 3.0625 bpw because of the 16-bit scale for each block of 256.
-typedef struct {
-    ggml_fp16_t d;
-    uint8_t qs[3*QK_K/8];
-} block_iq3_xxs;
-static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
-
-#ifdef __cplusplus
-extern "C" {
-#endif

 // Quantization
-void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k);
-void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k);
-void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k);
-void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k);
-void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k);
-void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k);
+void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k);
+void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k);
+void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k);
+void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k);
+void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k);
+void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k);

-void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k);
-void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k);
-void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k);
-void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
-void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
-void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
-void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
+void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k);
+void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k);
+void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k);
+void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
+void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
+void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);

-void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
+void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
+void quantize_row_q5_0(const float * restrict x, void * restrict y, int k);
+void quantize_row_q5_1(const float * restrict x, void * restrict y, int k);
+void quantize_row_q8_0(const float * restrict x, void * restrict y, int k);
+void quantize_row_q8_1(const float * restrict x, void * restrict y, int k);

-void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
-void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
+void quantize_row_q2_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q3_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
+void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);

 // Dequantization
-void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
+void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k);
+void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k);
+void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k);
+void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k);
+//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k);

-void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq2_xs (const block_iq2_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k);
+void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k);
+void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k);
+void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k);
+void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k);
+void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k);

 // Dot product
-void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-
-void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
-
-//
-// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
-//
-size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q2_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q3_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q4_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q5_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q6_K   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q4_0   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q4_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q5_0   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-size_t quantize_q5_1   (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
-
-void iq2xs_init_impl(int grid_size);
-void iq2xs_free_impl(int grid_size);
-void iq3xs_init_impl(int grid_size);
-void iq3xs_free_impl(int grid_size);
-
-#ifdef __cplusplus
-}
-#endif
+void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy);

+void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
+void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
--- a/ggml-sycl.h
+++ b/ggml-sycl.h
@ -1,29 +0,0 @@
-//
-//  MIT license
-//  Copyright (C) 2024 Intel Corporation
-//  SPDX-License-Identifier: MIT
-//
-
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-#define GGML_SYCL_MAX_DEVICES       16
-#define GGML_SYCL_NAME "SYCL"
-
-GGML_API void   ggml_init_sycl(void);
-GGML_API bool   ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
-GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
-GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
-GGML_API void   ggml_backend_sycl_print_sycl_devices(void);
-GGML_API GGML_CALL void   ggml_sycl_get_gpu_list(int *id_list, int max_len);
-GGML_API GGML_CALL void   ggml_sycl_get_device_description(int device, char *description, size_t description_size);
-#ifdef  __cplusplus
-}
-#endif
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
--- a/ggml-vulkan.h
+++ b/ggml-vulkan.h
@ -1,39 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-#define GGML_VK_NAME "Vulkan"
-#define GGML_VK_MAX_DEVICES 16
-
-GGML_API void ggml_vk_init_cpu_assist(void);
-
-GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
-GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
-GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
-GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-#ifdef GGML_VULKAN_CHECK_RESULTS
-void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-#endif
-GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
-GGML_API void ggml_vk_free_cpu_assist(void);
-
-// backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
-
-GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
-GGML_API GGML_CALL int  ggml_backend_vk_get_device_count(void);
-GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
-GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
-
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
-// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
-
-#ifdef  __cplusplus
-}
-#endif
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@ -187,16 +187,6 @@
 #    define GGML_API
 #endif

-#ifdef GGML_MULTIPLATFORM
-#    if defined(_WIN32)
-#        define GGML_CALL
-#    else
-#        define GGML_CALL __attribute__((__ms_abi__))
-#    endif
-#else
-#    define GGML_CALL
-#endif
-
 // TODO: support for clang
 #ifdef __GNUC__
 #    define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
@ -225,12 +215,10 @@
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this

 #define GGML_MAX_DIMS           4
-#define GGML_MAX_PARAMS         2048
+#define GGML_MAX_PARAMS         1024
 #define GGML_MAX_CONTEXTS       64
-#define GGML_MAX_SRC            10
-#ifndef GGML_MAX_NAME
+#define GGML_MAX_SRC            6
 #define GGML_MAX_NAME           64
-#endif
 #define GGML_MAX_OP_PARAMS      64
 #define GGML_DEFAULT_N_THREADS  4
 #define GGML_DEFAULT_GRAPH_SIZE 2048
@ -256,10 +244,11 @@
 #define GGML_ASSERT(x) \
    do { \
        if (!(x)) { \
-            fflush(stdout); \
            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+            fflush(stderr); \
+            fflush(stdout); \
            ggml_print_backtrace(); \
-            abort(); \
+            exit(1); \
        } \
    } while (0)

@ -267,8 +256,6 @@
 #define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
 #elif defined(__GNUC__)
 #define GGML_UNREACHABLE() __builtin_unreachable()
-#elif defined(_MSC_VER)
-#define GGML_UNREACHABLE() __assume(0)
 #else
 #define GGML_UNREACHABLE() ((void) 0)
 #endif
@ -297,27 +284,13 @@
    const type prefix##3 = (pointer)->array[3]; \
    GGML_UNUSED(prefix##3);

-#define GGML_TENSOR_UNARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
-
-#define GGML_TENSOR_BINARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
-
 #ifdef  __cplusplus
 extern "C" {
 #endif

 #if defined(__ARM_NEON) && defined(__CUDACC__)
    typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON) && !defined(_MSC_VER)
+#elif defined(__ARM_NEON)
    typedef __fp16 ggml_fp16_t;
 #else
    typedef uint16_t ggml_fp16_t;
@ -351,21 +324,12 @@ extern "C" {
        GGML_TYPE_Q5_K = 13,
        GGML_TYPE_Q6_K = 14,
        GGML_TYPE_Q8_K = 15,
-        GGML_TYPE_IQ2_XXS = 16,
-        GGML_TYPE_IQ2_XS  = 17,
-        GGML_TYPE_IQ3_XXS = 18,
        GGML_TYPE_I8,
        GGML_TYPE_I16,
        GGML_TYPE_I32,
        GGML_TYPE_COUNT,
    };

-    // precision
-    enum ggml_prec {
-        GGML_PREC_DEFAULT,
-        GGML_PREC_F32,
-    };
-
    enum ggml_backend_type {
        GGML_BACKEND_CPU = 0,
        GGML_BACKEND_GPU = 10,
@ -388,9 +352,6 @@ extern "C" {
        GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
        GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
        GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ2_XS  = 16, // except 1d tensors
-        GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
    };

    // available tensor operations:
@ -421,7 +382,6 @@ extern "C" {
        GGML_OP_GROUP_NORM,

        GGML_OP_MUL_MAT,
-        GGML_OP_MUL_MAT_ID,
        GGML_OP_OUT_PROD,

        GGML_OP_SCALE,
@ -448,10 +408,8 @@ extern "C" {
        GGML_OP_CONV_TRANSPOSE_2D,
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,
+
        GGML_OP_UPSCALE, // nearest interpolate
-        GGML_OP_PAD,
-        GGML_OP_ARGSORT,
-        GGML_OP_LEAKY_RELU,

        GGML_OP_FLASH_ATTN,
        GGML_OP_FLASH_FF,
@ -491,10 +449,7 @@ extern "C" {
        GGML_UNARY_OP_GELU,
        GGML_UNARY_OP_GELU_QUICK,
        GGML_UNARY_OP_SILU,
-        GGML_UNARY_OP_HARDSWISH,
-        GGML_UNARY_OP_HARDSIGMOID,
-
-        GGML_UNARY_OP_COUNT,
+        GGML_UNARY_OP_LEAKY
    };

    enum ggml_object_type {
@ -505,15 +460,8 @@ extern "C" {

    enum ggml_log_level {
        GGML_LOG_LEVEL_ERROR = 2,
-        GGML_LOG_LEVEL_WARN  = 3,
-        GGML_LOG_LEVEL_INFO  = 4,
-        GGML_LOG_LEVEL_DEBUG = 5
-    };
-
-    enum ggml_tensor_flag {
-        GGML_TENSOR_FLAG_INPUT  = 1,
-        GGML_TENSOR_FLAG_OUTPUT = 2,
-        GGML_TENSOR_FLAG_PARAM  = 4,
+        GGML_LOG_LEVEL_WARN = 3,
+        GGML_LOG_LEVEL_INFO = 4
    };

    // ggml object
@ -537,6 +485,7 @@ extern "C" {

        struct ggml_backend_buffer * buffer;

+        int     n_dims;
        int64_t ne[GGML_MAX_DIMS]; // number of elements
        size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
                                   // nb[0] = ggml_type_size(type)
@ -549,7 +498,7 @@ extern "C" {
        // op params - allocated as int32_t for alignment
        int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];

-        int32_t flags;
+        bool is_param;

        struct ggml_tensor * grad;
        struct ggml_tensor * src[GGML_MAX_SRC];
@ -568,16 +517,11 @@ extern "C" {

        void * extra; // extra things e.g. for ggml-cuda.cu

-        char padding[8];
+        char padding[12];
    };

    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);

-    // Abort callback
-    // If not NULL, called before ggml computation
-    // If it returns true, the computation is aborted
-    typedef bool (*ggml_abort_callback)(void * data);
-
    // the compute plan that needs to be prepared for ggml_graph_compute()
    // since https://github.com/ggerganov/ggml/issues/287
    struct ggml_cplan {
@ -587,8 +531,8 @@ extern "C" {
        int n_threads;

        // abort ggml_graph_compute when true
-        ggml_abort_callback abort_callback;
-        void *              abort_callback_data;
+        bool (*abort_callback)(void * data);
+        void * abort_callback_data;
    };

    enum ggml_cgraph_eval_order {
@ -674,41 +618,30 @@ extern "C" {
    GGML_API void    ggml_print_object (const struct ggml_object * obj);
    GGML_API void    ggml_print_objects(const struct ggml_context * ctx);

-    GGML_API GGML_CALL int64_t ggml_nelements   (const struct ggml_tensor * tensor);
-    GGML_API GGML_CALL int64_t ggml_nrows       (const struct ggml_tensor * tensor);
-    GGML_API GGML_CALL size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
-    GGML_API           size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
+    GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
+    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);

-    GGML_API GGML_CALL int    ggml_blck_size(enum ggml_type type);
-    GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
-    GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+    GGML_API int     ggml_blck_size (enum ggml_type type);
+    GGML_API size_t  ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
+    GGML_API float   ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float

-    GGML_DEPRECATED(
-    GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
-    "use ggml_row_size() instead");
+    GGML_API const char * ggml_type_name(enum ggml_type type);
+    GGML_API const char * ggml_op_name  (enum ggml_op   op);
+    GGML_API const char * ggml_op_symbol(enum ggml_op   op);

-    GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
-    GGML_API GGML_CALL const char * ggml_op_name  (enum ggml_op   op);
-    GGML_API           const char * ggml_op_symbol(enum ggml_op   op);
+    GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor);

-    GGML_API           const char * ggml_unary_op_name(enum ggml_unary_op op);
-    GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
-
-    GGML_API GGML_CALL size_t  ggml_element_size(const struct ggml_tensor * tensor);
-
-    GGML_API GGML_CALL bool    ggml_is_quantized(enum ggml_type type);
+    GGML_API bool    ggml_is_quantized(enum ggml_type type);

    // TODO: temporary until model loading of ggml examples is refactored
    GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);

-    GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
-    GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
-    GGML_API GGML_CALL bool ggml_is_permuted  (const struct ggml_tensor * tensor);
-    GGML_API           bool ggml_is_scalar    (const struct ggml_tensor * tensor);
-    GGML_API           bool ggml_is_vector    (const struct ggml_tensor * tensor);
-    GGML_API           bool ggml_is_matrix    (const struct ggml_tensor * tensor);
-    GGML_API           bool ggml_is_3d        (const struct ggml_tensor * tensor);
-    GGML_API           int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
+    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);

    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);

@ -769,8 +702,8 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);

    // Context tensor enumeration and lookup
-    GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
-    GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
+    GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);

    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
@ -795,7 +728,7 @@ extern "C" {
    GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
    GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);

-    GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
+    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);

    GGML_API const char *         ggml_get_name   (const struct ggml_tensor * tensor);
    GGML_API struct ggml_tensor * ggml_set_name   (      struct ggml_tensor * tensor, const char * name);
@ -841,9 +774,6 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

-    // dst = a
-    // view(dst, nb1, nb2, nb3, offset) += b
-    // return dst
    GGML_API struct ggml_tensor * ggml_acc(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -1008,14 +938,15 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

-    GGML_API struct ggml_tensor * ggml_leaky_relu(
+    GGML_API struct ggml_tensor * ggml_leaky(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a, float negative_slope, bool inplace);
+            struct ggml_tensor  * a);

    GGML_API struct ggml_tensor * ggml_relu_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    // TODO: double-check this computation is correct
    GGML_API struct ggml_tensor * ggml_gelu(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
@ -1047,16 +978,6 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

-    // hardswish(x) = x * relu6(x + 3) / 6
-    GGML_API struct ggml_tensor * ggml_hardswish(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // hardsigmoid(x) = relu6(x + 3) / 6
-    GGML_API struct ggml_tensor * ggml_hardsigmoid(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
    // normalize along rows
    GGML_API struct ggml_tensor * ggml_norm(
            struct ggml_context * ctx,
@ -1107,22 +1028,6 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

-    // change the precision of a matrix multiplication
-    // set to GGML_PREC_F32 for higher precision (useful for phi-2)
-    GGML_API void ggml_mul_mat_set_prec(
-            struct ggml_tensor * a,
-            enum ggml_prec       prec);
-
-    // indirect matrix multiplication
-    //  ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
-    GGML_API struct ggml_tensor * ggml_mul_mat_id(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * const as[],
-            int                   n_as,
-            struct ggml_tensor  * ids,
-            int                   id,
-            struct ggml_tensor  * b);
-
    // A: m columns, n rows,
    // B: p columns, n rows,
    // result is m columns, p rows
@ -1138,13 +1043,13 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_scale(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            float                 s);
+            struct ggml_tensor  * b);

    // in-place, returns view(a)
    GGML_API struct ggml_tensor * ggml_scale_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            float                 s);
+            struct ggml_tensor  * b);

    // b -> view(a,offset,nb1,nb2,3), return modified a
    GGML_API struct ggml_tensor * ggml_set(
@ -1200,16 +1105,22 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

-    GGML_API struct ggml_tensor * ggml_cast(
+    // a -> b, in-place, return view(b)
+    GGML_API struct ggml_tensor * ggml_cpy_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            enum   ggml_type      type);
+            struct ggml_tensor  * b);

    // make contiguous
    GGML_API struct ggml_tensor * ggml_cont(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    // make contiguous, in-place
+    GGML_API struct ggml_tensor * ggml_cont_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
    // make contiguous, with new shape
    GGML_API struct ggml_tensor * ggml_cont_1d(
            struct ggml_context * ctx,
@ -1324,7 +1235,6 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

-    // supports 3D: a->ne[2] == b->ne[1]
    GGML_API struct ggml_tensor * ggml_get_rows(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -1373,14 +1283,6 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

-    // fused soft_max(a*scale + mask)
-    // mask is optional
-    GGML_API struct ggml_tensor * ggml_soft_max_ext(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * mask,
-            float                 scale);
-
    GGML_API struct ggml_tensor * ggml_soft_max_back(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -1448,7 +1350,7 @@ extern "C" {
            float                 beta_slow);

    // compute correction dims for YaRN RoPE scaling
-    GGML_CALL void ggml_rope_yarn_corr_dims(
+    void ggml_rope_yarn_corr_dims(
        int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);

    // xPos RoPE, in-place, returns view(a)
@ -1506,19 +1408,7 @@ extern "C" {
            int                  p1,
            int                  d0,
            int                  d1,
-            bool                 is_2D,
-            enum ggml_type       dst_type);
-
-    GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                  s0,
-            int                  s1,
-            int                  p0,
-            int                  p1,
-            int                  d0,
-            int                  d1);
+            bool                 is_2D);

    GGML_API struct ggml_tensor * ggml_conv_1d(
            struct ggml_context * ctx,
@ -1623,32 +1513,6 @@ extern "C" {
            struct ggml_tensor  * a,
            int                   scale_factor);

-    // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
-    GGML_API struct ggml_tensor * ggml_pad(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                  p0,
-            int                  p1,
-            int                  p2,
-            int                  p3);
-
-    // sort rows
-    enum ggml_sort_order {
-        GGML_SORT_ASC,
-        GGML_SORT_DESC,
-    };
-
-    GGML_API struct ggml_tensor * ggml_argsort(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            enum ggml_sort_order  order);
-
-    // top k elements per row
-    GGML_API struct ggml_tensor * ggml_top_k(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   k);
-
    GGML_API struct ggml_tensor * ggml_flash_attn(
            struct ggml_context * ctx,
            struct ggml_tensor  * q,
@ -1710,6 +1574,7 @@ extern "C" {
            int                   kh);

    // used in sam
+
    GGML_API struct ggml_tensor * ggml_add_rel_pos(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -1884,7 +1749,7 @@ extern "C" {
    GGML_API struct ggml_cgraph * ggml_new_graph         (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
    GGML_API struct ggml_cgraph * ggml_new_graph_custom  (struct ggml_context * ctx, size_t size, bool grads);
    GGML_API struct ggml_cgraph * ggml_graph_dup         (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-    GGML_API struct ggml_cgraph   ggml_graph_view        (struct ggml_cgraph * cgraph, int i0, int i1);
+    GGML_API struct ggml_cgraph * ggml_graph_view        (struct ggml_context * ctx, struct ggml_cgraph * cgraph, int i0, int i1);
    GGML_API void                 ggml_graph_cpy         (struct ggml_cgraph * src, struct ggml_cgraph * dst);
    GGML_API void                 ggml_graph_reset       (struct ggml_cgraph * cgraph);  // zero grads
    GGML_API void                 ggml_graph_clear       (struct ggml_cgraph * cgraph);
@ -1894,8 +1759,8 @@ extern "C" {

    // ggml_graph_plan() has to be called before ggml_graph_compute()
    // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan   (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API int               ggml_graph_compute(      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+    GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API int               ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);

    // same as ggml_graph_compute() but the work data is allocated as a part of the context
    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
@ -2098,28 +1963,10 @@ extern "C" {
            ggml_opt_callback callback,
            void * callback_data);

-    //
-    // tensor flags
-    //
-    GGML_API void ggml_set_input(struct ggml_tensor * tensor);
-    GGML_API void ggml_set_output(struct ggml_tensor * tensor);
-
    //
    // quantization
    //

-    // - ggml_quantize_init can be called multiple times with the same type
-    //   it will only initialize the quantization tables for the first call or after ggml_quantize_free
-    //   automatically called by ggml_quantize_chunk for convenience
-    //
-    // - ggml_quantize_free will free any memory allocated by ggml_quantize_init
-    //   call this at the end of the program to avoid memory leaks
-    //
-    // note: these are thread-safe
-    //
-    GGML_API void ggml_quantize_init(enum ggml_type type);
-    GGML_API void ggml_quantize_free(void);
-
    // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
    GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
@ -2133,12 +1980,7 @@ extern "C" {
    GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
    GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);

-    // some quantization type cannot be used without an importance matrix
-    GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
-
-    // calls ggml_quantize_init internally (i.e. can allocate memory)
-    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
-            int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
+    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);

    //
    // gguf
@ -2203,16 +2045,14 @@ extern "C" {
    GGML_API double       gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
    GGML_API bool         gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
    GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
-    GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
    GGML_API int          gguf_get_arr_n   (const struct gguf_context * ctx, int key_id);
    GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
    GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);

-    GGML_API int            gguf_get_n_tensors    (const struct gguf_context * ctx);
-    GGML_API int            gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
-    GGML_API size_t         gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
-    GGML_API char *         gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
-    GGML_API enum ggml_type gguf_get_tensor_type  (const struct gguf_context * ctx, int i);
+    GGML_API int    gguf_get_n_tensors    (const struct gguf_context * ctx);
+    GGML_API int    gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
+    GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
+    GGML_API char * gguf_get_tensor_name  (const struct gguf_context * ctx, int i);

    // overrides existing values or adds a new one
    GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
@ -2268,7 +2108,6 @@ extern "C" {
    //

    GGML_API int ggml_cpu_has_avx        (void);
-    GGML_API int ggml_cpu_has_avx_vnni   (void);
    GGML_API int ggml_cpu_has_avx2       (void);
    GGML_API int ggml_cpu_has_avx512     (void);
    GGML_API int ggml_cpu_has_avx512_vbmi(void);
@ -2283,14 +2122,10 @@ extern "C" {
    GGML_API int ggml_cpu_has_blas       (void);
    GGML_API int ggml_cpu_has_cublas     (void);
    GGML_API int ggml_cpu_has_clblast    (void);
-    GGML_API int ggml_cpu_has_vulkan     (void);
-    GGML_API int ggml_cpu_has_kompute    (void);
    GGML_API int ggml_cpu_has_gpublas    (void);
    GGML_API int ggml_cpu_has_sse3       (void);
    GGML_API int ggml_cpu_has_ssse3      (void);
-    GGML_API int ggml_cpu_has_sycl       (void);
    GGML_API int ggml_cpu_has_vsx        (void);
-    GGML_API int ggml_cpu_has_matmul_int8(void);

    //
    // Internal types and functions exposed for tests and benchmarks
@ -2304,8 +2139,7 @@ extern "C" {
 #endif
    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
-    typedef void (*ggml_vec_dot_t)   (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
-                                      const void * GGML_RESTRICT y, size_t by, int nrc);
+    typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);

    typedef struct {
        const char      * type_name;
@ -2317,7 +2151,6 @@ extern "C" {
        ggml_from_float_t from_float_reference;
        ggml_vec_dot_t    vec_dot;
        enum ggml_type    vec_dot_type;
-        int64_t           nrows; // number of rows to process simultaneously;
    } ggml_type_traits_t;

    GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
--- a/models/README.md
+++ b/models/README.md
@ -1,16 +1,19 @@
-## Whisper model files in custom `ggml` format
+## Whisper model files in custom ggml format

-The [original Whisper PyTorch models provided by OpenAI](https://github.com/openai/whisper/blob/main/whisper/__init__.py#L17-L30)
+The [original Whisper PyTorch models provided by OpenAI](https://github.com/openai/whisper/blob/main/whisper/__init__.py#L17-L27)
 are converted to custom `ggml` format in order to be able to load them in C/C++.
 Conversion is performed using the [convert-pt-to-ggml.py](convert-pt-to-ggml.py) script.

-There are three ways to obtain `ggml` models:
+You can either obtain the original models and generate the `ggml` files yourself using the conversion script,
+or you can use the [download-ggml-model.sh](download-ggml-model.sh) script to download the already converted models.
+Currently, they are hosted on the following locations:

-### 1. Use [download-ggml-model.sh](download-ggml-model.sh) to download pre-converted models
+- https://huggingface.co/ggerganov/whisper.cpp
+- https://ggml.ggerganov.com

-Example download:
+Sample download:

-```text
+```java
 $ ./download-ggml-model.sh base.en
 Downloading ggml model base.en ...
 models/ggml-base.en.bin          100%[=============================================>] 141.11M  5.41MB/s    in 22s
@ -20,46 +23,35 @@ You can now use it like this:
  $ ./main -m models/ggml-base.en.bin -f samples/jfk.wav
 ```

-### 2. Manually download pre-converted models
-
-`ggml` models are available from the following locations:
-
- https://huggingface.co/ggerganov/whisper.cpp/tree/main
- https://ggml.ggerganov.com
-
-### 3. Convert with [convert-pt-to-ggml.py](convert-pt-to-ggml.py)
-
-Download one of the [models provided by OpenAI](https://github.com/openai/whisper/blob/main/whisper/__init__.py#L17-L30) and generate the `ggml` files using the [convert-pt-to-ggml.py](convert-pt-to-ggml.py) script.
-
-Example conversion, assuming the original PyTorch files have been downloaded into `~/.cache/whisper`. Change `~/path/to/repo/whisper/` to the location for your copy of the Whisper source:
-
-```bash
+To convert the files yourself, use the convert-pt-to-ggml.py script. Here is an example usage.
+The original PyTorch files are assumed to have been downloaded into ~/.cache/whisper
+Change `~/path/to/repo/whisper/` to the location for your copy of the Whisper source:
+```
 mkdir models/whisper-medium
 python models/convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
 mv ./models/whisper-medium/ggml-model.bin models/ggml-medium.bin
 rmdir models/whisper-medium
 ```

+A third option to obtain the model files is to download them from Hugging Face:
+
+https://huggingface.co/ggerganov/whisper.cpp/tree/main
+
 ## Available models

-| Model         | Disk    | SHA                                        |
-| ------------- | ------- | ------------------------------------------ |
-| tiny          | 75 MiB  | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
-| tiny.en       | 75 MiB  | `c78c86eb1a8faa21b369bcd33207cc90d64ae9df` |
-| base          | 142 MiB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
-| base.en       | 142 MiB | `137c40403d78fd54d454da0f9bd998f78703390c` |
-| small         | 466 MiB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
-| small.en      | 466 MiB | `db8a495a91d927739e50b3fc1cc4c6b8f6c2d022` |
-| small.en-tdrz | 465 MiB | `b6c6e7e89af1a35c08e6de56b66ca6a02a2fdfa1` |
-| medium        | 1.5 GiB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
-| medium.en     | 1.5 GiB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` |
-| large-v1      | 2.9 GiB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
-| large-v2      | 2.9 GiB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
-| large-v2-q5_0 | 1.1 GiB | `00e39f2196344e901b3a2bd5814807a769bd1630` |
-| large-v3      | 2.9 GiB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
-| large-v3-q5_0 | 1.1 GiB | `e6e2ed78495d403bef4b7cff42ef4aaadcfea8de` |
-
-Models are multilingual unless the model name includes `.en`. Models ending in `-q5_0` are [quantized](../README.md#quantization). Models ending in `-tdrz` support local diarization (marking of speaker turns) using [tinydiarize](https://github.com/akashmjn/tinydiarize). More information about models is available [upstream (openai/whisper)](https://github.com/openai/whisper#available-models-and-languages). The list above is a subset of the models supported by the [download-ggml-model.sh](download-ggml-model.sh) script, but many more are available at https://huggingface.co/ggerganov/whisper.cpp/tree/main and elsewhere.
+| Model     | Disk    | SHA                                        |
+| ---       | ---     | ---                                        |
+| tiny      |  75 MiB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
+| tiny.en   |  75 MiB | `c78c86eb1a8faa21b369bcd33207cc90d64ae9df` |
+| base      | 142 MiB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
+| base.en   | 142 MiB | `137c40403d78fd54d454da0f9bd998f78703390c` |
+| small     | 466 MiB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
+| small.en  | 466 MiB | `db8a495a91d927739e50b3fc1cc4c6b8f6c2d022` |
+| medium    | 1.5 GiB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
+| medium.en | 1.5 GiB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` |
+| large-v1  | 2.9 GiB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
+| large-v2  | 2.9 GiB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
+| large-v3  | 2.9 GiB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |

 ## Model files for testing purposes

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Georgi Gerganov	15c4fdce45	chess : tuning performance	2023-11-30 10:50:47 +02:00
Fraxy V	70741ba794	wchess: c++17 -> c++11	2023-11-30 08:37:54 +02:00
Fraxy V	bb723282cc	wchess: off/on prompt	2023-11-30 01:17:29 +02:00
Fraxy V	dc5513a709	wchess: prompt	2023-11-29 19:30:57 +02:00
Fraxy V	ffc244845b	wchess : dynamic grammar	2023-11-29 18:53:28 +02:00
Fraxy V	8962a6bd67	wchess: preparing dyn grammar	2023-11-29 15:29:16 +02:00
Fraxy V	d313034b9c	wchess grammar tweaks	2023-11-29 09:25:45 +02:00
Fraxy V	8b0b0acff3	wchess : remove vad	2023-11-28 19:03:17 +02:00
Fraxy V	02ade14f67	wchess minor	2023-11-28 16:21:46 +02:00
fraxy-v	8dba8204eb	Merge pull request #1 from ggerganov/gg/wchess wchess : add clear_audio callback	2023-11-28 15:45:17 +02:00
Georgi Gerganov	4260d4fc70	wchess : minor	2023-11-28 15:10:18 +02:00
Georgi Gerganov	ee65df7982	wchess : add clear_audio callback	2023-11-28 13:37:26 +02:00
Fraxy V	03f254193b	wchess: hardcoded rules	2023-11-27 10:51:20 +02:00
Fraxy V	8f2d8eae10	wchess: basic chess rules	2023-11-27 10:41:04 +02:00
Fraxy V	a44b21bce0	wchess: tidy up entry files	2023-11-25 11:34:06 +02:00
Fraxy V	f07ff2aa6a	chess -> wchess	2023-11-25 10:16:48 +02:00
Fraxy V	280e631bcf	chess.wasm: poc of chess rules	2023-11-23 16:09:00 +02:00
Fraxy V	2f86da0d09	chess.wasm: add chessboard	2023-11-23 08:49:47 +02:00
Fraxy V	a787f7f85c	chess.wasm: encoder context value resulting in echoing	2023-11-21 20:42:20 +02:00
Fraxy V	c83a38e89d	chess.wasm: go back to greedy	2023-11-21 16:56:22 +02:00
Fraxy V	758c951729	chess.wasm: grammar in emscripten	2023-11-21 16:30:44 +02:00