Compare commits

1 commit

ceb77363cd  ggml : disable CUDA graphs for non-llama.cpp projects  2024-06-26 20:14:22 +03:00

422 changed files with 224,784 additions and 103,134 deletions

--------------------------------------------------------------------------------

@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all
 RUN apt-get update && \
-    apt-get install -y build-essential git cmake libsdl2-dev
+    apt-get install -y build-essential git cmake
 WORKDIR /app

--------------------------------------------------------------------------------

@@ -17,7 +17,7 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 ENV GGML_CUDA=1
 RUN apt-get update && \
-    apt-get install -y build-essential libsdl2-dev \
+    apt-get install -y build-essential \
     && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
 # Ref: https://stackoverflow.com/a/53464012

--------------------------------------------------------------------------------

@@ -12,7 +12,7 @@ FROM ubuntu:22.04 AS runtime
 WORKDIR /app
 RUN apt-get update && \
-    apt-get install -y curl ffmpeg libsdl2-dev \
+    apt-get install -y curl ffmpeg \
     && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
 COPY --from=build /app /app

--------------------------------------------------------------------------------

@@ -13,10 +13,10 @@ jobs:
   ubuntu-latest:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/setup-go@v5
+      - uses: actions/setup-go@v3
         with:
-          go-version: '^1.23'
+          go-version: '^1.19'
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v1
       - run: |
          cd bindings/go
          make test

--------------------------------------------------------------------------------

@@ -3,73 +3,20 @@ on:
   push:
     paths:
       - bindings/ruby/**
-      - src/whisper.cpp
-      - include/whisper.h
-      - ggml/src/ggml.c
-      - ggml/src/ggml-impl.h
-      - ggml/src/ggml-aarch64.h
-      - ggml/src/ggml-aarch64.c
-      - ggml/src/ggml-alloc.c
-      - ggml/src/ggml-backend-impl.h
-      - ggml/src/ggml-backend.cpp
-      - ggml/src/ggml-common.h
-      - ggml/src/ggml-quants.h
-      - ggml/src/ggml-quants.c
-      - ggml/src/ggml-cpu-impl.h
-      - ggml/src/ggml-metal.m
-      - ggml/src/ggml-metal.metal
-      - ggml/src/ggml-blas.cpp
-      - ggml/include/ggml.h
-      - ggml/include/ggml-alloc.h
-      - ggml/include/ggml-backend.h
-      - ggml/include/ggml-cuda.h
-      - ggml/include/ggml-kompute.h
-      - ggml/include/ggml-metal.h
-      - ggml/include/ggml-sycl.h
-      - ggml/include/ggml-vulkan.h
-      - ggml/include/ggml-blas.h
-      - scripts/get-flags.mk
-      - examples/dr_wav.h
+      - whisper.h
   pull_request:
     paths:
       - bindings/ruby/**
-      - src/whisper.cpp
-      - include/whisper.h
-      - ggml/src/ggml.c
-      - ggml/src/ggml-impl.h
-      - ggml/src/ggml-aarch64.h
-      - ggml/src/ggml-aarch64.c
-      - ggml/src/ggml-alloc.c
-      - ggml/src/ggml-backend-impl.h
-      - ggml/src/ggml-backend.cpp
-      - ggml/src/ggml-common.h
-      - ggml/src/ggml-quants.h
-      - ggml/src/ggml-quants.c
-      - ggml/src/ggml-cpu-impl.h
-      - ggml/src/ggml-metal.m
-      - ggml/src/ggml-metal.metal
-      - ggml/src/ggml-blas.cpp
-      - ggml/include/ggml.h
-      - ggml/include/ggml-alloc.h
-      - ggml/include/ggml-backend.h
-      - ggml/include/ggml-cuda.h
-      - ggml/include/ggml-kompute.h
-      - ggml/include/ggml-metal.h
-      - ggml/include/ggml-sycl.h
-      - ggml/include/ggml-vulkan.h
-      - ggml/include/ggml-blas.h
-      - scripts/get-flags.mk
-      - examples/dr_wav.h
+      - whisper.h

 jobs:
   ubuntu-latest:
     runs-on: ubuntu-latest
-    defaults:
-      run:
-        working-directory: bindings/ruby
     steps:
       - uses: ruby/setup-ruby@v1
         with:
          ruby-version: '3.0'
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v1
-      - run: rake test
+      - run: |
+          cd bindings/ruby/ext
+          ruby extconf.rb && make

--------------------------------------------------------------------------------

@@ -3,7 +3,6 @@ on: [push, pull_request]
 env:
   ubuntu_image: "ubuntu:22.04"
-  VCPKG_BINARY_SOURCES: "clear;x-gha,readwrite"

 jobs:
   ubuntu-latest:

@@ -60,7 +59,7 @@ jobs:
       uses: cross-platform-actions/action@v0.24.0
       with:
         operating_system: freebsd
-        version: '13.3'
+        version: '13.2'
       run: |
         sudo pkg update
         sudo pkg install -y gmake sdl2

@@ -309,7 +308,7 @@ jobs:
     - name: Build using CMake w/ OpenBLAS
       shell: msys2 {0}
       run: |
-        cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+        cmake -B build -DGGML_OPENBLAS=ON
         cmake --build build --config ${{ matrix.build }} -j $(nproc)

 windows:

@@ -383,8 +382,10 @@ jobs:
         sdl2: [ON]
         include:
           - arch: Win32
+            obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x86.zip
             s2arc: x86
           - arch: x64
+            obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x64.zip
             s2arc: x64
           - sdl2: ON
             s2ver: 2.28.5

@@ -393,21 +394,17 @@ jobs:
     - name: Clone
       uses: actions/checkout@v4

-    - name: Export GitHub Actions cache environment variables
-      uses: actions/github-script@v7
-      with:
-        script: |
-          core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || '');
-          core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
-
     - name: Add msbuild to PATH
       uses: microsoft/setup-msbuild@v2

-    - name: Install OpenBLAS and pkgconfiglite
+    - name: Fetch OpenBLAS
       if: matrix.blas == 'ON'
       run: |
-        vcpkg install --triplet=${{ matrix.s2arc }}-windows openblas
-        choco install pkgconfiglite
+        C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }}
+        7z x blas.zip -oblas -y
+        copy blas/include/cblas.h .
+        copy blas/include/openblas_config.h .
+        echo "OPENBLAS_PATH=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV

     - name: Fetch SDL2 and set SDL2_DIR
       if: matrix.sdl2 == 'ON'

@@ -419,10 +416,9 @@ jobs:
     - name: Configure
       run: >
         cmake -S . -B ./build -A ${{ matrix.arch }}
-        -DCMAKE_TOOLCHAIN_FILE="$env:VCPKG_INSTALLATION_ROOT/scripts/buildsystems/vcpkg.cmake"
         -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-        -DGGML_BLAS=${{ matrix.blas }}
-        -DGGML_BLAS_VENDOR=OpenBLAS
+        -DGGML_OPENBLAS=${{ matrix.blas }}
+        -DCMAKE_LIBRARY_PATH="$env:OPENBLAS_PATH/lib"
         -DWHISPER_SDL2=${{ matrix.sdl2 }}

@@ -430,9 +426,9 @@ jobs:
         cd ./build
         msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}

-    - name: Copy openblas.dll
+    - name: Copy libopenblas.dll
       if: matrix.blas == 'ON'
-      run: copy "C:/vcpkg/packages/openblas_${{ matrix.s2arc }}-windows/bin/openblas.dll" build/bin/${{ matrix.build }}
+      run: copy "$env:OPENBLAS_PATH/bin/libopenblas.dll" build/bin/${{ matrix.build }}

     - name: Copy SDL2.dll
       if: matrix.sdl2 == 'ON'

@@ -549,9 +545,8 @@ jobs:
       cp models/for-tests-ggml-base.en.bin models/ggml-base.en.bin
       mkdir models/ggml-base.en-encoder.mlmodelc

-    # TODO: disabled because it fails for some reason with Github Actions
-    # - name: Build objc example
-    #   run: xcodebuild -project examples/whisper.objc/whisper.objc.xcodeproj -scheme whisper.objc -configuration ${{ matrix.build }} -sdk iphonesimulator build
+    - name: Build objc example
+      run: xcodebuild -project examples/whisper.objc/whisper.objc.xcodeproj -scheme whisper.objc -configuration ${{ matrix.build }} -sdk iphonesimulator build

     - name: Build swiftui example
       run: xcodebuild -project examples/whisper.swiftui/whisper.swiftui.xcodeproj -scheme WhisperCppDemo -configuration ${{ matrix.build }} -sdk iphonesimulator build
@@ -565,6 +560,12 @@ jobs:
       with:
         path: whisper

+    - name: Clone
+      uses: actions/checkout@v4
+      with:
+        repository: ggerganov/ggml
+        path: ggml
+
     - name: Install Java
       uses: actions/setup-java@v4
       with:

@@ -583,77 +584,75 @@ jobs:
       run: |
         export PATH_TO_GGML=$PWD/ggml
         cd whisper/examples/whisper.android
-        ./gradlew assembleRelease --no-daemon
+        ./gradlew assembleRelease --no-daemon -PGGML_HOME=$PATH_TO_GGML

-  # TODO: disable because of following fail: https://github.com/ggerganov/whisper.cpp/actions/runs/11019444420/job/30627193602
-  # android_java:
-  #   runs-on: ubuntu-latest
-  #
-  #   steps:
-  #     - name: Clone
-  #       uses: actions/checkout@v4
-  #
-  #     - name: set up JDK 11
-  #       uses: actions/setup-java@v4
-  #       with:
-  #         java-version: '11'
-  #         distribution: 'temurin'
-  #         cache: gradle
-  #
-  #     - name: Setup Android SDK
-  #       uses: android-actions/setup-android@v3
-  #       with:
-  #         cmdline-tools-version: 9.0
-  #
-  #     - name: Build
-  #       run: |
-  #         cd examples/whisper.android.java
-  #         chmod +x ./gradlew
-  #         ./gradlew assembleRelease
+  android_java:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Clone
+        uses: actions/checkout@v4
+
+      - name: set up JDK 11
+        uses: actions/setup-java@v4
+        with:
+          java-version: '11'
+          distribution: 'temurin'
+          cache: gradle
+
+      - name: Setup Android SDK
+        uses: android-actions/setup-android@v3
+        with:
+          cmdline-tools-version: 9.0
+
+      - name: Build
+        run: |
+          cd examples/whisper.android.java
+          chmod +x ./gradlew
+          ./gradlew assembleRelease

-  # TODO: disabled because of following fail: https://github.com/ggerganov/whisper.cpp/actions/runs/9686220096/job/26735899598
-  # java:
-  #   needs: [ 'windows' ]
-  #   runs-on: windows-latest
-  #   steps:
-  #     - uses: actions/checkout@v4
-  #
-  #     - name: Install Java
-  #       uses: actions/setup-java@v4
-  #       with:
-  #         distribution: zulu
-  #         java-version: 20
-  #
-  #     - name: Download Windows lib
-  #       uses: actions/download-artifact@v4
-  #       with:
-  #         name: win32-x86-64_whisper.dll
-  #         path: bindings/java/build/generated/resources/main/win32-x86-64
-  #
-  #     - name: Build
-  #       run: |
-  #         models\download-ggml-model.cmd tiny.en
-  #         cd bindings/java
-  #         chmod +x ./gradlew
-  #         ./gradlew build
-  #
-  #     - name: Upload jar
-  #       uses: actions/upload-artifact@v4
-  #       with:
-  #         name: whispercpp.jar
-  #         path: bindings/java/build/libs/whispercpp-*.jar
-  #
-  #     - name: Publish package
-  #       if: ${{ github.ref == 'refs/heads/master' }}
-  #       uses: gradle/gradle-build-action@v2.4.2
-  #       with:
-  #         arguments: publish
-  #         build-root-directory: bindings/java
-  #       env:
-  #         MAVEN_USERNAME: ${{ secrets.JIRA_USER }}
-  #         MAVEN_PASSWORD: ${{ secrets.JIRA_PASS }}
-  #         PGP_SECRET: ${{ secrets.GPG_PRIVATE_KEY }}
-  #         PGP_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
+  java:
+    needs: [ 'windows' ]
+    runs-on: windows-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install Java
+        uses: actions/setup-java@v4
+        with:
+          distribution: zulu
+          java-version: 20
+
+      - name: Download Windows lib
+        uses: actions/download-artifact@v4
+        with:
+          name: win32-x86-64_whisper.dll
+          path: bindings/java/build/generated/resources/main/win32-x86-64
+
+      - name: Build
+        run: |
+          models\download-ggml-model.cmd tiny.en
+          cd bindings/java
+          chmod +x ./gradlew
+          ./gradlew build
+
+      - name: Upload jar
+        uses: actions/upload-artifact@v4
+        with:
+          name: whispercpp.jar
+          path: bindings/java/build/libs/whispercpp-*.jar
+
+      - name: Publish package
+        if: ${{ github.ref == 'refs/heads/master' }}
+        uses: gradle/gradle-build-action@v2.4.2
+        with:
+          arguments: publish
+          build-root-directory: bindings/java
+        env:
+          MAVEN_USERNAME: ${{ secrets.JIRA_USER }}
+          MAVEN_PASSWORD: ${{ secrets.JIRA_PASS }}
+          PGP_SECRET: ${{ secrets.GPG_PRIVATE_KEY }}
+          PGP_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}

   quantize:
     runs-on: ubuntu-latest

--------------------------------------------------------------------------------

@@ -18,9 +18,7 @@ jobs:
     matrix:
       config:
         - { tag: "main", dockerfile: ".devops/main.Dockerfile", platform: "linux/amd64,linux/arm64" }
-        #TODO: the cuda image keeps failing - disable for now
-        # https://github.com/ggerganov/whisper.cpp/actions/runs/11019444428/job/30602020339
-        #- { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }
+        - { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }

     steps:
       - name: Check out the repo

@@ -45,7 +43,7 @@ jobs:
       with:
         context: .
         push: true
-        platforms: ${{ matrix.config.platform }}
+        platforms: ${{ matrix.config.platforms }}
         tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
         file: ${{ matrix.config.dockerfile }}

@@ -54,6 +52,6 @@ jobs:
       with:
         context: .
         push: ${{ github.event_name == 'push' }}
-        platforms: ${{ matrix.config.platform }}
+        platforms: ${{ matrix.config.platforms }}
         tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}"
         file: ${{ matrix.config.dockerfile }}

--------------------------------------------------------------------------------
.gitignore (6 changes)

@@ -1,16 +1,13 @@
 *.o
 *.a
-*.d
 .cache/
 .coreml/
 .test/
-.venv/
 .vs/
 .vscode/
 .DS_Store
 .vimspector.json
 /CMakeSettings.json
-/talk-llama.dSYM/
 build/
 build-*/

@@ -20,9 +17,6 @@ build-*/
 .swiftpm
 *.metallib
-ggml-metal-embed.metal
-ggml-metal-embed.metal.tmp
 /main
 /stream
 /command

--------------------------------------------------------------------------------

@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories.
 project("whisper.cpp" C CXX)
-project("whisper.cpp" VERSION 1.7.2)
+project("whisper.cpp" VERSION 1.6.2)
 include(CheckIncludeFileCXX)

 set(SOVERSION 1)

@@ -120,10 +120,7 @@ whisper_option_depr(WARNING WHISPER_SYCL_F16 GGML_SYCL_F16)
 # build the library
 #

-if (NOT TARGET ggml)
-    add_subdirectory(ggml)
-    # ... otherwise assume ggml is added by a parent CMakeLists.txt
-endif()
+add_subdirectory(ggml)
 add_subdirectory(src)

 #

@@ -164,6 +161,18 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake
               ${CMAKE_CURRENT_BINARY_DIR}/whisper-version.cmake
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/whisper)

+install(
+    FILES convert-hf-to-gguf.py
+    PERMISSIONS
+        OWNER_READ
+        OWNER_WRITE
+        OWNER_EXECUTE
+        GROUP_READ
+        GROUP_EXECUTE
+        WORLD_READ
+        WORLD_EXECUTE
+    DESTINATION ${CMAKE_INSTALL_BINDIR})
+
 configure_file(cmake/whisper.pc.in
     "${CMAKE_CURRENT_BINARY_DIR}/whisper.pc"
     @ONLY)

--------------------------------------------------------------------------------
Makefile (336 changes)

@@ -3,11 +3,12 @@ BUILD_TARGETS = \
     main \
     bench \
     quantize \
-    server
+    server \
+    tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = \
-    tests/test-c.o
+    tests/test-backend-ops

 # Deprecation aliases
 ifdef WHISPER_CUBLAS

@@ -134,18 +135,14 @@ ifdef GGML_RPC
 BUILD_TARGETS += rpc-server
 endif

-ifdef GGML_VULKAN
-BUILD_TARGETS += vulkan-shaders-gen
-endif
-
 ifeq ($(shell sdl2-config --cflags --libs 2>/dev/null),)
 else
 BUILD_TARGETS += \
     command \
     stream \
     lsp \
+    talk \
     talk-llama
-# talk (TODO: disalbed)
 endif

 default: $(BUILD_TARGETS)

@@ -254,10 +251,7 @@ ifdef WHISPER_DEBUG
 MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
 endif
 else
 MK_CPPFLAGS  += -DNDEBUG
-MK_CFLAGS    += -O3
-MK_CXXFLAGS  += -O3
-MK_NVCCFLAGS += -O3
 endif

 ifdef WHISPER_SANITIZE_THREAD
@@ -444,17 +438,17 @@ endif
 else
 MK_CFLAGS   += -march=rv64gcv -mabi=lp64d
 MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
-endif # RISCV
+endif

 ifndef GGML_NO_ACCELERATE
 # Mac OS - include Accelerate framework.
 # `-framework Accelerate` works both with Apple Silicon and Mac Intel
 ifeq ($(UNAME_S),Darwin)
-    MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE
+    MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS
     MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
     MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
     MK_LDFLAGS  += -framework Accelerate
-    OBJ_GGML    += ggml/src/ggml-blas/ggml-blas.o
+    OBJ_GGML    += ggml/src/ggml-blas.o
 endif
 endif # GGML_NO_ACCELERATE
@@ -464,38 +458,29 @@ ifndef GGML_NO_OPENMP
 MK_CXXFLAGS += -fopenmp
 endif # GGML_NO_OPENMP

-ifdef WHISPER_COREML
-MK_CXXFLAGS += -DWHISPER_USE_COREML
-LDFLAGS     += -framework Foundation -framework CoreML
-
-ifdef WHISPER_COREML_ALLOW_FALLBACK
-MK_CXXFLAGS += -DWHISPER_COREML_ALLOW_FALLBACK
-endif
-endif # WHISPER_COREML
-
 ifdef GGML_OPENBLAS
 MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
 MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas)
 MK_LDFLAGS  += $(shell pkg-config --libs openblas)
-OBJ_GGML    += ggml/src/ggml-blas/ggml-blas.o
+OBJ_GGML    += ggml/src/ggml-blas.o
 endif # GGML_OPENBLAS

 ifdef GGML_OPENBLAS64
 MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
 MK_CFLAGS   += $(shell pkg-config --cflags-only-other openblas64)
 MK_LDFLAGS  += $(shell pkg-config --libs openblas64)
-OBJ_GGML    += ggml/src/ggml-blas/ggml-blas.o
+OBJ_GGML    += ggml/src/ggml-blas.o
 endif # GGML_OPENBLAS64

 ifdef GGML_BLIS
 MK_CPPFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis
 MK_LDFLAGS  += -lblis -L/usr/local/lib
-OBJ_GGML    += ggml/src/ggml-blas/ggml-blas.o
+OBJ_GGML    += ggml/src/ggml-blas.o
 endif # GGML_BLIS

 ifdef GGML_RPC
 MK_CPPFLAGS += -DGGML_USE_RPC
-OBJ_GGML    += ggml/src/ggml-rpc/ggml-rpc.o
+OBJ_GGML    += ggml/src/ggml-rpc.o
 endif # GGML_RPC

 OBJ_CUDA_TMPL = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu))
@@ -516,15 +501,16 @@ ifdef GGML_CUDA
 CUDA_PATH ?= /usr/local/cuda
 endif

-#MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
-#MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcufft -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
-MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
-MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
+MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
+MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcufft -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
 MK_NVCCFLAGS += -use_fast_math

-OBJ_GGML     += ggml/src/ggml-cuda/ggml-cuda.o
+OBJ_GGML     += ggml/src/ggml-cuda.o
 OBJ_GGML     += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
 OBJ_GGML     += $(OBJ_CUDA_TMPL)
+OBJ_WHISPER  += src/whisper-mel-cuda.o

 ifdef WHISPER_FATAL_WARNINGS
 MK_NVCCFLAGS += -Werror all-warnings
 endif # WHISPER_FATAL_WARNINGS

@@ -624,21 +610,25 @@ ggml/src/ggml-cuda/%.o: \
     ggml/src/ggml-cuda/common.cuh
     $(NVCC_COMPILE)

-ggml/src/ggml-cuda/ggml-cuda.o: \
-    ggml/src/ggml-cuda/ggml-cuda.cu \
+ggml/src/ggml-cuda.o: \
+    ggml/src/ggml-cuda.cu \
+    ggml/include/ggml-cuda.h \
     ggml/include/ggml.h \
     ggml/include/ggml-backend.h \
-    ggml/include/ggml-cuda.h \
     ggml/src/ggml-backend-impl.h \
     ggml/src/ggml-common.h \
     $(wildcard ggml/src/ggml-cuda/*.cuh)
     $(NVCC_COMPILE)

+src/whisper-mel-cuda.o: src/whisper-mel-cuda.cu src/whisper-mel-cuda.hpp
+    $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
+
 endif # GGML_CUDA
 ifdef GGML_VULKAN
 MK_CPPFLAGS += -DGGML_USE_VULKAN
-MK_LDFLAGS  += $(shell pkg-config --libs vulkan)
-OBJ_GGML    += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o
+MK_LDFLAGS  += -lvulkan
+OBJ_GGML    += ggml/src/ggml-vulkan.o

 ifdef GGML_VULKAN_CHECK_RESULTS
 MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS

@@ -652,10 +642,6 @@ ifdef GGML_VULKAN_MEMORY_DEBUG
 MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG
 endif

-ifdef GGML_VULKAN_PERF
-MK_CPPFLAGS += -DGGML_VULKAN_PERF
-endif
-
 ifdef GGML_VULKAN_VALIDATE
 MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE
 endif

@@ -664,28 +650,10 @@ ifdef GGML_VULKAN_RUN_TESTS
 MK_CPPFLAGS += -DGGML_VULKAN_RUN_TESTS
 endif

-GLSLC_CMD = glslc
-_ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen
-_ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp
-_ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp
-_ggml_vk_input_dir = ggml/src/vulkan-shaders
-_ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp)
-
-ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
-    $(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@
-
-$(_ggml_vk_header): $(_ggml_vk_source)
-
-$(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen
-    $(_ggml_vk_genshaders_cmd) \
-        --glslc $(GLSLC_CMD) \
-        --input-dir $(_ggml_vk_input_dir) \
-        --target-hpp $(_ggml_vk_header) \
-        --target-cpp $(_ggml_vk_source)
-
-vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
-    $(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
+ggml/src/ggml-vulkan.o: \
+    ggml/src/ggml-vulkan.cpp \
+    ggml/include/ggml-vulkan.h
+    $(CXX) $(CXXFLAGS) -c $< -o $@

 endif # GGML_VULKAN
 ifdef GGML_HIPBLAS

@@ -751,43 +719,50 @@ endif # GGML_HIPBLAS
 ifdef GGML_METAL
 MK_CPPFLAGS += -DGGML_USE_METAL
 MK_LDFLAGS  += -framework Foundation -framework Metal -framework MetalKit
-OBJ_GGML    += ggml/src/ggml-metal/ggml-metal.o
+OBJ_GGML    += ggml/src/ggml-metal.o

 ifdef GGML_METAL_NDEBUG
 MK_CPPFLAGS += -DGGML_METAL_NDEBUG
 endif

 ifdef GGML_METAL_EMBED_LIBRARY
 MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
-OBJ_GGML    += ggml/src/ggml-metal/ggml-metal-embed.o
+OBJ_GGML    += ggml/src/ggml-metal-embed.o
 endif
 endif # GGML_METAL

+ifdef WHISPER_COREML
+MK_CXXFLAGS += -DWHISPER_USE_COREML
+LDFLAGS     += -framework Foundation -framework CoreML
+
+ifdef WHISPER_COREML_ALLOW_FALLBACK
+MK_CXXFLAGS += -DWHISPER_COREML_ALLOW_FALLBACK
+endif
+endif
+
+# ===
+
 ifdef GGML_METAL
-ggml/src/ggml-metal/ggml-metal.o: \
-    ggml/src/ggml-metal/ggml-metal.m \
-    ggml/src/ggml-metal/ggml-metal-impl.h \
+ggml/src/ggml-metal.o: \
+    ggml/src/ggml-metal.m \
     ggml/include/ggml-metal.h \
     ggml/include/ggml.h
     $(CC) $(CFLAGS) -c $< -o $@

 ifdef GGML_METAL_EMBED_LIBRARY
-ggml/src/ggml-metal/ggml-metal-embed.o: \
-    ggml/src/ggml-metal/ggml-metal.metal \
-    ggml/src/ggml-metal/ggml-metal-impl.h \
+ggml/src/ggml-metal-embed.o: \
+    ggml/src/ggml-metal.metal \
     ggml/src/ggml-common.h
     @echo "Embedding Metal library"
-    @sed -e '/__embed_ggml-common.h__/r ggml/src/ggml-common.h' -e '/__embed_ggml-common.h__/d' < ggml/src/ggml-metal/ggml-metal.metal > ggml/src/ggml-metal/ggml-metal-embed.metal.tmp
-    @sed -e '/#include "ggml-metal-impl.h"/r ggml/src/ggml-metal/ggml-metal-impl.h' -e '/#include "ggml-metal-impl.h"/d' < ggml/src/ggml-metal/ggml-metal-embed.metal.tmp > ggml/src/ggml-metal/ggml-metal-embed.metal
-    $(eval TEMP_ASSEMBLY=$(shell mktemp -d))
-    @echo ".section __DATA, __ggml_metallib"  > $(TEMP_ASSEMBLY)/ggml-metal-embed.s
-    @echo ".globl _ggml_metallib_start"      >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
-    @echo "_ggml_metallib_start:"            >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
-    @echo ".incbin \"ggml/src/ggml-metal/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
-    @echo ".globl _ggml_metallib_end"        >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
-    @echo "_ggml_metallib_end:"              >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
-    $(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
-    @rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
-    @rmdir ${TEMP_ASSEMBLY}
+    @sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal
+    $(eval TEMP_ASSEMBLY=$(shell mktemp))
+    @echo ".section __DATA, __ggml_metallib"  > $(TEMP_ASSEMBLY)
+    @echo ".globl _ggml_metallib_start"      >> $(TEMP_ASSEMBLY)
+    @echo "_ggml_metallib_start:"            >> $(TEMP_ASSEMBLY)
+    @echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
+    @echo ".globl _ggml_metallib_end"        >> $(TEMP_ASSEMBLY)
+    @echo "_ggml_metallib_end:"              >> $(TEMP_ASSEMBLY)
+    @$(AS) $(TEMP_ASSEMBLY) -o $@
+    @rm -f ${TEMP_ASSEMBLY}
 endif
 endif # GGML_METAL
@@ -803,17 +778,9 @@ endif
 OBJ_GGML += \
     ggml/src/ggml.o \
-    ggml/src/ggml-aarch64.o \
     ggml/src/ggml-alloc.o \
     ggml/src/ggml-backend.o \
-    ggml/src/ggml-backend-reg.o \
-    ggml/src/ggml-opt.o \
-    ggml/src/ggml-quants.o \
-    ggml/src/ggml-threading.o \
-    ggml/src/ggml-cpu/ggml-cpu.o \
-    ggml/src/ggml-cpu/ggml-cpu-cpp.o \
-    ggml/src/ggml-cpu/ggml-cpu-aarch64.o \
-    ggml/src/ggml-cpu/ggml-cpu-quants.o
+    ggml/src/ggml-quants.o

 OBJ_WHISPER += \
     src/whisper.o
@@ -918,64 +885,101 @@ endif
 # Build libraries
 #

-LIB_GGML   = libggml.so
-LIB_GGML_S = libggml.a
-
-LIB_LLAMA   = libllama.so
-LIB_LLAMA_S = libllama.a
-
-LIB_COMMON   = libcommon.so
-LIB_COMMON_S = libcommon.a
-
-LIB_COMMON_SDL   = libcommon-sdl.so
-LIB_COMMON_SDL_S = libcommon-sdl.a
-
-# Targets
-BUILD_TARGETS += $(LIB_GGML) $(LIB_GGML_S) $(LIB_LLAMA) $(LIB_LLAMA_S) $(LIB_COMMON) $(LIB_COMMON_S)
-
-# Dependency files
-DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
-
-# Default target
-all: $(BUILD_TARGETS)
-
-# Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
-# g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml
-ggml/src/ggml-cpu/ggml-cpu-cpp.o: \
-    ggml/src/ggml-cpu/ggml-cpu.cpp \
-    ggml/include/ggml-backend.h \
-    ggml/include/ggml.h \
-    ggml/include/ggml-alloc.h \
-    ggml/src/ggml-backend-impl.h \
-    ggml/include/ggml-cpu.h \
-    ggml/src/ggml-impl.h
-    $(CXX) $(CXXFLAGS) -c $< -o $@
-
-# Rules for building object files
-ggml/%.o: ggml/%.c
-    $(CC) $(CFLAGS) -MMD -c $< -o $@
-
-ggml/%.o: ggml/%.cpp
-    $(CXX) $(CXXFLAGS) -MMD -c $< -o $@
-
-src/%.o: src/%.cpp
-    $(CXX) $(CXXFLAGS) -MMD -c $< -o $@
-
-examples/%.o: examples/%.cpp
-    $(CXX) $(CXXFLAGS) -MMD -c $< -o $@
-
-# Rules for building libraries
-$(LIB_GGML): $(OBJ_GGML)
-    $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
-
-$(LIB_GGML_S): $(OBJ_GGML)
-    ar rcs $(LIB_GGML_S) $^
-
-$(LIB_LLAMA): $(OBJ_LLAMA) $(LIB_GGML)
-    $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
-
-$(LIB_LLAMA_S): $(OBJ_LLAMA)
-    ar rcs $(LIB_LLAMA_S) $^
+# ggml
+
+ggml/src/ggml.o: \
+    ggml/src/ggml.c \
+    ggml/include/ggml.h
+    $(CC) $(CFLAGS) -c $< -o $@
+
+ggml/src/ggml-alloc.o: \
+    ggml/src/ggml-alloc.c \
+    ggml/include/ggml.h \
+    ggml/include/ggml-alloc.h
+    $(CC) $(CFLAGS) -c $< -o $@
+
+ggml/src/ggml-backend.o: \
+    ggml/src/ggml-backend.c \
+    ggml/include/ggml.h \
+    ggml/include/ggml-backend.h
+    $(CC) $(CFLAGS) -c $< -o $@
+
+ggml/src/ggml-quants.o: \
+    ggml/src/ggml-quants.c \
+    ggml/include/ggml.h \
+    ggml/src/ggml-quants.h \
+    ggml/src/ggml-common.h
+    $(CC) $(CFLAGS) -c $< -o $@
+
+ggml/src/ggml-blas.o: \
+    ggml/src/ggml-blas.cpp \
+    ggml/include/ggml-blas.h
+    $(CXX) $(CXXFLAGS) -c $< -o $@
+
+ifdef GGML_LLAMAFILE
+ggml/src/sgemm.o: \
+    ggml/src/sgemm.cpp \
+    ggml/src/sgemm.h \
+    ggml/include/ggml.h
+    $(CXX) $(CXXFLAGS) -c $< -o $@
+endif # GGML_LLAMAFILE
+
+ifdef GGML_RPC
+ggml/src/ggml-rpc.o: \
+    ggml/src/ggml-rpc.cpp \
+    ggml/include/ggml-rpc.h
+    $(CXX) $(CXXFLAGS) -c $< -o $@
+endif # GGML_RPC
+
+$(LIB_GGML): \
+    $(OBJ_GGML)
+    $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
+
+$(LIB_GGML_S): \
+    $(OBJ_GGML)
+    ar rcs $(LIB_GGML_S) $^
+
+# whisper
+
+src/whisper.o: \
+    src/whisper.cpp \
+    src/whisper-mel.hpp \
+    include/whisper.h \
+    ggml/include/ggml.h \
+    ggml/include/ggml-alloc.h \
+    ggml/include/ggml-backend.h \
+    ggml/include/ggml-cuda.h \
+    ggml/include/ggml-metal.h
+    $(CXX) $(CXXFLAGS) -c $< -o $@
+
+$(LIB_WHISPER): \
+    $(OBJ_WHISPER) \
+    $(LIB_GGML)
+    $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
+
+$(LIB_WHISPER_S): \
+    $(OBJ_WHISPER)
+    ar rcs $(LIB_WHISPER_S) $^
+
+# common
+
+examples/common.o: \
+    examples/common.cpp \
+    examples/common.h
+    $(CXX) $(CXXFLAGS) -c $< -o $@
+
+examples/common-ggml.o: \
+    examples/common-ggml.cpp \
+    examples/common-ggml.h
+    $(CXX) $(CXXFLAGS) -c $< -o $@
+
+$(LIB_COMMON): \
+    $(OBJ_COMMON)
+    $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
+
+$(LIB_COMMON_S): \
+    $(OBJ_COMMON)
+    ar rcs $(LIB_COMMON_S) $^

 # common-sdl
@@ -987,21 +991,34 @@ examples/common-sdl.o: \
     examples/common-sdl.h
     $(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $@

-$(LIB_COMMON): $(OBJ_COMMON) $(LIB_LLAMA) $(LIB_GGML)
-    $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
+$(LIB_COMMON_SDL): \
+    $(OBJ_SDL)
+    $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) $(LDFLAGS_SDL)

-$(LIB_COMMON_S): $(OBJ_COMMON)
-    ar rcs $(LIB_COMMON_S) $^
+$(LIB_COMMON_SDL_S): \
+    $(OBJ_SDL)
+    ar rcs $(LIB_COMMON_SDL_S) $^

-# Include dependency files
--include $(DEP_FILES)
-
-# Clean rule
 clean:
-    rm -vrf $(BUILD_TARGETS) $(TEST_TARGETS)
-    rm -rvf *.a *.dll *.so *.dot
-    find ggml src tests examples -type f -name "*.o" -delete
-    find ggml src tests examples -type f -name "*.d" -delete
+    rm -vrf *.dot $(BUILD_TARGETS) $(TEST_TARGETS)
+    rm -rvf src/*.o
+    rm -rvf src/coreml/*.o
+    rm -rvf tests/*.o
+    rm -rvf examples/*.o
+    rm -rvf *.a
+    rm -rvf *.dll
+    rm -rvf *.so
+    rm -rvf *.dot
+    rm -rvf ggml/*.a
+    rm -rvf ggml/*.dll
+    rm -rvf ggml/*.so
+    rm -vrf ggml/src/*.o
+    rm -vrf ggml/src/ggml-metal-embed.metal
+    rm -vrf ggml/src/ggml-cuda/*.o
+    rm -vrf ggml/src/ggml-cuda/template-instances/*.o
+    rm -rvf $(BUILD_TARGETS)
+    rm -rvf $(TEST_TARGETS)
+    find examples -type f -name "*.o" -delete
 #
 # Examples
 #

@@ -1018,6 +1035,9 @@ main: examples/main/main.cpp \
     $(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON)
     $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+    @echo
+    @echo '====  Run ./llama-cli -h for help.  ===='
+    @echo

 bench: examples/bench/bench.cpp \
     $(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON)

@@ -1049,14 +1069,12 @@ lsp: examples/lsp/lsp.cpp \
     $(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LDFLAGS_SDL)

-# TODO: disabled until update
-# https://github.com/ggerganov/whisper.cpp/issues/1818
-#talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp \
-#    $(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON) $(OBJ_SDL)
-#    $(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $(call GET_OBJ_FILE, $<)
-#    $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LDFLAGS_SDL)
+talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp \
+    $(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON) $(OBJ_SDL)
+    $(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $(call GET_OBJ_FILE, $<)
+    $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LDFLAGS_SDL)

-talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/llama-vocab.cpp examples/talk-llama/llama-grammar.cpp examples/talk-llama/llama-sampling.cpp examples/talk-llama/unicode.cpp examples/talk-llama/unicode-data.cpp \
+talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/unicode.cpp examples/talk-llama/unicode-data.cpp \
     $(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON) $(OBJ_SDL)
     $(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $(call GET_OBJ_FILE, $<)
     $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LDFLAGS_SDL)

@@ -1070,6 +1088,11 @@ tests: $(TEST_TARGETS)
 tests/test-c.o: tests/test-c.c include/whisper.h
     $(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@

+tests/test-backend-ops: tests/test-backend-ops.cpp \
+    $(OBJ_GGML)
+    $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+    $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 #
 # Audio samples
 #

@@ -1115,9 +1138,8 @@ samples:
 .PHONY: large-v1
 .PHONY: large-v2
 .PHONY: large-v3
-.PHONY: large-v3-turbo

-tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3 large-v3-turbo: main
+tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3: main
     bash ./models/download-ggml-model.sh $@
     @echo ""
     @echo "==============================================="

--------------------------------------------------------------------------------

@@ -18,40 +18,31 @@ let package = Package(
         name: "whisper",
         path: ".",
         exclude: [
-            "build",
             "bindings",
             "cmake",
-            "coreml",
             "examples",
-            "scripts",
+            "extra",
             "models",
             "samples",
             "tests",
             "CMakeLists.txt",
-            "Makefile",
-            "ggml/src/ggml-metal/ggml-metal-embed.metal"
+            "Makefile"
         ],
         sources: [
             "ggml/src/ggml.c",
             "src/whisper.cpp",
-            "ggml/src/ggml-aarch64.c",
             "ggml/src/ggml-alloc.c",
-            "ggml/src/ggml-backend.cpp",
-            "ggml/src/ggml-backend-reg.cpp",
-            "ggml/src/ggml-cpu/ggml-cpu.c",
-            "ggml/src/ggml-cpu/ggml-cpu.cpp",
-            "ggml/src/ggml-cpu/ggml-cpu-aarch64.c",
-            "ggml/src/ggml-cpu/ggml-cpu-quants.c",
+            "ggml/src/ggml-backend.c",
             "ggml/src/ggml-quants.c",
-            "ggml/src/ggml-threading.cpp",
-            "ggml/src/ggml-metal/ggml-metal.m"
+            "ggml/src/ggml-metal.m"
         ],
-        resources: [.process("ggml/src/ggml-metal/ggml-metal.metal")],
+        resources: [.process("ggml-metal.metal")],
         publicHeadersPath: "spm-headers",
         cSettings: [
             .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-            .unsafeFlags(["-fno-objc-arc"]),
-            .headerSearchPath("ggml/src"),
             .define("GGML_USE_ACCELERATE"),
+            .unsafeFlags(["-fno-objc-arc"]),
             .define("GGML_USE_METAL")
             // NOTE: NEW_LAPACK will required iOS version 16.4+
             // We should consider add this in the future when we drop support for iOS 14

--------------------------------------------------------------------------------
README.md (111 changes)

@@ -7,23 +7,21 @@
 [![Conan Center](https://shields.io/conan/v/whisper-cpp)](https://conan.io/center/whisper-cpp)
 [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)

-Stable: [v1.7.2](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.7.2) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+Stable: [v1.6.2](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.6.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)

 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

 - Plain C/C++ implementation without dependencies
-- Apple Silicon first-class citizen - optimized via ARM NEON, Accelerate framework, Metal and [Core ML](#core-ml-support)
+- Apple Silicon first-class citizen - optimized via ARM NEON, Accelerate framework, Metal and [Core ML](https://github.com/ggerganov/whisper.cpp#core-ml-support)
 - AVX intrinsics support for x86 architectures
 - VSX intrinsics support for POWER architectures
 - Mixed F16 / F32 precision
-- [Integer quantization support](#quantization)
+- [4-bit and 5-bit integer quantization support](https://github.com/ggerganov/whisper.cpp#quantization)
 - Zero memory allocations at runtime
-- [Vulkan support](#vulkan-gpu-support)
 - Support for CPU-only inference
-- [Efficient GPU support for NVIDIA](#nvidia-gpu-support)
-- [OpenVINO Support](#openvino-support)
-- [Ascend NPU Support](#ascend-npu-support)
-- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/include/whisper.h)
+- [Efficient GPU support for NVIDIA](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
+- [OpenVINO Support](https://github.com/ggerganov/whisper.cpp#openvino-support)
+- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)

@@ -35,9 +33,9 @@ Supported platforms:
 - [x] [WebAssembly](examples/whisper.wasm)
 - [x] Windows ([MSVC](https://github.com/ggerganov/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggerganov/whisper.cpp/issues/168)]
 - [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/discussions/166)
-- [x] [Docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)
+- [x] [docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)

-The entire high-level implementation of the model is contained in [whisper.h](include/whisper.h) and [whisper.cpp](src/whisper.cpp).
+The entire high-level implementation of the model is contained in [whisper.h](whisper.h) and [whisper.cpp](whisper.cpp).
 The rest of the code is part of the [`ggml`](https://github.com/ggerganov/ggml) machine learning library.

 Having such a lightweight implementation of the model allows to easily integrate it in different platforms and applications.

@@ -57,8 +55,8 @@ Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)

 ## Implementation details

-- The core tensor operations are implemented in C ([ggml.h](ggml/include/ggml.h) / [ggml.c](ggml/src/ggml.c))
+- The core tensor operations are implemented in C ([ggml.h](ggml.h) / [ggml.c](ggml.c))
-- The transformer model and the high-level C-style API are implemented in C++ ([whisper.h](include/whisper.h) / [whisper.cpp](src/whisper.cpp))
+- The transformer model and the high-level C-style API are implemented in C++ ([whisper.h](whisper.h) / [whisper.cpp](whisper.cpp))
 - Sample usage is demonstrated in [main.cpp](examples/main)
 - Sample real-time audio transcription from the microphone is demonstrated in [stream.cpp](examples/stream)
 - Various other examples are available in the [examples](examples) folder
@@ -73,23 +71,17 @@ First clone the repository:
 git clone https://github.com/ggerganov/whisper.cpp.git
 ```

-Navigate into the directory:
-
-```
-cd whisper.cpp
-```
-
 Then, download one of the Whisper [models](models/README.md) converted in [`ggml` format](#ggml-format). For example:

 ```bash
-sh ./models/download-ggml-model.sh base.en
+bash ./models/download-ggml-model.sh base.en
 ```

 Now build the [main](examples/main) example and transcribe an audio file like this:

 ```bash
 # build the main example
-make -j
+make

 # transcribe an audio file
 ./main -f samples/jfk.wav

@@ -100,7 +92,7 @@ make -j

 For a quick demo, simply run `make base.en`:

 ```text
-$ make -j base.en
+$ make base.en

 cc  -I. -O3 -std=c11 -pthread -DGGML_USE_ACCELERATE -c ggml.c -o ggml.o
 c++ -I. -I./examples -O3 -std=c++11 -pthread -c whisper.cpp -o whisper.o

@@ -153,7 +145,7 @@ options:
   -ng,       --no-gpu        [false  ] disable GPU

-sh ./models/download-ggml-model.sh base.en
+bash ./models/download-ggml-model.sh base.en
 Downloading ggml model base.en ...
 ggml-base.en.bin          100%[========================>] 141.11M  6.34MB/s    in 24s
 Done! Model 'base.en' saved in 'models/ggml-base.en.bin'

@@ -224,7 +216,7 @@ ffmpeg -i input.mp3 -ar 16000 -ac 1 -c:a pcm_s16le output.wav
 If you want some extra audio samples to play with, simply run:

 ```
-make -j samples
+make samples
 ```

 This will download a few more audio files from Wikipedia and convert them to 16-bit WAV format via `ffmpeg`.

@@ -232,18 +224,17 @@ This will download a few more audio files from Wikipedia and convert them to 16-
 You can download and run the other models as follows:

 ```
-make -j tiny.en
-make -j tiny
-make -j base.en
-make -j base
-make -j small.en
-make -j small
-make -j medium.en
-make -j medium
-make -j large-v1
-make -j large-v2
-make -j large-v3
-make -j large-v3-turbo
+make tiny.en
+make tiny
+make base.en
+make base
+make small.en
+make small
+make medium.en
+make medium
+make large-v1
+make large-v2
+make large-v3
 ```

@@ -265,7 +256,7 @@ Here are the steps for creating and using a quantized model:

 ```bash
 # quantize a model with Q5_0 method
-make -j quantize
+make quantize
 ./quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0

 # run the examples as usual, specifying the quantized model file
@@ -430,16 +421,6 @@ make clean
 GGML_CUDA=1 make -j
 ```

-## Vulkan GPU support
-Cross-vendor solution which allows you to accelerate workload on your GPU.
-First, make sure your graphics card driver provides support for Vulkan API.
-
-Now build `whisper.cpp` with Vulkan support:
-```
-make clean
-make GGML_VULKAN=1 -j
-```
-
 ## BLAS CPU support via OpenBLAS

 Encoder processing can be accelerated on the CPU via OpenBLAS.

@@ -467,39 +448,6 @@ cmake -DWHISPER_MKL=ON ..
 WHISPER_MKL=1 make -j
 ```

-## Ascend NPU support
-
-Ascend NPU provides inference acceleration via [`CANN`](https://www.hiascend.com/en/software/cann) and AI cores.
-
-First, check if your Ascend NPU device is supported:
-
-**Verified devices**
-
-| Ascend NPU    | Status  |
-|:-------------:|:-------:|
-| Atlas 300T A2 | Support |
-
-Then, make sure you have installed [`CANN toolkit`](https://www.hiascend.com/en/software/cann/community) . The lasted version of CANN is recommanded.
-
-Now build `whisper.cpp` with CANN support:
-
-```
-mkdir build
-cd build
-cmake .. -D GGML_CANN=on
-make -j
-```
-
-Run the inference examples as usual, for example:
-
-```
-./build/bin/main -f samples/jfk.wav -m models/ggml-base.en.bin -t 8
-```
-
-*Notes:*
-
-- If you have trouble with Ascend NPU device, please create a issue with **[CANN]** prefix/tag.
-- If you run successfully with your Ascend NPU device, please help update the table `Verified devices`.
-
 ## Docker

 ### Prerequisites

@@ -636,7 +584,7 @@ The [stream](examples/stream) tool samples the audio every half a second and run
 More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).

 ```bash
-make stream -j
+make stream
 ./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
 ```

@@ -803,7 +751,7 @@ took to execute it. The results are summarized in the following Github issue:

 [Benchmark results](https://github.com/ggerganov/whisper.cpp/issues/89)

-Additionally a script to run whisper.cpp with different models and audio files is provided [bench.py](scripts/bench.py).
+Additionally a script to run whisper.cpp with different models and audio files is provided [bench.py](bench.py).

 You can run it with the following command, by default it will run against any standard model in the models folder.

@@ -850,7 +798,6 @@ For more details, see the conversion script [models/convert-pt-to-ggml.py](model
   - [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
   - [AIWintermuteAI/whispercpp](https://github.com/AIWintermuteAI/whispercpp) (Updated fork of aarnphm/whispercpp)
   - [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
-  - [abdeladim-s/pywhispercpp](https://github.com/abdeladim-s/pywhispercpp) (Pybind11)
 - [x] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
 - [x] Unity: [macoron/whisper.unity](https://github.com/Macoron/whisper.unity)

--------------------------------------------------------------------------------

@@ -14,14 +14,9 @@ GGML_METAL_PATH_RESOURCES := $(abspath ../..)
 BUILD_DIR    := build
 MODELS_DIR   := models
 EXAMPLES_DIR := $(wildcard examples/*)
-INCLUDE_PATH := $(abspath ../../include):$(abspath ../../ggml/include)
+INCLUDE_PATH := $(abspath ../..)
 LIBRARY_PATH := $(abspath ../..)

-ifeq ($(GGML_CUDA),1)
-    LIBRARY_PATH := $(LIBRARY_PATH):$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib/
-    BUILD_FLAGS  := -ldflags "-extldflags '-lcudart -lcuda -lcublas'"
-endif
-
 ifeq ($(UNAME_S),Darwin)
     EXT_LDFLAGS := -framework Foundation -framework Metal -framework MetalKit
 endif

--------------------------------------------------------------------------------

@@ -62,12 +62,6 @@ This will compile a static `libwhisper.a` in a `build` folder, download a model

 make examples
 ```

-To build using cuda support add `GGML_CUDA=1`:
-
-```bash
-GGML_CUDA=1 make examples
-```
-
 The examples are placed in the `build` directory. Once built, you can download all the models with the following command:

 ```bash

--------------------------------------------------------------------------------

@@ -24,7 +24,7 @@ const (
 var (
 	// The models which will be downloaded, if no model is specified as an argument
-	modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large-v3", "large-v3-turbo"}
+	modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large-v3"}
 )

 var (

--------------------------------------------------------------------------------

@@ -1,10 +1,10 @@
 module github.com/ggerganov/whisper.cpp/bindings/go

-go 1.23
+go 1.19

 require (
 	github.com/go-audio/wav v1.1.0
-	github.com/stretchr/testify v1.9.0
+	github.com/stretchr/testify v1.8.1
 )

 require (

--------------------------------------------------------------------------------

@@ -1,3 +1,4 @@
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=

@@ -8,9 +9,15 @@ github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
 github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
-github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
-github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
+github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
+github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
+github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
+github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

--------------------------------------------------------------------------------

@@ -119,28 +119,6 @@ func (p *Params) SetAudioCtx(n int) {
 	p.audio_ctx = C.int(n)
 }

-func (p *Params) SetMaxContext(n int) {
-	p.n_max_text_ctx = C.int(n)
-}
-
-func (p *Params) SetBeamSize(n int) {
-	p.beam_search.beam_size = C.int(n)
-}
-
-func (p *Params) SetEntropyThold(t float32) {
-	p.entropy_thold = C.float(t)
-}
-
-func (p *Params) SetTemperature(t float32) {
-	p.temperature = C.float(t)
-}
-
-// Sets the fallback temperature incrementation
-// Pass -1.0 to disable this feature
-func (p *Params) SetTemperatureFallback(t float32) {
-	p.temperature_inc = C.float(t)
-}
-
 // Set initial prompt
 func (p *Params) SetInitialPrompt(prompt string) {
 	p.initial_prompt = C.CString(prompt)

@@ -171,10 +149,6 @@ func (p *Params) String() string {
 	str += fmt.Sprintf(" duration_ms=%d", p.duration_ms)
 	str += fmt.Sprintf(" audio_ctx=%d", p.audio_ctx)
 	str += fmt.Sprintf(" initial_prompt=%s", C.GoString(p.initial_prompt))
-	str += fmt.Sprintf(" entropy_thold=%f", p.entropy_thold)
-	str += fmt.Sprintf(" temperature=%f", p.temperature)
-	str += fmt.Sprintf(" temperature_inc=%f", p.temperature_inc)
-	str += fmt.Sprintf(" beam_size=%d", p.beam_search.beam_size)
 	if p.translate {
 		str += " translate"
 	}

View File

@ -125,32 +125,6 @@ func (context *context) SetAudioCtx(n uint) {
context.params.SetAudioCtx(int(n))
}
// Set maximum number of text context tokens to store
func (context *context) SetMaxContext(n int) {
context.params.SetMaxContext(n)
}
// Set Beam Size
func (context *context) SetBeamSize(n int) {
context.params.SetBeamSize(n)
}
// Set Entropy threshold
func (context *context) SetEntropyThold(t float32) {
context.params.SetEntropyThold(t)
}
// Set Temperature
func (context *context) SetTemperature(t float32) {
context.params.SetTemperature(t)
}
// Set the fallback temperature incrementation
// Pass -1.0 to disable this feature
func (context *context) SetTemperatureFallback(t float32) {
context.params.SetTemperatureFallback(t)
}
// Set initial prompt
func (context *context) SetInitialPrompt(prompt string) {
context.params.SetInitialPrompt(prompt)


@ -4,90 +4,52 @@ import (
"os" "os"
"testing" "testing"
"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper" // Packages
"github.com/go-audio/wav" whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
assert "github.com/stretchr/testify/assert" assert "github.com/stretchr/testify/assert"
) )
func TestSetLanguage(t *testing.T) { const (
assert := assert.New(t) ModelPath = "../../models/ggml-tiny.bin"
SamplePath = "../../samples/jfk.wav"
)
func Test_Whisper_000(t *testing.T) {
assert := assert.New(t)
if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
t.Skip("Skipping test, model not found:", ModelPath)
}
if _, err := os.Stat(SamplePath); os.IsNotExist(err) {
t.Skip("Skipping test, sample not found:", SamplePath)
}
// Load model
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
assert.NoError(model.Close())
t.Log("languages=", model.Languages())
}
func Test_Whisper_001(t *testing.T) {
assert := assert.New(t)
if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
t.Skip("Skipping test, model not found:", ModelPath)
}
if _, err := os.Stat(SamplePath); os.IsNotExist(err) {
t.Skip("Skipping test, sample not found:", SamplePath)
}
// Load model
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
context, err := model.NewContext()
// Get context for decoding
ctx, err := model.NewContext()
assert.NoError(err)
assert.NotNil(ctx)
// This returns an error since
// the model 'models/ggml-small.en.bin'
// that is loaded is not multilingual
err = context.SetLanguage("en")
assert.Error(err)
}
func TestContextModelIsMultilingual(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
context, err := model.NewContext()
assert.NoError(err)
isMultilingual := context.IsMultilingual()
// This returns false since
// the model 'models/ggml-small.en.bin'
// that is loaded is not multilingual
assert.False(isMultilingual)
}
func TestLanguage(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
context, err := model.NewContext()
assert.NoError(err)
// This always returns en since
// the model 'models/ggml-small.en.bin'
// that is loaded is not multilingual
expectedLanguage := "en"
actualLanguage := context.Language()
assert.Equal(expectedLanguage, actualLanguage)
}
func TestProcess(t *testing.T) {
assert := assert.New(t)
fh, err := os.Open(SamplePath)
assert.NoError(err)
defer fh.Close()
// Decode the WAV file - load the full buffer
dec := wav.NewDecoder(fh)
buf, err := dec.FullPCMBuffer()
assert.NoError(err)
assert.Equal(uint16(1), dec.NumChans)
data := buf.AsFloat32Buffer().Data
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
context, err := model.NewContext()
assert.NoError(err)
err = context.Process(data, nil, nil)
assert.NoError(err)
}


@ -38,22 +38,17 @@ type Context interface {
IsMultilingual() bool // Return true if the model is multilingual.
Language() string // Get language
SetOffset(time.Duration) // Set offset
SetDuration(time.Duration) // Set duration
SetThreads(uint) // Set number of threads to use
SetSplitOnWord(bool) // Set split on word flag
SetTokenThreshold(float32) // Set timestamp token probability threshold
SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
SetMaxSegmentLength(uint) // Set max segment length in characters
SetTokenTimestamps(bool) // Set token timestamps flag
SetMaxTokensPerSegment(uint) // Set max tokens per segment (0 = no limit)
SetAudioCtx(uint) // Set audio encoder context
SetMaxContext(n int) // Set maximum number of text context tokens to store
SetBeamSize(n int) // Set Beam Size
SetEntropyThold(t float32) // Set Entropy threshold
SetInitialPrompt(prompt string) // Set initial prompt
SetTemperature(t float32) // Set temperature
SetTemperatureFallback(t float32) // Set temperature incrementation
// Process mono audio data and return any errors.
// If defined, newly generated segments are passed to the


@ -1,91 +0,0 @@
package whisper_test
import (
"testing"
"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
assert "github.com/stretchr/testify/assert"
)
func TestNew(t *testing.T) {
assert := assert.New(t)
t.Run("valid model path", func(t *testing.T) {
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
})
t.Run("invalid model path", func(t *testing.T) {
invalidModelPath := "invalid-model-path.bin"
model, err := whisper.New(invalidModelPath)
assert.Error(err)
assert.Nil(model)
})
}
func TestClose(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
err = model.Close()
assert.NoError(err)
}
func TestNewContext(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
context, err := model.NewContext()
assert.NoError(err)
assert.NotNil(context)
}
func TestIsMultilingual(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
isMultilingual := model.IsMultilingual()
// This returns false since
// the model 'models/ggml-small.en.bin'
// that is loaded is not multilingual
assert.False(isMultilingual)
}
func TestLanguages(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
expectedLanguages := []string{
"en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl",
"ca", "nl", "ar", "sv", "it", "id", "hi", "fi", "vi", "he", "uk",
"el", "ms", "cs", "ro", "da", "hu", "ta", "no", "th", "ur", "hr",
"bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn",
"sr", "az", "sl", "kn", "et", "mk", "br", "eu", "is", "hy", "ne",
"mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si", "km", "sn",
"yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi",
"lo", "uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my",
"bo", "tl", "mg", "as", "tt", "haw", "ln", "ha", "ba", "jw", "su",
}
actualLanguages := model.Languages()
assert.Equal(expectedLanguages, actualLanguages)
}


@ -1,6 +0,0 @@
package whisper_test
const (
ModelPath = "../../models/ggml-small.en.bin"
SamplePath = "../../samples/jfk.wav"
)


@ -9,7 +9,7 @@ import (
// CGO
/*
#cgo LDFLAGS: -lwhisper -lm -lstdc++ -fopenmp
#cgo LDFLAGS: -lwhisper -lm -lstdc++
#cgo darwin LDFLAGS: -framework Accelerate -framework Metal -framework Foundation -framework CoreGraphics
#include <whisper.h>
#include <stdlib.h>


@ -1,6 +1,6 @@
{
"name": "whisper.cpp",
"version": "1.7.2",
"version": "1.6.2",
"description": "Whisper speech recognition",
"main": "whisper.js",
"scripts": {


@ -1,3 +0,0 @@
LICENSE
pkg/
lib/whisper.*


@ -1,169 +0,0 @@
whispercpp
==========
![whisper.cpp](https://user-images.githubusercontent.com/1991296/235238348-05d0f6a4-da44-4900-a1de-d0707e75b763.jpeg)
Ruby bindings for [whisper.cpp][], a C/C++ port of OpenAI's Whisper automatic speech recognition model.
Installation
------------
Install the gem and add to the application's Gemfile by executing:
$ bundle add whispercpp
If bundler is not being used to manage dependencies, install the gem by executing:
$ gem install whispercpp
Usage
-----
```ruby
require "whisper"
whisper = Whisper::Context.new("path/to/model.bin")
params = Whisper::Params.new
params.language = "en"
params.offset = 10_000
params.duration = 60_000
params.max_text_tokens = 300
params.translate = true
params.print_timestamps = false
params.initial_prompt = "Initial prompt here."
whisper.transcribe("path/to/audio.wav", params) do |whole_text|
puts whole_text
end
```
### Preparing model ###
Use the script to download model file(s):
```bash
git clone https://github.com/ggerganov/whisper.cpp.git
cd whisper.cpp
sh ./models/download-ggml-model.sh base.en
```
Several model types are available. See the [models][] page for details.
### Preparing audio file ###
Currently, whisper.cpp accepts only 16-bit WAV files.
### API ###
Once `Whisper::Context#transcribe` has been called, you can retrieve segments with `#each_segment`:
```ruby
def format_time(time_ms)
sec, decimal_part = time_ms.divmod(1000)
min, sec = sec.divmod(60)
hour, min = min.divmod(60)
"%02d:%02d:%02d.%03d" % [hour, min, sec, decimal_part]
end
whisper.transcribe("path/to/audio.wav", params)
whisper.each_segment.with_index do |segment, index|
line = "[%{nth}: %{st} --> %{ed}] %{text}" % {
nth: index + 1,
st: format_time(segment.start_time),
ed: format_time(segment.end_time),
text: segment.text
}
line << " (speaker turned)" if segment.speaker_next_turn?
puts line
end
```
You can also add a hook to the params that is called on each new segment:
```ruby
def format_time(time_ms)
sec, decimal_part = time_ms.divmod(1000)
min, sec = sec.divmod(60)
hour, min = min.divmod(60)
"%02d:%02d:%02d.%03d" % [hour, min, sec, decimal_part]
end
# Add hook before calling #transcribe
params.on_new_segment do |segment|
line = "[%{st} --> %{ed}] %{text}" % {
st: format_time(segment.start_time),
ed: format_time(segment.end_time),
text: segment.text
}
line << " (speaker turned)" if segment.speaker_next_turn?
puts line
end
whisper.transcribe("path/to/audio.wav", params)
```
You can inspect model information:
```ruby
whisper = Whisper::Context.new("path/to/model.bin")
model = whisper.model
model.n_vocab # => 51864
model.n_audio_ctx # => 1500
model.n_audio_state # => 512
model.n_audio_head # => 8
model.n_audio_layer # => 6
model.n_text_ctx # => 448
model.n_text_state # => 512
model.n_text_head # => 8
model.n_text_layer # => 6
model.n_mels # => 80
model.ftype # => 1
model.type # => "base"
```
You can set a log callback:
```ruby
prefix = "[MyApp] "
log_callback = ->(level, buffer, user_data) {
case level
when Whisper::LOG_LEVEL_NONE
puts "#{user_data}none: #{buffer}"
when Whisper::LOG_LEVEL_INFO
puts "#{user_data}info: #{buffer}"
when Whisper::LOG_LEVEL_WARN
puts "#{user_data}warn: #{buffer}"
when Whisper::LOG_LEVEL_ERROR
puts "#{user_data}error: #{buffer}"
when Whisper::LOG_LEVEL_DEBUG
puts "#{user_data}debug: #{buffer}"
when Whisper::LOG_LEVEL_CONT
puts "#{user_data}same to previous: #{buffer}"
end
}
Whisper.log_set log_callback, prefix
```
Using this feature, you can also suppress logging:
```ruby
Whisper.log_set ->(level, buffer, user_data) {
# do nothing
}, nil
Whisper::Context.new(MODEL)
```
License
-------
The same as [whisper.cpp][].
[whisper.cpp]: https://github.com/ggerganov/whisper.cpp
[models]: https://github.com/ggerganov/whisper.cpp/tree/master/models


@ -1,68 +1,12 @@
require 'rake/clean'
require "bundler/gem_tasks"
require "pathname"
require "yaml"
require "rake/testtask"
extsources = YAML.load_file("extsources.yaml")
SOURCES = FileList[]
extsources.each do |src|
basename = src.pathmap("%f")
dest = basename == "LICENSE" ? basename : basename.pathmap("ext/%f")
file src
file dest => src do |t|
cp t.source, t.name
require 'rubygems/package'
desc 'Build gem'
task :package do
spec_source = File.read File.join(File.dirname(__FILE__),'whispercpp.gemspec')
spec = nil
# see: http://gist.github.com/16215
Thread.new { spec = eval("#{spec_source}") }.join
spec.validate
Gem::Package.build(spec)
end
SOURCES.include dest
end
CLEAN.include SOURCES
CLEAN.include FileList[
"ext/*.o",
"ext/*.metal",
"ext/whisper.{so,bundle,dll}",
"ext/depend"
]
task build: FileList[
"ext/Makefile",
"ext/ruby_whisper.h",
"ext/ruby_whisper.cpp",
"whispercpp.gemspec",
]
directory "pkg"
CLOBBER.include "pkg"
TEST_MODEL = "../../models/ggml-base.en.bin"
LIB_NAME = "whisper".ext(RbConfig::CONFIG["DLEXT"])
SO_FILE = File.join("ext", LIB_NAME)
LIB_FILE = File.join("lib", LIB_NAME)
file "ext/Makefile" => ["ext/extconf.rb", "ext/ruby_whisper.h", "ext/ruby_whisper.cpp"] + SOURCES do |t|
Dir.chdir "ext" do
ruby "extconf.rb"
end
end
file SO_FILE => "ext/Makefile" do |t|
Dir.chdir "ext" do
sh "make"
end
end
CLEAN.include LIB_FILE
directory "lib"
file LIB_FILE => [SO_FILE, "lib"] do |t|
copy t.source, t.name
end
Rake::TestTask.new do |t|
t.test_files = FileList["tests/test_*.rb"]
end
task test: [TEST_MODEL, LIB_FILE]
file TEST_MODEL do
Dir.chdir "../.." do
sh "./models/download-ggml-model.sh base.en"
end
end


@ -3,33 +3,7 @@ ggml.c
ggml.h
ggml-alloc.c
ggml-alloc.h
ggml-aarch64.c
whisper.bundle
ggml-aarch64.h
ggml-backend.cpp
ggml-backend-impl.h
ggml-backend.c
ggml-backend.h
ggml-common.h
ggml-cpu-impl.h
ggml-metal.m
ggml-metal.metal
ggml-metal-embed.metal
ggml-blas.cpp
ggml-cuda.h
ggml-impl.h
ggml-kompute.h
ggml-metal.h
ggml-opencl.h
ggml-quants.c
ggml-quants.h
ggml-sycl.h
ggml-vulkan.h
ggml-blas.h
get-flags.mk
whisper.cpp
whisper.h
dr_wav.h
depend
whisper.bundle
whisper.so
whisper.dll


@ -1,10 +1,23 @@
require 'mkmf'
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.cpp')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper-mel.hpp')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-impl.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend-impl.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-common.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','examples','dr_wav.h')} .")
# need to use c++ compiler flags
$CXXFLAGS << ' -std=c++11'
$LDFLAGS << ' -lstdc++'
# Set to true when building binary gems
if enable_config('static-stdlib', false)
$LDFLAGS << ' -static-libgcc -static-libstdc++'
@ -15,180 +28,4 @@ if enable_config('march-tune-native', false)
$CXXFLAGS << ' -march=native -mtune=native'
end
if ENV['WHISPER_METAL']
$GGML_METAL ||= true
$DEPRECATE_WARNING ||= true
end
$UNAME_S = `uname -s`.chomp
$UNAME_P = `uname -p`.chomp
$UNAME_M = `uname -m`.chomp
if $UNAME_S == 'Darwin'
unless ENV['GGML_NO_METAL']
$GGML_METAL ||= true
end
$GGML_NO_OPENMP ||= true
end
if $GGML_METAL
$GGML_METAL_EMBED_LIBRARY = true
end
$MK_CPPFLAGS = ''
$MK_CFLAGS = '-std=c11 -fPIC'
$MK_CXXFLAGS = '-std=c++11 -fPIC'
$MK_NVCCFLAGS = '-std=c++11'
$MK_LDFLAGS = ''
$OBJ_GGML = []
$OBJ_WHISPER = []
$OBJ_COMMON = []
$OBJ_SDL = []
$MK_CPPFLAGS << ' -D_XOPEN_SOURCE=600'
if $UNAME_S == 'Linux'
$MK_CPPFLAGS << ' -D_GNU_SOURCE'
end
if $UNAME_S == 'Darwin'
$MK_CPPFLAGS << ' -D_DARWIN_C_SOURCE'
end
if ENV['WHISPER_DEBUG']
$MK_CFLAGS << ' -O0 -g'
$MK_CXXFLAGS << ' -O0 -g'
$MK_LDFLAGS << ' -g'
$MK_NVCCFLAGS << ' -O0 -g'
else
$MK_CPPFLAGS << ' -DNDEBUG'
$MK_CFLAGS << ' -O3'
$MK_CXXFLAGS << ' -O3'
$MK_NVCCFLAGS << ' -O3'
end
$WARN_FLAGS =
' -Wall' <<
' -Wextra' <<
' -Wpedantic' <<
' -Wcast-qual' <<
' -Wno-unused-function'
$MK_CFLAGS <<
$WARN_FLAGS <<
' -Wshadow' <<
' -Wstrict-prototypes' <<
' -Wpointer-arith' <<
' -Wmissing-prototypes' <<
' -Werror=implicit-int' <<
' -Werror=implicit-function-declaration'
$MK_CXXFLAGS <<
$WARN_FLAGS <<
' -Wmissing-declarations' <<
' -Wmissing-noreturn'
unless `#{cc_command} #{$LDFLAGS} -Wl,-v 2>&1`.chomp.include? 'dyld-1015.7'
$MK_CPPFLAGS << ' -DHAVE_BUGGY_APPLE_LINKER'
end
if %w[Linux Darwin FreeBSD NetBSD OpenBSD Haiku].include? $UNAME_S
$MK_CFLAGS << ' -pthread'
$MK_CXXFLAGS << ' -pthread'
end
unless $_WIN32
$DSO_EXT = '.so'
else
$DSO_EXT = '.dll'
end
unless ENV['RISCV']
if %w[x86_64 i686 amd64].include? $UNAME_M
$HOST_CXXFLAGS ||= ''
$MK_CFLAGS << ' -march=native -mtune=native'
$HOST_CXXFLAGS << ' -march=native -mtune=native'
end
if $UNAME_M.match? /aarch64.*/
$MK_CFLAGS << ' -mcpu=native'
$MK_CXXFLAGS << ' -mcpu=native'
end
else
$MK_CFLAGS << ' -march=rv64gcv -mabi=lp64d'
$MK_CXXFLAGS << ' -march=rv64gcv -mabi=lp64d'
end
unless ENV['GGML_NO_ACCELERATE']
if $UNAME_S == 'Darwin'
$MK_CPPFLAGS << ' -DGGML_USE_ACCELERATE -DGGML_USE_BLAS'
$MK_CPPFLAGS << ' -DACCELERATE_NEW_LAPACK'
$MK_CPPFLAGS << ' -DACCELERATE_LAPACK_ILP64'
$MK_LDFLAGS << ' -framework Accelerate'
$OBJ_GGML << 'ggml-blas.o'
end
end
if ENV['GGML_OPENBLAS']
$MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas`.chomp}"
$MK_CFLAGS << " #{`pkg-config --cflags-only-other openblas)`.chomp}"
$MK_LDFLAGS << " #{`pkg-config --libs openblas`}"
$OBJ_GGML << 'ggml-blas.o'
end
if ENV['GGML_OPENBLAS64']
$MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas64`.chomp}"
$MK_CFLAGS << " #{`pkg-config --cflags-only-other openblas64)`.chomp}"
$MK_LDFLAGS << " #{`pkg-config --libs openblas64`}"
$OBJ_GGML << 'ggml-blas.o'
end
if $GGML_METAL
$MK_CPPFLAGS << ' -DGGML_USE_METAL'
$MK_LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
$OBJ_GGML << 'ggml-metal.o'
if ENV['GGML_METAL_NDEBUG']
$MK_CPPFLAGS << ' -DGGML_METAL_NDEBUG'
end
if $GGML_METAL_EMBED_LIBRARY
$MK_CPPFLAGS << ' -DGGML_METAL_EMBED_LIBRARY'
$OBJ_GGML << 'ggml-metal-embed.o'
end
end
$OBJ_GGML <<
'ggml.o' <<
'ggml-cpu.o' <<
'ggml-alloc.o' <<
'ggml-backend.o' <<
'ggml-quants.o' <<
'ggml-aarch64.o'
$OBJ_WHISPER <<
'whisper.o'
$objs = $OBJ_GGML + $OBJ_WHISPER + $OBJ_COMMON + $OBJ_SDL
$objs << "ruby_whisper.o"
$CPPFLAGS = "#{$MK_CPPFLAGS} #{$CPPFLAGS}"
$CFLAGS = "#{$CPPFLAGS} #{$MK_CFLAGS} #{$GF_CFLAGS} #{$CFLAGS}"
$BASE_CXXFLAGS = "#{$MK_CXXFLAGS} #{$CXXFLAGS}"
$CXXFLAGS = "#{$BASE_CXXFLAGS} #{$HOST_CXXFLAGS} #{$GF_CXXFLAGS} #{$CPPFLAGS}"
$NVCCFLAGS = "#{$MK_NVCCFLAGS} #{$NVCCFLAGS}"
$LDFLAGS = "#{$MK_LDFLAGS} #{$LDFLAGS}"
create_makefile('whisper')
File.open 'Makefile', 'a' do |file|
file.puts 'include get-flags.mk'
if $GGML_METAL
if $GGML_METAL_EMBED_LIBRARY
file.puts 'include metal-embed.mk'
end
end
end


@ -0,0 +1,141 @@
#pragma once
// ggml-backend internal header
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
//
// Backend buffer
//
// buffer type
typedef void * ggml_backend_buffer_type_context_t;
struct ggml_backend_buffer_type_i {
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
// check if tensor data is in host memory
// should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
};
struct ggml_backend_buffer_type {
struct ggml_backend_buffer_type_i iface;
ggml_backend_buffer_type_context_t context;
};
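For orientation, here is a minimal sketch of how a backend could populate this vtable; the `example_*` names are hypothetical stand-ins, not part of ggml, and the NULL slots mark callbacks a real backend would supply:

```c
GGML_CALL static const char * example_buft_get_name(ggml_backend_buffer_type_t buft) {
    (void) buft;
    return "EXAMPLE";
}

GGML_CALL static bool example_buft_is_host(ggml_backend_buffer_type_t buft) {
    (void) buft;
    return true; // buffers in this sketch are plain host memory
}

static struct ggml_backend_buffer_type_i example_buft_iface = {
    /* .get_name         = */ example_buft_get_name,
    /* .alloc_buffer     = */ NULL, // a real backend must provide this
    /* .get_alignment    = */ NULL, // a real backend must provide this
    /* .get_max_size     = */ NULL, // may be NULL in some backends
    /* .get_alloc_size   = */ NULL, // may be NULL in some backends
    /* .supports_backend = */ NULL, // a real backend must provide this
    /* .is_host          = */ example_buft_is_host,
};
```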
// buffer
typedef void * ggml_backend_buffer_context_t;
struct ggml_backend_buffer_i {
const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
void (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
void (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value);
void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
};
struct ggml_backend_buffer {
struct ggml_backend_buffer_i iface;
ggml_backend_buffer_type_t buft;
ggml_backend_buffer_context_t context;
size_t size;
enum ggml_backend_buffer_usage usage;
};
GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
ggml_backend_buffer_type_t buft,
struct ggml_backend_buffer_i iface,
ggml_backend_buffer_context_t context,
size_t size);
// do not use directly, use ggml_backend_tensor_copy instead
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
// buffer that contains a collection of buffers
GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
//
// Backend
//
typedef void * ggml_backend_context_t;
struct ggml_backend_i {
const char * (*GGML_CALL get_name)(ggml_backend_t backend);
void (*GGML_CALL free)(ggml_backend_t backend);
// buffer allocation
ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
// (optional) asynchronous tensor data access
void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
// (optional) complete all pending operations
void (*GGML_CALL synchronize)(ggml_backend_t backend);
// compute graph with a plan (not used currently)
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// compute graph with a plan
enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// compute graph without a plan (async)
enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
// check if the backend supports an operation
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
// these should be expensive operations with large batch sizes that may benefit from running on this backend
// even if the weight has to be copied from the CPU temporarily
bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
// (optional) event synchronization
ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
void (*GGML_CALL event_free) (ggml_backend_event_t event);
void (*GGML_CALL event_record) (ggml_backend_event_t event);
void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
};
struct ggml_backend {
ggml_guid_t guid;
struct ggml_backend_i iface;
ggml_backend_context_t context;
};
struct ggml_backend_event {
ggml_backend_t backend;
void * context;
};
//
// Backend registry
//
typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large


@ -0,0 +1,233 @@
#pragma once
#include "ggml.h"
#include "ggml-alloc.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend_event * ggml_backend_event_t;
typedef struct ggml_backend * ggml_backend_t;
typedef void * ggml_backend_graph_plan_t;
//
// Backend buffer
//
// buffer type
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
// buffer
enum ggml_backend_buffer_usage {
GGML_BACKEND_BUFFER_USAGE_ANY = 0,
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
};
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
//
// Backend
//
GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
GGML_API const char * ggml_backend_name(ggml_backend_t backend);
GGML_API void ggml_backend_free(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
// asynchronous copy
// the copy is performed after all the currently queued operations in backend_src
// backend_dst will wait for the copy to complete before performing other operations
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
// events
GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
//
// CPU backend
//
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
// Create a backend buffer from an existing pointer
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
#ifdef GGML_USE_CPU_HBM
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif
//
// Backend registry
//
// The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
GGML_API size_t ggml_backend_reg_get_count(void);
GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
GGML_API const char * ggml_backend_reg_get_name(size_t i);
GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size);
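As a rough sketch of how the registry is meant to be used (error handling omitted; `example_list_backends` is an illustrative name):

```c
#include <stdio.h>

// Enumerate registered backends and initialize one by name.
static void example_list_backends(void) {
    for (size_t i = 0; i < ggml_backend_reg_get_count(); i++) {
        printf("backend %zu: %s\n", i, ggml_backend_reg_get_name(i));
    }
    // "CPU" is the conventional name of the CPU backend; the string
    // may also carry backend-specific params as "name:params"
    ggml_backend_t backend = ggml_backend_reg_init_backend_from_str("CPU");
    if (backend != NULL) {
        ggml_backend_free(backend);
    }
}
```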
//
// Backend scheduler
//
// The backend scheduler allows for multiple backends to be used together
// Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
// The backends are selected based on:
// - the backend that supports the operation
// - the location of the pre-allocated tensors (e.g. the weights)
/*
Example usage:
// operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
// preferably to run on the same backend as the buffer
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
// initialize buffers from a max size graph (optional)
reserve_graph = build_graph(sched, max_batch_size);
// manually assign nodes to a backend (optional, should not be needed in most cases)
struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
ggml_backend_sched_reserve(sched, reserve_graph);
// compute
graph = build_graph(sched);
ggml_backend_sched_graph_compute(sched, graph);
// if there are graph inputs:
ggml_backend_sched_reset(sched);
ggml_backend_sched_alloc_graph(sched, graph);
ggml_backend_tensor_set(input_tensor, ...);
ggml_backend_sched_graph_compute(sched, graph);
}
*/
struct ggml_backend_sched;
typedef struct ggml_backend_sched * ggml_backend_sched_t;
// when ask == true, the scheduler wants to know if the user wants to observe this node
// this allows the scheduler to batch nodes together in order to evaluate them in a single call
//
// when ask == false, the scheduler is passing the node tensor to the user for observation
// if the user returns false, the scheduler will cancel the graph compute
//
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
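A sketch of a callback honoring this two-phase protocol (the name and the MUL_MAT filter are illustrative only); it would be installed with `ggml_backend_sched_set_eval_callback`, declared below:

```c
#include <stdio.h>

static bool example_eval_cb(struct ggml_tensor * t, bool ask, void * user_data) {
    (void) user_data;
    if (ask) {
        // phase 1: say whether this node should be observed
        return t->op == GGML_OP_MUL_MAT;
    }
    // phase 2: the node has been computed; returning false cancels the graph
    fprintf(stderr, "observed node %s\n", t->name);
    return true;
}
```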
// Initialize a backend scheduler
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
// Get the number of splits of the last graph
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
// Allocate and compute graph on the backend scheduler
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
// Reset all assignments and allocators - must be called before changing the node backends
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
// Set a callback to be called for each resulting node during graph compute
GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
//
// Utils
//
struct ggml_backend_graph_copy {
ggml_backend_buffer_t buffer;
struct ggml_context * ctx_allocated;
struct ggml_context * ctx_unallocated;
struct ggml_cgraph * graph;
};
// Copy a graph to a different backend
GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
// Compare the output of two backends
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
// Tensor initialization
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large


@ -0,0 +1,43 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef GGML_USE_HIPBLAS
#define GGML_CUDA_NAME "ROCm"
#define GGML_CUBLAS_NAME "hipBLAS"
#else
#define GGML_CUDA_NAME "CUDA"
#define GGML_CUBLAS_NAME "cuBLAS"
#endif
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_CUDA_MAX_DEVICES 16
// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
// device buffer
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
// split tensor buffer that splits matrices by rows across multiple devices
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
#ifdef __cplusplus
}
#endif
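A short sketch probing the API above (assumes a CUDA-enabled build; error handling omitted):

```c
#include <stdio.h>

// List CUDA devices, then create and free a backend on device 0.
static void example_probe_cuda(void) {
    const int n = ggml_backend_cuda_get_device_count();
    for (int i = 0; i < n; i++) {
        char desc[128];
        size_t free_mem, total_mem;
        ggml_backend_cuda_get_device_description(i, desc, sizeof(desc));
        ggml_backend_cuda_get_device_memory(i, &free_mem, &total_mem);
        printf("device %d: %s, %zu of %zu bytes free\n", i, desc, free_mem, total_mem);
    }
    if (n > 0) {
        ggml_backend_t backend = ggml_backend_cuda_init(0);
        if (backend != NULL) {
            ggml_backend_free(backend);
        }
    }
}
```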


@ -0,0 +1,272 @@
#pragma once
#include "ggml.h"
// GGML internal header
#include <assert.h>
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
#include <stddef.h>
#include <stdbool.h>
#include <string.h> // memcpy
#include <math.h> // fabsf
#ifdef __cplusplus
extern "C" {
#endif
// static_assert should be a #define, but if it's not,
// fall back to the _Static_assert C11 keyword.
// if C99 - static_assert is noop
// ref: https://stackoverflow.com/a/53923785/4039976
#ifndef __cplusplus
#ifndef static_assert
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg)
#else
#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
#endif
#endif
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
#ifndef __FMA__
#define __FMA__
#endif
#ifndef __F16C__
#define __F16C__
#endif
#endif
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
#ifndef __SSE3__
#define __SSE3__
#endif
#ifndef __SSSE3__
#define __SSSE3__
#endif
#endif
// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
#if defined(__ARM_NEON) && !defined(_MSC_VER)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#include <arm_neon.h>
typedef __fp16 ggml_fp16_internal_t;
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
ggml_fp16_internal_t tmp;
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
return (float)tmp;
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
ggml_fp16_t res;
ggml_fp16_internal_t tmp = f;
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
return res;
}
#else
typedef uint16_t ggml_fp16_internal_t;
#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#else
#ifdef __POWER9_VECTOR__
#include <altivec.h>
#undef bool
#define bool _Bool
#else
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
#if !defined(__riscv)
#include <immintrin.h>
#endif
#endif
#endif
#endif
#endif
#ifdef __riscv_v_intrinsic
#include <riscv_vector.h>
#endif
#ifdef __F16C__
#ifdef _MSC_VER
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
#else
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
#endif
#elif defined(__POWER9_VECTOR__)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
/* the inline asm below is about 12% faster than the lookup method */
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
register float f;
register double d;
__asm__(
"mtfprd %0,%2\n"
"xscvhpdp %0,%0\n"
"frsp %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=f"(f):
/* in */ "r"(h));
return f;
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
register double d;
register ggml_fp16_t r;
__asm__( /* xscvdphp can work on double or single precision */
"xscvdphp %0,%2\n"
"mffprd %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=r"(r):
/* in */ "f"(f));
return r;
}
#else
// FP16 <-> FP32
// ref: https://github.com/Maratyszcza/FP16
static inline float fp32_from_bits(uint32_t w) {
union {
uint32_t as_bits;
float as_value;
} fp32;
fp32.as_bits = w;
return fp32.as_value;
}
static inline uint32_t fp32_to_bits(float f) {
union {
float as_value;
uint32_t as_bits;
} fp32;
fp32.as_value = f;
return fp32.as_bits;
}
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
const uint32_t w = (uint32_t) h << 16;
const uint32_t sign = w & UINT32_C(0x80000000);
const uint32_t two_w = w + w;
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float exp_scale = 0x1.0p-112f;
#else
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
const uint32_t magic_mask = UINT32_C(126) << 23;
const float magic_bias = 0.5f;
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
const uint32_t result = sign |
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
return fp32_from_bits(result);
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float scale_to_inf = 0x1.0p+112f;
const float scale_to_zero = 0x1.0p-110f;
#else
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
const uint32_t w = fp32_to_bits(f);
const uint32_t shl1_w = w + w;
const uint32_t sign = w & UINT32_C(0x80000000);
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
if (bias < UINT32_C(0x71000000)) {
bias = UINT32_C(0x71000000);
}
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
const uint32_t bits = fp32_to_bits(base);
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
const uint32_t nonsign = exp_bits + mantissa_bits;
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#endif // __F16C__
#endif // __ARM_NEON
// precomputed f32 table for f16 (256 KB)
// defined in ggml.c, initialized in ggml_init()
extern float ggml_table_f32_f16[1 << 16];
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
#if !defined(GGML_FP16_TO_FP32)
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
uint16_t s;
memcpy(&s, &f, sizeof(uint16_t));
return ggml_table_f32_f16[s];
}
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#endif
#if !defined(GGML_FP32_TO_FP16)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif
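Whichever path was selected above, the macros give a uniform conversion API; a trivial round-trip sketch (on the lookup-table path, `ggml_init()` must have run so `ggml_table_f32_f16` is populated):

```c
// Round-trip a float through half precision using the macros above.
static float example_fp16_round_trip(float x) {
    const ggml_fp16_t h = GGML_FP32_TO_FP16(x);
    return GGML_FP16_TO_FP32(h); // equal to x up to fp16 precision
}
```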
#define GGML_HASHTABLE_FULL ((size_t)-1)
#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
struct ggml_hash_set ggml_hash_set_new(size_t size);
bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
size_t ggml_hash_find (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml_tensor * key);
// return index, asserts if table is full
size_t ggml_hash_find_or_insert( struct ggml_hash_set hash_set, struct ggml_tensor * key);
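A sketch of the intended usage, deduplicating nodes while walking a graph; capacity handling is simplified, and freeing the keys array directly mirrors what callers do in this version:

```c
#include <stdlib.h>

// Visit each distinct node of a graph exactly once.
static void example_visit_unique(struct ggml_cgraph * graph) {
    struct ggml_hash_set visited = ggml_hash_set_new(graph->n_nodes);
    for (int i = 0; i < graph->n_nodes; i++) {
        struct ggml_tensor * node = graph->nodes[i];
        if (ggml_hash_insert(visited, node) == GGML_HASHTABLE_ALREADY_EXISTS) {
            continue; // already seen
        }
        // ... process node ...
    }
    free(visited.keys); // the set is a single caller-owned keys array here
}
```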
#ifdef __cplusplus
}
#endif


@ -0,0 +1,46 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
struct ggml_vk_device {
int index;
int type; // same as VkPhysicalDeviceType
size_t heapSize;
const char * name;
const char * vendor;
int subgroupSize;
uint64_t bufferAlignment;
uint64_t maxAlloc;
};
struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
bool ggml_vk_has_vulkan(void);
bool ggml_vk_has_device(void);
struct ggml_vk_device ggml_vk_current_device(void);
//
// backend API
//
// forward declaration
typedef struct ggml_backend * ggml_backend_t;
GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
#ifdef __cplusplus
}
#endif
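A sketch of device discovery with this API (ownership of the returned list is backend-defined and not shown):

```c
#include <stdio.h>

// Print the available Vulkan devices, then initialize the first one.
static void example_list_vk_devices(void) {
    size_t count = 0;
    struct ggml_vk_device * devs = ggml_vk_available_devices(0 /* no memory floor */, &count);
    for (size_t i = 0; i < count; i++) {
        printf("%zu: %s (%s), heap %zu bytes\n",
               i, devs[i].name, devs[i].vendor, devs[i].heapSize);
    }
    if (count > 0) {
        ggml_backend_t backend = ggml_backend_kompute_init(devs[0].index);
        if (backend != NULL) {
            ggml_backend_free(backend);
        }
    }
}
```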


@ -0,0 +1,66 @@
// An interface allowing to compute ggml_cgraph with Metal
//
// This is a fully functional interface that extends ggml with GPU support for Apple devices.
// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
//
// How does it work?
//
// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
//
// You only need to make sure that all memory buffers that you used during the graph creation
// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
// used during the graph evaluation to determine the arguments of the compute kernels.
//
// Synchronization between device and host memory (for example for input and output tensors)
// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
//
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#include <stddef.h>
#include <stdbool.h>
// max memory buffers that can be mapped to the device
#define GGML_METAL_MAX_BUFFERS 64
struct ggml_tensor;
struct ggml_cgraph;
#ifdef __cplusplus
extern "C" {
#endif
//
// backend API
// user-code should use only these functions
//
GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
GGML_API ggml_backend_t ggml_backend_metal_init(void);
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
// helper to check if the device supports a specific family
// ideally, the user code should be doing these checks
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
#ifdef __cplusplus
}
#endif
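Putting the comment at the top of this header into practice, a minimal sketch; it assumes the graph's tensors already live in Metal-visible buffers (e.g. allocated via `ggml_backend_alloc_ctx_tensors` from ggml-alloc.h):

```c
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-metal.h"

// Evaluate an existing cgraph on the Metal backend.
static enum ggml_status example_run_on_metal(struct ggml_cgraph * graph) {
    ggml_backend_t metal = ggml_backend_metal_init();
    if (metal == NULL) {
        return GGML_STATUS_FAILED; // e.g. no Metal device available
    }
    const enum ggml_status st = ggml_backend_graph_compute(metal, graph);
    ggml_backend_free(metal);
    return st;
}
```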


@ -0,0 +1,36 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
GGML_API void ggml_cl_init(void);
GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
// GGML_API void * ggml_cl_host_malloc(size_t size);
// GGML_API void ggml_cl_host_free(void * ptr);
GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
// backend API
// GGML_API ggml_backend_t ggml_backend_opencl_init(void);
// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
// GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large


@ -0,0 +1,133 @@
#pragma once
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "ggml.h"
// GGML internal header
#ifdef __cplusplus
extern "C" {
#endif
// Quantization
void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
// Dequantization
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
// Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
void iq2xs_init_impl(enum ggml_type type);
void iq2xs_free_impl(enum ggml_type type);
void iq3xs_init_impl(int grid_size);
void iq3xs_free_impl(int grid_size);
#ifdef __cplusplus
}
#endif
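
As a quick orientation for the quantization API above, here is a minimal round-trip sketch. It assumes the q8_0 block size of 32 elements and the block_q8_0 layout from ggml-common.h; the include path is illustrative:

#include <vector>
#include "ggml-quants.h" // assumed header carrying the declarations above

int main() {
    const int64_t k = 64;                    // row length; must be a multiple of the q8_0 block size (32)
    std::vector<float>      src(k, 0.5f);    // input row
    std::vector<block_q8_0> q(k / 32);       // quantized blocks
    std::vector<float>      dst(k);          // dequantized output

    quantize_row_q8_0(src.data(), q.data(), k);   // float -> q8_0 (destination passed as void *)
    dequantize_row_q8_0(q.data(), dst.data(), k); // q8_0 -> float; lossy, so dst only approximates src
    return 0;
}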

View File

@ -0,0 +1,49 @@
//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_SYCL_MAX_DEVICES 48
#define GGML_SYCL_NAME "SYCL"
// backend API
GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
// device buffer
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
// split tensor buffer that splits matrices by rows across multiple devices
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
// TODO: these are temporary
// ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index);
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();
// SYCL doesn't support registering host memory, keep here for reference
// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
// GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
#ifdef __cplusplus
}
#endif
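
A short usage sketch for the SYCL entry points declared above, enumerating devices and initializing a backend on device 0 (ggml_backend_free comes from ggml-backend.h; error handling kept minimal):

#include <cstdio>
#include "ggml-sycl.h"

int main(void) {
    const int n_dev = ggml_backend_sycl_get_device_count();
    for (int i = 0; i < n_dev; ++i) {
        char   desc[256];
        size_t mem_free = 0, mem_total = 0;
        ggml_sycl_get_device_description(i, desc, sizeof(desc));
        ggml_backend_sycl_get_device_memory(i, &mem_free, &mem_total);
        printf("SYCL device %d: %s (%zu / %zu bytes free)\n", i, desc, mem_free, mem_total);
    }
    ggml_backend_t backend = ggml_backend_sycl_init(0); // device 0
    if (backend == NULL) {
        fprintf(stderr, "ggml_backend_sycl_init(0) failed\n");
        return 1;
    }
    ggml_backend_free(backend);
    return 0;
}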

View File

@ -0,0 +1,29 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_VK_NAME "Vulkan"
#define GGML_VK_MAX_DEVICES 16
GGML_API void ggml_vk_instance_init(void);
// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
GGML_API GGML_CALL int ggml_backend_vk_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
#ifdef __cplusplus
}
#endif
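
The equivalent minimal sketch for the Vulkan API above:

#include <cstdio>
#include "ggml-vulkan.h"

int main(void) {
    if (ggml_backend_vk_get_device_count() == 0) {
        fprintf(stderr, "no Vulkan devices found\n");
        return 1;
    }
    ggml_backend_t backend = ggml_backend_vk_init(0);  // device 0
    if (backend != NULL && ggml_backend_is_vk(backend)) {
        // pinned host buffers speed up CPU<->GPU copies, as noted above
        ggml_backend_buffer_type_t host_bt = ggml_backend_vk_host_buffer_type();
        (void) host_bt;
        ggml_backend_free(backend);                    // from ggml-backend.h
    }
    return 0;
}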

View File

@ -1,14 +0,0 @@
ggml-metal-embed.o: \
ggml-metal.metal \
ggml-common.h
@echo "Embedding Metal library"
@sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal
$(eval TEMP_ASSEMBLY=$(shell mktemp))
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
@echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
@$(AS) $(TEMP_ASSEMBLY) -o $@
@rm -f ${TEMP_ASSEMBLY}
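
The removed rule above embedded the preprocessed Metal source directly into the binary: the generated assembly stub `.incbin`s the file between the _ggml_metallib_start and _ggml_metallib_end symbols. On the C side those symbols bracket the embedded bytes roughly as follows (a sketch; the actual consumer lives in ggml-metal.m):

// on Darwin the assembly symbol _ggml_metallib_start maps to the plain C name
extern const char ggml_metallib_start[];
extern const char ggml_metallib_end[];

static const char * ggml_metallib_data(void) { return ggml_metallib_start; }
static size_t       ggml_metallib_size(void) { return (size_t)(ggml_metallib_end - ggml_metallib_start); }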

File diff suppressed because it is too large

View File

@ -3,13 +3,6 @@
#include "whisper.h" #include "whisper.h"
typedef struct {
VALUE *context;
VALUE user_data;
VALUE callback;
VALUE callbacks;
} ruby_whisper_callback_container;
typedef struct { typedef struct {
struct whisper_context *context; struct whisper_context *context;
} ruby_whisper; } ruby_whisper;
@ -17,9 +10,6 @@ typedef struct {
typedef struct { typedef struct {
struct whisper_full_params params; struct whisper_full_params params;
bool diarize; bool diarize;
ruby_whisper_callback_container *new_segment_callback_container;
ruby_whisper_callback_container *progress_callback_container;
ruby_whisper_callback_container *abort_callback_container;
} ruby_whisper_params; } ruby_whisper_params;
#endif #endif
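
The ruby_whisper_callback_container in this hunk stores Ruby VALUEs for each callback; for those objects to survive garbage collection, the owning object's mark function has to visit them. A hypothetical sketch of that pattern in C (the function names are illustrative, not taken from this diff):

#include <ruby.h>

// hypothetical: keep the Ruby objects held by one container alive across GC
static void mark_callback_container(ruby_whisper_callback_container * c) {
    if (c == NULL) return;
    rb_gc_mark(c->user_data);
    rb_gc_mark(c->callback);
    rb_gc_mark(c->callbacks);
}

static void rb_whisper_params_mark(ruby_whisper_params * rwp) {
    mark_callback_container(rwp->new_segment_callback_container);
    mark_callback_container(rwp->progress_callback_container);
    mark_callback_container(rwp->abort_callback_container);
}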

View File

@ -1,31 +0,0 @@
---
- ../../src/whisper.cpp
- ../../include/whisper.h
- ../../ggml/src/ggml.c
- ../../ggml/src/ggml-cpu.c
- ../../ggml/src/ggml-impl.h
- ../../ggml/src/ggml-aarch64.h
- ../../ggml/src/ggml-aarch64.c
- ../../ggml/src/ggml-alloc.c
- ../../ggml/src/ggml-backend-impl.h
- ../../ggml/src/ggml-backend.cpp
- ../../ggml/src/ggml-common.h
- ../../ggml/src/ggml-quants.h
- ../../ggml/src/ggml-quants.c
- ../../ggml/src/ggml-cpu-impl.h
- ../../ggml/src/ggml-metal.m
- ../../ggml/src/ggml-metal.metal
- ../../ggml/src/ggml-blas.cpp
- ../../ggml/include/ggml.h
- ../../ggml/include/ggml-alloc.h
- ../../ggml/include/ggml-backend.h
- ../../ggml/include/ggml-cpu.h
- ../../ggml/include/ggml-cuda.h
- ../../ggml/include/ggml-kompute.h
- ../../ggml/include/ggml-metal.h
- ../../ggml/include/ggml-sycl.h
- ../../ggml/include/ggml-vulkan.h
- ../../ggml/include/ggml-blas.h
- ../../scripts/get-flags.mk
- ../../examples/dr_wav.h
- ../../LICENSE

View File

@ -1,7 +0,0 @@
require "test/unit"
require "whisper"
class TestBase < Test::Unit::TestCase
MODEL = File.join(__dir__, "..", "..", "..", "models", "ggml-base.en.bin")
AUDIO = File.join(__dir__, "..", "..", "..", "samples", "jfk.wav")
end

View File

@ -1,163 +0,0 @@
require "test/unit"
require "whisper"
class TestCallback < Test::Unit::TestCase
TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
def setup
GC.start
@params = Whisper::Params.new
@whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
@audio = File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav')
end
def test_new_segment_callback
@params.new_segment_callback = ->(context, state, n_new, user_data) {
assert_kind_of Integer, n_new
assert n_new > 0
assert_same @whisper, context
n_segments = context.full_n_segments
n_new.times do |i|
i_segment = n_segments - 1 + i
start_time = context.full_get_segment_t0(i_segment) * 10
end_time = context.full_get_segment_t1(i_segment) * 10
text = context.full_get_segment_text(i_segment)
assert_kind_of Integer, start_time
assert start_time >= 0
assert_kind_of Integer, end_time
assert end_time > 0
assert_match /ask not what your country can do for you, ask what you can do for your country/, text if i_segment == 0
end
}
@whisper.transcribe(@audio, @params)
end
def test_new_segment_callback_closure
search_word = "what"
@params.new_segment_callback = ->(context, state, n_new, user_data) {
n_segments = context.full_n_segments
n_new.times do |i|
i_segment = n_segments - 1 + i
text = context.full_get_segment_text(i_segment)
if text.include?(search_word)
t0 = context.full_get_segment_t0(i_segment)
t1 = context.full_get_segment_t1(i_segment)
raise "search word '#{search_word}' found at between #{t0} and #{t1}"
end
end
}
assert_raise RuntimeError do
@whisper.transcribe(@audio, @params)
end
end
def test_new_segment_callback_user_data
udata = Object.new
@params.new_segment_callback_user_data = udata
@params.new_segment_callback = ->(context, state, n_new, user_data) {
assert_same udata, user_data
}
@whisper.transcribe(@audio, @params)
end
def test_new_segment_callback_user_data_gc
@params.new_segment_callback_user_data = "My user data"
@params.new_segment_callback = ->(context, state, n_new, user_data) {
assert_equal "My user data", user_data
}
GC.start
assert_same @whisper, @whisper.transcribe(@audio, @params)
end
def test_progress_callback
first = nil
last = nil
@params.progress_callback = ->(context, state, progress, user_data) {
assert_kind_of Integer, progress
assert 0 <= progress && progress <= 100
assert_same @whisper, context
first = progress if first.nil?
last = progress
}
@whisper.transcribe(@audio, @params)
assert_equal 0, first
assert_equal 100, last
end
def test_progress_callback_user_data
udata = Object.new
@params.progress_callback_user_data = udata
@params.progress_callback = ->(context, state, n_new, user_data) {
assert_same udata, user_data
}
@whisper.transcribe(@audio, @params)
end
def test_on_progress
first = nil
last = nil
@params.on_progress do |progress|
assert_kind_of Integer, progress
assert 0 <= progress && progress <= 100
first = progress if first.nil?
last = progress
end
@whisper.transcribe(@audio, @params)
assert_equal 0, first
assert_equal 100, last
end
def test_abort_callback
i = 0
@params.abort_callback = ->(user_data) {
assert_nil user_data
i += 1
return false
}
@whisper.transcribe(@audio, @params)
assert i > 0
end
def test_abort_callback_abort
i = 0
@params.abort_callback = ->(user_data) {
i += 1
return i == 3
}
@whisper.transcribe(@audio, @params)
assert_equal 3, i
end
def test_abort_callback_user_data
udata = Object.new
@params.abort_callback_user_data = udata
yielded = nil
@params.abort_callback = ->(user_data) {
yielded = user_data
}
@whisper.transcribe(@audio, @params)
assert_same udata, yielded
end
def test_abort_on
do_abort = false
aborted_from_callback = false
@params.on_new_segment do |segment|
do_abort = true if segment.text.match? /ask/
end
i = 0
@params.abort_on do
i += 1
do_abort
end
@whisper.transcribe(@audio, @params)
assert i > 0
end
end

View File

@ -1,44 +0,0 @@
require_relative "helper"
class TestModel < TestBase
def test_model
whisper = Whisper::Context.new(MODEL)
assert_instance_of Whisper::Model, whisper.model
end
def test_attributes
whisper = Whisper::Context.new(MODEL)
model = whisper.model
assert_equal 51864, model.n_vocab
assert_equal 1500, model.n_audio_ctx
assert_equal 512, model.n_audio_state
assert_equal 8, model.n_audio_head
assert_equal 6, model.n_audio_layer
assert_equal 448, model.n_text_ctx
assert_equal 512, model.n_text_state
assert_equal 8, model.n_text_head
assert_equal 6, model.n_text_layer
assert_equal 80, model.n_mels
assert_equal 1, model.ftype
assert_equal "base", model.type
end
def test_gc
model = Whisper::Context.new(MODEL).model
GC.start
assert_equal 51864, model.n_vocab
assert_equal 1500, model.n_audio_ctx
assert_equal 512, model.n_audio_state
assert_equal 8, model.n_audio_head
assert_equal 6, model.n_audio_layer
assert_equal 448, model.n_text_ctx
assert_equal 512, model.n_text_state
assert_equal 8, model.n_text_head
assert_equal 6, model.n_text_layer
assert_equal 80, model.n_mels
assert_equal 1, model.ftype
assert_equal "base", model.type
end
end

View File

@ -1,31 +0,0 @@
require_relative "helper"
require 'tempfile'
require 'tmpdir'
require 'shellwords'
class TestPackage < TestBase
def test_build
Tempfile.create do |file|
assert system("gem", "build", "whispercpp.gemspec", "--output", file.to_path.shellescape, exception: true)
assert file.size > 0
assert_path_exist file.to_path
end
end
sub_test_case "Building binary on installation" do
def setup
system "rake", "build", exception: true
end
def test_install
match_data = `rake -Tbuild`.match(/(whispercpp-(.+)\.gem)/)
filename = match_data[1]
version = match_data[2]
basename = "whisper.#{RbConfig::CONFIG["DLEXT"]}"
Dir.mktmpdir do |dir|
system "gem", "install", "--install-dir", dir.shellescape, "pkg/#{filename.shellescape}", exception: true
assert_path_exist File.join(dir, "gems/whispercpp-#{version}/lib", basename)
end
end
end
end

View File

@ -1,154 +0,0 @@
require_relative "helper"
class TestParams < TestBase
def setup
@params = Whisper::Params.new
end
def test_language
@params.language = "en"
assert_equal @params.language, "en"
@params.language = "auto"
assert_equal @params.language, "auto"
end
def test_offset
@params.offset = 10_000
assert_equal @params.offset, 10_000
@params.offset = 0
assert_equal @params.offset, 0
end
def test_duration
@params.duration = 60_000
assert_equal @params.duration, 60_000
@params.duration = 0
assert_equal @params.duration, 0
end
def test_max_text_tokens
@params.max_text_tokens = 300
assert_equal @params.max_text_tokens, 300
@params.max_text_tokens = 0
assert_equal @params.max_text_tokens, 0
end
def test_translate
@params.translate = true
assert @params.translate
@params.translate = false
assert !@params.translate
end
def test_no_context
@params.no_context = true
assert @params.no_context
@params.no_context = false
assert !@params.no_context
end
def test_single_segment
@params.single_segment = true
assert @params.single_segment
@params.single_segment = false
assert !@params.single_segment
end
def test_print_special
@params.print_special = true
assert @params.print_special
@params.print_special = false
assert !@params.print_special
end
def test_print_progress
@params.print_progress = true
assert @params.print_progress
@params.print_progress = false
assert !@params.print_progress
end
def test_print_realtime
@params.print_realtime = true
assert @params.print_realtime
@params.print_realtime = false
assert !@params.print_realtime
end
def test_print_timestamps
@params.print_timestamps = true
assert @params.print_timestamps
@params.print_timestamps = false
assert !@params.print_timestamps
end
def test_suppress_blank
@params.suppress_blank = true
assert @params.suppress_blank
@params.suppress_blank = false
assert !@params.suppress_blank
end
def test_suppress_non_speech_tokens
@params.suppress_non_speech_tokens = true
assert @params.suppress_non_speech_tokens
@params.suppress_non_speech_tokens = false
assert !@params.suppress_non_speech_tokens
end
def test_token_timestamps
@params.token_timestamps = true
assert @params.token_timestamps
@params.token_timestamps = false
assert !@params.token_timestamps
end
def test_split_on_word
@params.split_on_word = true
assert @params.split_on_word
@params.split_on_word = false
assert !@params.split_on_word
end
def test_initial_prompt
assert_nil @params.initial_prompt
@params.initial_prompt = "You are a polite person."
assert_equal "You are a polite person.", @params.initial_prompt
end
def test_temperature
assert_equal 0.0, @params.temperature
@params.temperature = 0.5
assert_equal 0.5, @params.temperature
end
def test_max_initial_ts
assert_equal 1.0, @params.max_initial_ts
@params.max_initial_ts = 600.0
assert_equal 600.0, @params.max_initial_ts
end
def test_length_penalty
assert_equal -1.0, @params.length_penalty
@params.length_penalty = 0.5
assert_equal 0.5, @params.length_penalty
end
def test_temperature_inc
assert_in_delta 0.2, @params.temperature_inc
@params.temperature_inc = 0.5
assert_in_delta 0.5, @params.temperature_inc
end
def test_entropy_thold
assert_in_delta 2.4, @params.entropy_thold
@params.entropy_thold = 3.0
assert_in_delta 3.0, @params.entropy_thold
end
def test_logprob_thold
assert_in_delta -1.0, @params.logprob_thold
@params.logprob_thold = -0.5
assert_in_delta -0.5, @params.logprob_thold
end
end

View File

@ -1,83 +0,0 @@
require_relative "helper"
class TestSegment < TestBase
class << self
attr_reader :whisper
def startup
@whisper = Whisper::Context.new(TestBase::MODEL)
params = Whisper::Params.new
params.print_timestamps = false
@whisper.transcribe(TestBase::AUDIO, params)
end
end
def test_iteration
whisper.each_segment do |segment|
assert_instance_of Whisper::Segment, segment
end
end
def test_enumerator
enum = whisper.each_segment
assert_instance_of Enumerator, enum
enum.to_a.each_with_index do |segment, index|
assert_instance_of Whisper::Segment, segment
assert_kind_of Integer, index
end
end
def test_start_time
i = 0
whisper.each_segment do |segment|
assert_equal 0, segment.start_time if i == 0
i += 1
end
end
def test_end_time
i = 0
whisper.each_segment do |segment|
assert_equal whisper.full_get_segment_t1(i) * 10, segment.end_time
i += 1
end
end
def test_on_new_segment
params = Whisper::Params.new
seg = nil
index = 0
params.on_new_segment do |segment|
assert_instance_of Whisper::Segment, segment
if index == 0
seg = segment
assert_equal 0, segment.start_time
assert_match /ask not what your country can do for you, ask what you can do for your country/, segment.text
end
index += 1
end
whisper.transcribe(AUDIO, params)
assert_equal 0, seg.start_time
assert_match /ask not what your country can do for you, ask what you can do for your country/, seg.text
end
def test_on_new_segment_twice
params = Whisper::Params.new
seg = nil
params.on_new_segment do |segment|
seg = segment
return
end
params.on_new_segment do |segment|
assert_same seg, segment
return
end
whisper.transcribe(AUDIO, params)
end
private
def whisper
self.class.whisper
end
end

View File

@ -1,127 +1,131 @@
require_relative "helper" TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
require "stringio" EXTDIR = File.join(TOPDIR, 'ext')
#$LIBDIR = File.join(TOPDIR, 'lib')
#$:.unshift(LIBDIR)
$:.unshift(EXTDIR)
# Exists to detect memory-related bug require 'whisper'
Whisper.log_set ->(level, buffer, user_data) {}, nil require 'test/unit'
class TestWhisper < TestBase class TestWhisper < Test::Unit::TestCase
def setup def setup
@params = Whisper::Params.new @params = Whisper::Params.new
end end
def test_language
@params.language = "en"
assert_equal @params.language, "en"
@params.language = "auto"
assert_equal @params.language, "auto"
end
def test_offset
@params.offset = 10_000
assert_equal @params.offset, 10_000
@params.offset = 0
assert_equal @params.offset, 0
end
def test_duration
@params.duration = 60_000
assert_equal @params.duration, 60_000
@params.duration = 0
assert_equal @params.duration, 0
end
def test_max_text_tokens
@params.max_text_tokens = 300
assert_equal @params.max_text_tokens, 300
@params.max_text_tokens = 0
assert_equal @params.max_text_tokens, 0
end
def test_translate
@params.translate = true
assert @params.translate
@params.translate = false
assert !@params.translate
end
def test_no_context
@params.no_context = true
assert @params.no_context
@params.no_context = false
assert !@params.no_context
end
def test_single_segment
@params.single_segment = true
assert @params.single_segment
@params.single_segment = false
assert !@params.single_segment
end
def test_print_special
@params.print_special = true
assert @params.print_special
@params.print_special = false
assert !@params.print_special
end
def test_print_progress
@params.print_progress = true
assert @params.print_progress
@params.print_progress = false
assert !@params.print_progress
end
def test_print_realtime
@params.print_realtime = true
assert @params.print_realtime
@params.print_realtime = false
assert !@params.print_realtime
end
def test_print_timestamps
@params.print_timestamps = true
assert @params.print_timestamps
@params.print_timestamps = false
assert !@params.print_timestamps
end
def test_suppress_blank
@params.suppress_blank = true
assert @params.suppress_blank
@params.suppress_blank = false
assert !@params.suppress_blank
end
def test_suppress_non_speech_tokens
@params.suppress_non_speech_tokens = true
assert @params.suppress_non_speech_tokens
@params.suppress_non_speech_tokens = false
assert !@params.suppress_non_speech_tokens
end
def test_token_timestamps
@params.token_timestamps = true
assert @params.token_timestamps
@params.token_timestamps = false
assert !@params.token_timestamps
end
def test_split_on_word
@params.split_on_word = true
assert @params.split_on_word
@params.split_on_word = false
assert !@params.split_on_word
end
def test_whisper def test_whisper
@whisper = Whisper::Context.new(MODEL) @whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
params = Whisper::Params.new params = Whisper::Params.new
params.print_timestamps = false params.print_timestamps = false
@whisper.transcribe(AUDIO, params) {|text| jfk = File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav')
@whisper.transcribe(jfk, params) {|text|
assert_match /ask not what your country can do for you, ask what you can do for your country/, text assert_match /ask not what your country can do for you, ask what you can do for your country/, text
} }
end end
sub_test_case "After transcription" do
class << self
attr_reader :whisper
def startup
@whisper = Whisper::Context.new(TestBase::MODEL)
params = Whisper::Params.new
params.print_timestamps = false
@whisper.transcribe(TestBase::AUDIO, params)
end
end
def whisper
self.class.whisper
end
def test_full_n_segments
assert_equal 1, whisper.full_n_segments
end
def test_full_lang_id
assert_equal 0, whisper.full_lang_id
end
def test_full_get_segment_t0
assert_equal 0, whisper.full_get_segment_t0(0)
assert_raise IndexError do
whisper.full_get_segment_t0(whisper.full_n_segments)
end
assert_raise IndexError do
whisper.full_get_segment_t0(-1)
end
end
def test_full_get_segment_t1
t1 = whisper.full_get_segment_t1(0)
assert_kind_of Integer, t1
assert t1 > 0
assert_raise IndexError do
whisper.full_get_segment_t1(whisper.full_n_segments)
end
end
def test_full_get_segment_speaker_turn_next
assert_false whisper.full_get_segment_speaker_turn_next(0)
end
def test_full_get_segment_text
assert_match /ask not what your country can do for you, ask what you can do for your country/, whisper.full_get_segment_text(0)
end
end
def test_lang_max_id
assert_kind_of Integer, Whisper.lang_max_id
end
def test_lang_id
assert_equal 0, Whisper.lang_id("en")
assert_raise ArgumentError do
Whisper.lang_id("non existing language")
end
end
def test_lang_str
assert_equal "en", Whisper.lang_str(0)
assert_raise IndexError do
Whisper.lang_str(Whisper.lang_max_id + 1)
end
end
def test_lang_str_full
assert_equal "english", Whisper.lang_str_full(0)
assert_raise IndexError do
Whisper.lang_str_full(Whisper.lang_max_id + 1)
end
end
def test_log_set
user_data = Object.new
logs = []
log_callback = ->(level, buffer, udata) {
logs << [level, buffer, udata]
}
Whisper.log_set log_callback, user_data
Whisper::Context.new(MODEL)
assert logs.length > 30
logs.each do |log|
assert_equal Whisper::LOG_LEVEL_INFO, log[0]
assert_same user_data, log[2]
end
end
def test_log_suppress
stderr = $stderr
Whisper.log_set ->(level, buffer, user_data) {
# do nothing
}, nil
dev = StringIO.new("")
$stderr = dev
Whisper::Context.new(MODEL)
assert_empty dev.string
ensure
$stderr = stderr
end
end end

View File

@ -1,5 +1,3 @@
require "yaml"
Gem::Specification.new do |s| Gem::Specification.new do |s|
s.name = "whispercpp" s.name = "whispercpp"
s.authors = ["Georgi Gerganov", "Todd A. Fisher"] s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
@ -9,16 +7,10 @@ Gem::Specification.new do |s|
s.email = 'todd.fisher@gmail.com' s.email = 'todd.fisher@gmail.com'
s.extra_rdoc_files = ['LICENSE', 'README.md'] s.extra_rdoc_files = ['LICENSE', 'README.md']
s.files = `git ls-files . -z`.split("\x0") + s.files = ["LICENSE", "README.md", "Rakefile", "ext/extconf.rb", "ext/ggml.c", "ext/ruby_whisper.cpp", "ext/whisper.cpp", "ext/dr_wav.h", "ext/ggml.h", "ext/ruby_whisper.h", "ext/whisper.h"]
YAML.load_file("extsources.yaml").collect {|file|
basename = File.basename(file)
if s.extra_rdoc_files.include?(basename)
basename
else
File.join("ext", basename)
end
}
#### Load-time details
s.require_paths = ['lib','ext']
s.summary = %q{Ruby whisper.cpp bindings} s.summary = %q{Ruby whisper.cpp bindings}
s.test_files = ["tests/test_whisper.rb"] s.test_files = ["tests/test_whisper.rb"]

View File

@ -13,5 +13,5 @@ set_target_properties(${TARGET}
PROPERTIES PROPERTIES
EXPORT_COMPILE_COMMANDS ON EXPORT_COMPILE_COMMANDS ON
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin" RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib" INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib"
) )

View File

@ -1,7 +1,7 @@
set(WHISPER_VERSION @WHISPER_INSTALL_VERSION@) set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
set(WHISPER_BUILD_COMMIT @WHISPER_BUILD_COMMIT@) set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(WHISPER_BUILD_NUMBER @WHISPER_BUILD_NUMBER@) set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(WHISPER_SHARED_LIB @BUILD_SHARED_LIBS@) set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
set(GGML_BLAS @GGML_BLAS@) set(GGML_BLAS @GGML_BLAS@)
set(GGML_CUDA @GGML_CUDA@) set(GGML_CUDA @GGML_CUDA@)
@ -11,9 +11,9 @@ set(GGML_ACCELERATE @GGML_ACCELERATE@)
@PACKAGE_INIT@ @PACKAGE_INIT@
set_and_check(WHISPER_INCLUDE_DIR "@PACKAGE_WHISPER_INCLUDE_INSTALL_DIR@") set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
set_and_check(WHISPER_LIB_DIR "@PACKAGE_WHISPER_LIB_INSTALL_DIR@") set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
set_and_check(WHISPER_BIN_DIR "@PACKAGE_WHISPER_BIN_INSTALL_DIR@") set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
# Ensure transient dependencies satisfied # Ensure transient dependencies satisfied
@ -43,23 +43,23 @@ if (GGML_HIPBLAS)
find_package(rocblas REQUIRED) find_package(rocblas REQUIRED)
endif() endif()
find_library(whisper_LIBRARY whisper find_library(llama_LIBRARY llama
REQUIRED REQUIRED
HINTS ${WHISPER_LIB_DIR}) HINTS ${LLAMA_LIB_DIR})
set(_whisper_link_deps "Threads::Threads" "@WHISPER_EXTRA_LIBS@") set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
set(_whisper_transient_defines "@WHISPER_TRANSIENT_DEFINES@") set(_llama_transient_defines "@LLAMA_TRANSIENT_DEFINES@")
add_library(whisper UNKNOWN IMPORTED) add_library(llama UNKNOWN IMPORTED)
set_target_properties(whisper set_target_properties(llama
PROPERTIES PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${WHISPER_INCLUDE_DIR}" INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
INTERFACE_LINK_LIBRARIES "${_whisper_link_deps}" INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
INTERFACE_COMPILE_DEFINITIONS "${_whisper_transient_defines}" INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
IMPORTED_LOCATION "${whisper_LIBRARY}" IMPORTED_LOCATION "${llama_LIBRARY}"
INTERFACE_COMPILE_FEATURES cxx_std_11 INTERFACE_COMPILE_FEATURES cxx_std_11
POSITION_INDEPENDENT_CODE ON ) POSITION_INDEPENDENT_CODE ON )
check_required_components(whisper) check_required_components(Llama)

View File

@ -1,6 +1,6 @@
prefix=@CMAKE_INSTALL_PREFIX@ prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=${prefix} exec_prefix=${prefix}
libdir=@CMAKE_INSTALL_FULL_LIBDIR@ libdir=${exec_prefix}/lib
includedir=${prefix}/include includedir=${prefix}/include
Name: whisper Name: whisper

View File

@ -40,7 +40,7 @@ if (WHISPER_FFMPEG)
message(STATUS "Found ffmpeg libs: ${FFMPEG_LIBRARIES}") message(STATUS "Found ffmpeg libs: ${FFMPEG_LIBRARIES}")
message(STATUS "Found ffmpeg headers in: ${FFMPEG_INCLUDE_DIRS}") message(STATUS "Found ffmpeg headers in: ${FFMPEG_INCLUDE_DIRS}")
message(STATUS "ffmpeg definitions: ${FFMPEG_DEFINITIONS}") message(STATUS "ffmpeg definitions: ${FFMPEG_DEFINITIONS}")
message(STATUS "Found avformat ${AVFORMAT_VERSION}") message(STATUS "Found avformat ${AVFORMAT_VERSION}")
include_directories(${FFMPEG_INCLUDE_DIRS}) include_directories(${FFMPEG_INCLUDE_DIRS})
add_compile_definitions(WHISPER_FFMPEG) add_compile_definitions(WHISPER_FFMPEG)
@ -102,8 +102,8 @@ if (EMSCRIPTEN)
set_target_properties(libstream PROPERTIES FOLDER "libs") set_target_properties(libstream PROPERTIES FOLDER "libs")
add_subdirectory(command.wasm) add_subdirectory(command.wasm)
set_target_properties(libcommand PROPERTIES FOLDER "libs") set_target_properties(libcommand PROPERTIES FOLDER "libs")
#add_subdirectory(talk.wasm) add_subdirectory(talk.wasm)
#set_target_properties(libtalk PROPERTIES FOLDER "libs") set_target_properties(libtalk PROPERTIES FOLDER "libs")
add_subdirectory(bench.wasm) add_subdirectory(bench.wasm)
set_target_properties(libbench PROPERTIES FOLDER "libs") set_target_properties(libbench PROPERTIES FOLDER "libs")
elseif(CMAKE_JS_VERSION) elseif(CMAKE_JS_VERSION)
@ -127,17 +127,15 @@ endif (WHISPER_SDL2)
add_subdirectory(quantize) add_subdirectory(quantize)
set_target_properties(quantize PROPERTIES FOLDER "examples") set_target_properties(quantize PROPERTIES FOLDER "examples")
if (WHISPER_SDL2) if (WHISPER_SDL2)
# TODO: disabled until update add_subdirectory(talk)
# https://github.com/ggerganov/whisper.cpp/issues/1818 set_target_properties(talk PROPERTIES FOLDER "examples")
#add_subdirectory(talk)
#set_target_properties(talk PROPERTIES FOLDER "examples")
add_subdirectory(talk-llama) add_subdirectory(talk-llama)
set_target_properties(talk-llama PROPERTIES FOLDER "examples") set_target_properties(talk-llama PROPERTIES FOLDER "examples")
add_subdirectory(lsp) add_subdirectory(lsp)
set_target_properties(lsp PROPERTIES FOLDER "examples") set_target_properties(lsp PROPERTIES FOLDER "examples")
if (GGML_SYCL) if (GGML_SYCL)
add_subdirectory(sycl) add_subdirectory(sycl)
set_target_properties(ls-sycl-device PROPERTIES FOLDER "examples") set_target_properties(sycl PROPERTIES FOLDER "examples")
endif() endif()
endif (WHISPER_SDL2) endif (WHISPER_SDL2)
endif() endif()

View File

@ -72,9 +72,6 @@ bool ggml_common_quantize_0(
case GGML_FTYPE_MOSTLY_IQ4_XS: case GGML_FTYPE_MOSTLY_IQ4_XS:
case GGML_FTYPE_MOSTLY_IQ1_M: case GGML_FTYPE_MOSTLY_IQ1_M:
case GGML_FTYPE_MOSTLY_BF16: case GGML_FTYPE_MOSTLY_BF16:
case GGML_FTYPE_MOSTLY_Q4_0_4_4:
case GGML_FTYPE_MOSTLY_Q4_0_4_8:
case GGML_FTYPE_MOSTLY_Q4_0_8_8:
{ {
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype); fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
return false; return false;
@ -212,11 +209,6 @@ bool ggml_common_quantize_0(
case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ1_M:
case GGML_TYPE_BF16: case GGML_TYPE_BF16:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0:
case GGML_TYPE_COUNT: case GGML_TYPE_COUNT:
{ {
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));

View File

@ -147,6 +147,7 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
case 7: return "He"; case 7: return "He";
case 8: return "She"; case 8: return "She";
case 9: return "They"; case 9: return "They";
default: return "To";
} }
return "The"; return "The";

View File

@ -9,7 +9,6 @@
#include <thread> #include <thread>
#include <ctime> #include <ctime>
#include <fstream> #include <fstream>
#include <sstream>
#define COMMON_SAMPLE_RATE 16000 #define COMMON_SAMPLE_RATE 16000
@ -287,43 +286,12 @@ void sam_print_usage(int argc, char ** argv, const sam_params & params);
// Terminal utils // Terminal utils
// //
#define SQR(X) ((X) * (X))
#define UNCUBE(x) x < 48 ? 0 : x < 115 ? 1 : (x - 35) / 40
/** // Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
* Quantizes 24-bit RGB to xterm256 code range [16,256). // Lowest is red, middle is yellow, highest is green.
*/
static int rgb2xterm256(int r, int g, int b) {
unsigned char cube[] = {0, 0137, 0207, 0257, 0327, 0377};
int av, ir, ig, ib, il, qr, qg, qb, ql;
av = r * .299 + g * .587 + b * .114 + .5;
ql = (il = av > 238 ? 23 : (av - 3) / 10) * 10 + 8;
qr = cube[(ir = UNCUBE(r))];
qg = cube[(ig = UNCUBE(g))];
qb = cube[(ib = UNCUBE(b))];
if (SQR(qr - r) + SQR(qg - g) + SQR(qb - b) <=
SQR(ql - r) + SQR(ql - g) + SQR(ql - b))
return ir * 36 + ig * 6 + ib + 020;
return il + 0350;
}
static std::string set_xterm256_foreground(int r, int g, int b) {
int x = rgb2xterm256(r, g, b);
std::ostringstream oss;
oss << "\033[38;5;" << x << "m";
return oss.str();
}
// Lowest is red, middle is yellow, highest is green. Color scheme from
// Paul Tol; it is colorblind friendly https://personal.sron.nl/~pault/
const std::vector<std::string> k_colors = { const std::vector<std::string> k_colors = {
set_xterm256_foreground(220, 5, 12), "\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
set_xterm256_foreground(232, 96, 28), "\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
set_xterm256_foreground(241, 147, 45),
set_xterm256_foreground(246, 193, 65),
set_xterm256_foreground(247, 240, 86),
set_xterm256_foreground(144, 201, 135),
set_xterm256_foreground( 78, 178, 101),
}; };
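
As a sanity check on the new mapping: the first palette entry, rgb(220, 5, 12), quantizes to xterm colour 160 (#d70000), so its k_colors entry expands to "\033[38;5;160m". A usage sketch, assuming the helpers above are in scope:

#include <cstdio>
#include <string>

int main() {
    const std::string red   = set_xterm256_foreground(220, 5, 12); // "\033[38;5;160m"
    const std::string reset = "\033[0m";
    printf("%slow-confidence token%s\n", red.c_str(), reset.c_str());
    return 0;
}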
// //

File diff suppressed because it is too large

View File

@ -204,6 +204,8 @@ static int decode_audio(struct audio_buffer *audio_buf, s16 **data, int *size)
const size_t errbuffsize = 1024; const size_t errbuffsize = 1024;
char errbuff[errbuffsize]; char errbuff[errbuffsize];
av_register_all(); // from avformat. Still a must-have call for ffmpeg v3! (can be skipped for later versions)
fmt_ctx = avformat_alloc_context(); fmt_ctx = avformat_alloc_context();
avio_ctx_buffer = (u8*)av_malloc(AVIO_CTX_BUF_SZ); avio_ctx_buffer = (u8*)av_malloc(AVIO_CTX_BUF_SZ);
LOG("Creating an avio context: AVIO_CTX_BUF_SZ=%d\n", AVIO_CTX_BUF_SZ); LOG("Creating an avio context: AVIO_CTX_BUF_SZ=%d\n", AVIO_CTX_BUF_SZ);
@ -319,7 +321,7 @@ int ffmpeg_decode_audio(const std::string &ifname, std::vector<uint8_t>& owav_da
LOG("Couldn't map input file %s\n", ifname.c_str()); LOG("Couldn't map input file %s\n", ifname.c_str());
return err; return err;
} }
LOG("Mapped input file: %s size: %d\n", ibuf, (int) ibuf_size); LOG("Mapped input file: %x size: %d\n", ibuf, ibuf_size);
struct audio_buffer inaudio_buf; struct audio_buffer inaudio_buf;
inaudio_buf.ptr = ibuf; inaudio_buf.ptr = ibuf;
inaudio_buf.size = ibuf_size; inaudio_buf.size = ibuf_size;

View File

@ -48,7 +48,7 @@ if [ -n "$3" ]; then
fi fi
# Whisper models # Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" "large-v3-turbo" ) models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )
# list available models # list available models
function list_models { function list_models {

View File

@ -997,7 +997,6 @@ int main(int argc, char ** argv) {
if (params.dtw == "large.v1") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V1; if (params.dtw == "large.v1") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V1;
if (params.dtw == "large.v2") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V2; if (params.dtw == "large.v2") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V2;
if (params.dtw == "large.v3") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3; if (params.dtw == "large.v3") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3;
if (params.dtw == "large.v3.turbo") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3_TURBO;
if (cparams.dtw_aheads_preset == WHISPER_AHEADS_NONE) { if (cparams.dtw_aheads_preset == WHISPER_AHEADS_NONE) {
fprintf(stderr, "error: unknown DTW preset '%s'\n", params.dtw.c_str()); fprintf(stderr, "error: unknown DTW preset '%s'\n", params.dtw.c_str());

View File

@ -21,7 +21,7 @@ def process_audio(wav_file, model_name="base.en"):
if not os.path.exists(wav_file): if not os.path.exists(wav_file):
raise FileNotFoundError(f"WAV file not found: {wav_file}") raise FileNotFoundError(f"WAV file not found: {wav_file}")
full_command = f"./main -m {model} -f {wav_file} -nt" full_command = f"./main -m {model} -f {wav_file} -np -nt"
# Execute the command # Execute the command
process = subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) process = subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

View File

@ -34,7 +34,6 @@ struct server_params
std::string hostname = "127.0.0.1"; std::string hostname = "127.0.0.1";
std::string public_path = "examples/server/public"; std::string public_path = "examples/server/public";
std::string request_path = ""; std::string request_path = "";
std::string inference_path = "/inference";
int32_t port = 8080; int32_t port = 8080;
int32_t read_timeout = 600; int32_t read_timeout = 600;
@ -133,7 +132,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " --port PORT, [%-7d] Port number for the server\n", sparams.port); fprintf(stderr, " --port PORT, [%-7d] Port number for the server\n", sparams.port);
fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str()); fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str());
fprintf(stderr, " --request-path PATH, [%-7s] Request path for all requests\n", sparams.request_path.c_str()); fprintf(stderr, " --request-path PATH, [%-7s] Request path for all requests\n", sparams.request_path.c_str());
fprintf(stderr, " --inference-path PATH, [%-7s] Inference path for all requests\n", sparams.inference_path.c_str());
fprintf(stderr, " --convert, [%-7s] Convert audio to WAV, requires ffmpeg on the server", sparams.ffmpeg_converter ? "true" : "false"); fprintf(stderr, " --convert, [%-7s] Convert audio to WAV, requires ffmpeg on the server", sparams.ffmpeg_converter ? "true" : "false");
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
@ -184,7 +182,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
else if ( arg == "--host") { sparams.hostname = argv[++i]; } else if ( arg == "--host") { sparams.hostname = argv[++i]; }
else if ( arg == "--public") { sparams.public_path = argv[++i]; } else if ( arg == "--public") { sparams.public_path = argv[++i]; }
else if ( arg == "--request-path") { sparams.request_path = argv[++i]; } else if ( arg == "--request-path") { sparams.request_path = argv[++i]; }
else if ( arg == "--inference-path") { sparams.inference_path = argv[++i]; }
else if ( arg == "--convert") { sparams.ffmpeg_converter = true; } else if ( arg == "--convert") { sparams.ffmpeg_converter = true; }
else { else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@ -219,7 +216,7 @@ void check_ffmpeg_availibility() {
bool convert_to_wav(const std::string & temp_filename, std::string & error_resp) { bool convert_to_wav(const std::string & temp_filename, std::string & error_resp) {
std::ostringstream cmd_stream; std::ostringstream cmd_stream;
std::string converted_filename_temp = temp_filename + "_temp.wav"; std::string converted_filename_temp = temp_filename + "_temp.wav";
cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -y -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1"; cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
std::string cmd = cmd_stream.str(); std::string cmd = cmd_stream.str();
int status = std::system(cmd.c_str()); int status = std::system(cmd.c_str());
@ -647,10 +644,10 @@ int main(int argc, char ** argv) {
return false; return false;
}); });
svr.Options(sparams.request_path + sparams.inference_path, [&](const Request &, Response &){ svr.Options(sparams.request_path + "/inference", [&](const Request &, Response &){
}); });
svr.Post(sparams.request_path + sparams.inference_path, [&](const Request &req, Response &res){ svr.Post(sparams.request_path + "/inference", [&](const Request &req, Response &res){
// acquire whisper model mutex lock // acquire whisper model mutex lock
std::lock_guard<std::mutex> lock(whisper_mutex); std::lock_guard<std::mutex> lock(whisper_mutex);
@ -677,8 +674,7 @@ int main(int argc, char ** argv) {
if (sparams.ffmpeg_converter) { if (sparams.ffmpeg_converter) {
// if file is not wav, convert to wav // if file is not wav, convert to wav
// write to temporary file // write to temporary file
const std::string temp_filename_base = std::tmpnam(nullptr); const std::string temp_filename = "whisper_server_temp_file.wav";
const std::string temp_filename = temp_filename_base + ".wav";
std::ofstream temp_file{temp_filename, std::ios::binary}; std::ofstream temp_file{temp_filename, std::ios::binary};
temp_file << audio_file.content; temp_file << audio_file.content;
temp_file.close(); temp_file.close();

View File

@ -5,5 +5,5 @@
set(TARGET ls-sycl-device) set(TARGET ls-sycl-device)
add_executable(${TARGET} ls-sycl-device.cpp) add_executable(${TARGET} ls-sycl-device.cpp)
install(TARGETS ${TARGET} RUNTIME) install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17) target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -7,13 +7,10 @@ cd build
source /opt/intel/oneapi/setvars.sh source /opt/intel/oneapi/setvars.sh
#for FP16 #for FP16
#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DWHISPER_SYCL_F16=ON # faster for long-prompt inference #cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DWHISPER_SYCL_F16=ON # faster for long-prompt inference
#for FP32 #for FP32
cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#for other features from the examples, e.g. stream and talk link with SDL2:
#cmake .. -DGGML_SYCL=ON -DWHISPER_SDL2=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build example/main only #build example/main only
#cmake --build . --config Release --target main #cmake --build . --config Release --target main

View File

@ -1,13 +1,7 @@
if (WHISPER_SDL2) if (WHISPER_SDL2)
# talk-llama # talk-llama
set(TARGET talk-llama) set(TARGET talk-llama)
add_executable(${TARGET} talk-llama.cpp add_executable(${TARGET} talk-llama.cpp llama.cpp unicode.cpp unicode-data.cpp)
llama.cpp
llama-vocab.cpp
llama-grammar.cpp
llama-sampling.cpp
unicode.cpp
unicode-data.cpp)
target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS}) target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
if (WHISPER_CLBLAST) if (WHISPER_CLBLAST)

File diff suppressed because it is too large

View File

@ -1,144 +0,0 @@
#pragma once
#include "llama-impl.h"
#include <map>
struct llama_vocab;
// grammar element type
enum llama_gretype {
// end of rule definition
LLAMA_GRETYPE_END = 0,
// start of alternate definition for rule
LLAMA_GRETYPE_ALT = 1,
// non-terminal element: reference to rule
LLAMA_GRETYPE_RULE_REF = 2,
// terminal element: character (code point)
LLAMA_GRETYPE_CHAR = 3,
// inverse char(s) ([^a], [^a-b] [^abc])
LLAMA_GRETYPE_CHAR_NOT = 4,
// modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
// be an inclusive range ([a-z])
LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
// modifies a preceding LLAMA_GRETYPE_CHAR or
// LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
LLAMA_GRETYPE_CHAR_ALT = 6,
// any character (.)
LLAMA_GRETYPE_CHAR_ANY = 7,
};
typedef struct llama_grammar_element {
enum llama_gretype type;
uint32_t value; // Unicode code point or rule ID
} llama_grammar_element;
struct llama_partial_utf8 {
uint32_t value; // bit value so far (unshifted)
int n_remain; // num bytes remaining; -1 indicates invalid sequence
};
struct llama_grammar_candidate {
size_t index;
const uint32_t * code_points;
llama_partial_utf8 partial_utf8;
};
using llama_grammar_rule = std::vector< llama_grammar_element>;
using llama_grammar_stack = std::vector<const llama_grammar_element *>;
using llama_grammar_rules = std::vector<llama_grammar_rule>;
using llama_grammar_stacks = std::vector<llama_grammar_stack>;
using llama_grammar_candidates = std::vector<llama_grammar_candidate>;
const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar * grammar);
llama_grammar_stacks & llama_grammar_get_stacks( struct llama_grammar * grammar);
// takes a set of possible pushdown stacks on a grammar, which are required to
// be positioned at a character range (see `llama_grammar_advance_stack`), and
// produces the N possible stacks if the given char is accepted at those
// positions
void llama_grammar_accept(
const llama_grammar_rules & rules,
const llama_grammar_stacks & stacks,
uint32_t chr,
llama_grammar_stacks & stacks_new);
std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
const llama_grammar_rules & rules,
const llama_grammar_stack & stack,
const llama_grammar_candidates & candidates);
struct llama_grammar_parser {
std::map<std::string, uint32_t> symbol_ids;
llama_grammar_rules rules;
llama_grammar_stack c_rules() const;
uint32_t get_symbol_id(const char * src, size_t len);
uint32_t generate_symbol_id(const std::string & base_name);
void add_rule(uint32_t rule_id, const llama_grammar_rule & rule);
const char * parse_alternates(
const char * src,
const std::string & rule_name,
uint32_t rule_id,
bool is_nested);
const char * parse_sequence(
const char * src,
const std::string & rule_name,
llama_grammar_rule & rule,
bool is_nested);
const char * parse_rule(const char * src);
bool parse(const char * src);
void print(FILE * file);
};
struct llama_grammar {
// note: allow null vocab for testing (not great)
const llama_vocab * vocab;
const llama_grammar_rules rules; // TODO: shared ptr
llama_grammar_stacks stacks;
// buffer for partially generated UTF-8 sequence from accepted tokens
llama_partial_utf8 partial_utf8;
};
//
// internal API
//
// note: needed for tests (not great)
struct llama_grammar * llama_grammar_init_impl(
const struct llama_vocab * vocab,
const llama_grammar_element ** rules,
size_t n_rules,
size_t start_rule_index);
struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root);
void llama_grammar_free_impl(struct llama_grammar * grammar);
struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar);
// TODO: move the API below as member functions of llama_grammar
void llama_grammar_apply_impl(
const struct llama_grammar & grammar,
llama_token_data_array * cur_p);
void llama_grammar_accept_impl(
struct llama_grammar & grammar,
llama_token token);
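
To make the encoding in this header concrete, here is the rule root ::= "a" | "b" written out by hand as grammar elements (in practice llama_grammar_parser produces these from GBNF text; the null vocab is the testing escape hatch noted in struct llama_grammar):

static void grammar_example() {
    // root ::= "a" | "b"
    static const llama_grammar_element rule_root[] = {
        { LLAMA_GRETYPE_CHAR, 'a' }, // terminal: code point 'a'
        { LLAMA_GRETYPE_ALT,  0   }, // start of the alternate "b"
        { LLAMA_GRETYPE_CHAR, 'b' },
        { LLAMA_GRETYPE_END,  0   }, // end of rule definition
    };
    const llama_grammar_element * rules[] = { rule_root };

    struct llama_grammar * grammar = llama_grammar_init_impl(
        /*vocab            =*/ nullptr, // allowed for testing, see note above
        rules,
        /*n_rules          =*/ 1,
        /*start_rule_index =*/ 0);
    // ... sample with the grammar ...
    llama_grammar_free_impl(grammar);
}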

View File

@ -1,181 +0,0 @@
#pragma once
#include "llama.h"
#include <string>
#include <vector>
#include <stdexcept>
#ifdef __GNUC__
#ifdef __MINGW32__
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
#else
#define LLAMA_ATTRIBUTE_FORMAT(...)
#endif
//
// logging
//
LLAMA_ATTRIBUTE_FORMAT(2, 3)
void llama_log_internal (ggml_log_level level, const char * format, ...);
void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
#define LLAMA_LOG(...) llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
#define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#define LLAMA_LOG_CONT(...) llama_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
//
// helpers
//
struct time_meas {
time_meas(int64_t & t_acc, bool disable = false) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
~time_meas() {
if (t_start_us >= 0) {
t_acc += ggml_time_us() - t_start_us;
}
}
const int64_t t_start_us;
int64_t & t_acc;
};
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
if (search.empty()) {
return;
}
std::string builder;
builder.reserve(s.length());
size_t pos = 0;
size_t last_pos = 0;
while ((pos = s.find(search, last_pos)) != std::string::npos) {
builder.append(s, last_pos, pos - last_pos);
builder.append(replace);
last_pos = pos + search.length();
}
builder.append(s, last_pos, std::string::npos);
s = std::move(builder);
}
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
struct llama_context * ctx
);
// the ring buffer works similarly to std::deque, but with a fixed capacity
template<typename T>
struct ring_buffer {
ring_buffer(size_t cap) : capacity(cap), data(cap) {}
T & front() {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
return data[first];
}
const T & front() const {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
return data[first];
}
T & back() {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
return data[pos];
}
const T & back() const {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
return data[pos];
}
void push_back(const T & value) {
if (capacity == 0) {
throw std::runtime_error("ring buffer: capacity is zero");
}
if (sz == capacity) {
// advance the start when buffer is full
first = (first + 1) % capacity;
} else {
sz++;
}
data[pos] = value;
pos = (pos + 1) % capacity;
}
T pop_front() {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
T value = data[first];
first = (first + 1) % capacity;
sz--;
return value;
}
//T & operator[](size_t i) {
// if (i >= sz) {
// throw std::runtime_error("ring buffer: index out of bounds");
// }
// return data[(first + i) % capacity];
//}
//const T & at(size_t i) const {
// if (i >= sz) {
// throw std::runtime_error("ring buffer: index out of bounds");
// }
// return data[(first + i) % capacity];
//}
const T & rat(size_t i) const {
if (i >= sz) {
throw std::runtime_error("ring buffer: index out of bounds");
}
return data[(first + sz - i - 1) % capacity];
}
std::vector<T> to_vector() const {
std::vector<T> result;
result.reserve(sz);
for (size_t i = 0; i < sz; i++) {
result.push_back(data[(first + i) % capacity]);
}
return result;
}
void clear() {
// only reset the bookkeeping of the buffer; the stored elements are left in place
sz = 0;
first = 0;
pos = 0;
}
bool empty() const {
return sz == 0;
}
size_t size() const {
return sz;
}
size_t capacity = 0;
size_t sz = 0;
size_t first = 0;
size_t pos = 0;
std::vector<T> data;
};
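
A usage sketch for the ring_buffer above, keeping only the last three values:

static void ring_buffer_example() {
    ring_buffer<int> rb(3);
    rb.push_back(1);
    rb.push_back(2);
    rb.push_back(3);
    rb.push_back(4);              // full: the oldest element (1) is overwritten

    // rb.to_vector() == {2, 3, 4}    (oldest to newest)
    // rb.rat(0) == 4, rb.rat(2) == 2 (reverse access: newest first)
    int oldest = rb.pop_front();  // 2; size shrinks to 2
    (void) oldest;
}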

File diff suppressed because it is too large

View File

@ -1,48 +0,0 @@
#pragma once
// TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ?
#include "llama-grammar.h"
struct llama_vocab;
struct llama_grammar;
// sampler chain
struct llama_sampler_chain {
llama_sampler_chain_params params;
std::vector<struct llama_sampler *> samplers;
// timing
mutable int64_t t_sample_us;
mutable int32_t n_sample;
};
struct llama_sampler * llama_sampler_init_grammar_impl(
const struct llama_vocab & vocab,
const char * grammar_str,
const char * grammar_root);
struct llama_sampler * llama_sampler_init_infill_impl(
const struct llama_vocab & vocab);
struct llama_sampler * llama_sampler_init_dry_impl(
const struct llama_vocab & vocab,
int32_t context_size,
float dry_multiplier,
float dry_base,
int32_t dry_allowed_length,
int32_t dry_penalty_last_n,
const char ** seq_breakers,
size_t num_breakers);
struct llama_sampler * llama_sampler_init_dry_testing(
int32_t context_size,
float dry_multiplier,
float dry_base,
int32_t dry_allowed_length,
int32_t dry_penalty_last_n,
const std::vector<std::vector<llama_token>>& seq_breakers);
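Conceptually, a chain applies its samplers to the candidate array in order; a sketch of that idea using the public llama_sampler_apply from llama.h (the real implementation, including the t_sample_us/n_sample bookkeeping, lives in llama-sampling.cpp):

static void chain_apply_sketch(llama_sampler_chain & chain, llama_token_data_array * cur_p) {
    for (llama_sampler * smpl : chain.samplers) {
        llama_sampler_apply(smpl, cur_p); // each stage filters or reweights the candidates
    }
}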

File diff suppressed because it is too large

View File

@ -1,170 +0,0 @@
#pragma once
#include "llama-impl.h"
#include <string>
#include <vector>
#include <unordered_map>
#include <map>
#include <set>
struct llm_tokenizer;
struct llama_vocab {
using id = llama_token;
using token = std::string;
using tattr = llama_token_attr;
struct token_data {
token text;
float score;
tattr attr;
};
uint32_t n_vocab = 0; // TODO: not great because it has to be kept in sync with hparams.n_vocab
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
int max_token_len = 0; // used for optimizing longest token search
std::unordered_map<token, id> token_to_id;
std::vector<token_data> id_to_token;
std::vector<id> cache_special_tokens;
std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
// default LLaMA special tokens
// TODO: should we set all of these to LLAMA_TOKEN_NULL?
id special_bos_id = 1;
id special_eos_id = 2;
id special_eot_id = LLAMA_TOKEN_NULL;
id special_eom_id = LLAMA_TOKEN_NULL;
id special_unk_id = 0;
id special_sep_id = LLAMA_TOKEN_NULL;
id special_pad_id = LLAMA_TOKEN_NULL;
id special_cls_id = LLAMA_TOKEN_NULL;
id special_mask_id = LLAMA_TOKEN_NULL;
id linefeed_id = 13;
// fim tokens
id special_fim_pre_id = LLAMA_TOKEN_NULL;
id special_fim_suf_id = LLAMA_TOKEN_NULL;
id special_fim_mid_id = LLAMA_TOKEN_NULL;
id special_fim_pad_id = LLAMA_TOKEN_NULL;
id special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
id special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
// set of all tokens that cause "end of generation"
std::set<id> special_eog_ids;
// tokenizer flags
bool tokenizer_add_space_prefix = false;
bool tokenizer_add_bos = false;
bool tokenizer_add_eos = false;
bool tokenizer_ignore_merges = false;
bool tokenizer_clean_spaces = false; // clean_up_tokenization_spaces
bool tokenizer_remove_extra_whitespaces = false;
bool tokenizer_escape_whitespaces = true;
bool tokenizer_treat_whitespace_as_suffix = false;
std::vector<char> precompiled_charsmap;
llm_tokenizer * tokenizer = nullptr;
llama_vocab() = default;
~llama_vocab();
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
void init_tokenizer();
};
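token_to_id and id_to_token are mirrored, so both lookup directions avoid a scan. A hedged round-trip sketch, assuming vocab is an already loaded llama_vocab:

const auto it = vocab.token_to_id.find("hello");
if (it != vocab.token_to_id.end()) {
    const llama_vocab::id tok = it->second;
    const llama_vocab::token_data & td = vocab.id_to_token[tok];
    // td.text == "hello"; td.score and td.attr carry the per-token metadata
}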
//
// internal API
//
// TODO: rename to llama_tokenize_impl
// TODO: This should probably be in llama.h
std::vector<llama_vocab::id> llama_tokenize_internal(
const llama_vocab & vocab,
std::string raw_text,
bool add_special,
bool parse_special = false);
// TODO: move the API below as member functions of llama_vocab
llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch);
const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token);
float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token);
llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token);
bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token);
bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token);
llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
llama_token llama_token_eot_impl(const struct llama_vocab & vocab);
llama_token llama_token_eom_impl(const struct llama_vocab & vocab);
llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab);
bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
int32_t llama_tokenize_impl(
const struct llama_vocab & vocab,
const char * text,
int32_t text_len,
llama_token * tokens,
int32_t n_tokens_max,
bool add_special,
bool parse_special);
// does not write null-terminator to buf
int32_t llama_token_to_piece_impl(
const struct llama_vocab & vocab,
llama_token token,
char * buf,
int32_t length,
int32_t lstrip,
bool special);
// check if token0 is contained as a prefix in token1
bool llama_token_is_prefix_impl(
const struct llama_vocab & vocab,
llama_token token0,
llama_token token1);
int32_t llama_detokenize_impl(
const struct llama_vocab & vocab,
const llama_token * tokens,
int32_t n_tokens,
char * text,
int32_t text_len_max,
bool remove_special,
bool unparse_special);
std::string llama_detokenize(
const struct llama_vocab & vocab,
const std::vector<llama_token> & tokens,
bool special);
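Taken together, the declarations above support a simple round trip: tokenize to ids, then detokenize back to text. A minimal sketch, assuming vocab is loaded (the exact output depends on the tokenizer_add_bos/tokenizer_clean_spaces flags):

std::vector<llama_vocab::id> toks = llama_tokenize_internal(vocab, "Hello world", /*add_special=*/true);
std::string text = llama_detokenize(vocab, toks, /*special=*/false);
// text is "Hello world" again, modulo BOS and whitespace normalization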

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -35,10 +35,10 @@ static std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const
static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
std::vector<char> result(8, 0); std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false); const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
if (n_tokens < 0) { if (n_tokens < 0) {
result.resize(-n_tokens); result.resize(-n_tokens);
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false); int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
GGML_ASSERT(check == -n_tokens); GGML_ASSERT(check == -n_tokens);
} else { } else {
result.resize(n_tokens); result.resize(n_tokens);
@ -314,6 +314,7 @@ int main(int argc, char ** argv) {
// tune these to your liking // tune these to your liking
lcparams.n_ctx = 2048; lcparams.n_ctx = 2048;
lcparams.seed = 1;
lcparams.n_threads = params.n_threads; lcparams.n_threads = params.n_threads;
lcparams.flash_attn = params.flash_attn; lcparams.flash_attn = params.flash_attn;
@ -401,26 +402,6 @@ int main(int argc, char ** argv) {
llama_batch batch = llama_batch_init(llama_n_ctx(ctx_llama), 0, 1); llama_batch batch = llama_batch_init(llama_n_ctx(ctx_llama), 0, 1);
// init sampler
const float top_k = 5;
const float top_p = 0.80f;
const float temp = 0.30f;
const int seed = 0;
auto sparams = llama_sampler_chain_default_params();
llama_sampler * smpl = llama_sampler_chain_init(sparams);
if (temp > 0.0f) {
llama_sampler_chain_add(smpl, llama_sampler_init_top_k(top_k));
llama_sampler_chain_add(smpl, llama_sampler_init_top_p(top_p, 1));
llama_sampler_chain_add(smpl, llama_sampler_init_temp (temp));
llama_sampler_chain_add(smpl, llama_sampler_init_dist (seed));
} else {
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
}
// init session // init session
std::string path_session = params.path_session; std::string path_session = params.path_session;
std::vector<llama_token> session_tokens; std::vector<llama_token> session_tokens;
@ -436,7 +417,7 @@ int main(int argc, char ** argv) {
session_tokens.resize(llama_n_ctx(ctx_llama)); session_tokens.resize(llama_n_ctx(ctx_llama));
size_t n_token_count_out = 0; size_t n_token_count_out = 0;
if (!llama_state_load_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { if (!llama_load_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
return 1; return 1;
} }
@ -719,13 +700,54 @@ int main(int argc, char ** argv) {
{ {
// out of user input, sample next token // out of user input, sample next token
const float top_k = 5;
const float top_p = 0.80f;
const float temp = 0.30f;
const float repeat_penalty = 1.1764f;
const int repeat_last_n = 256;
if (!path_session.empty() && need_to_save_session) { if (!path_session.empty() && need_to_save_session) {
need_to_save_session = false; need_to_save_session = false;
llama_state_save_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size()); llama_save_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size());
} }
const llama_token id = llama_sampler_sample(smpl, ctx_llama, -1); llama_token id = 0;
{
auto logits = llama_get_logits(ctx_llama);
auto n_vocab = llama_n_vocab(model_llama);
logits[llama_token_eos(model_llama)] = 0;
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// apply repeat penalty
const float nl_logit = logits[llama_token_nl(model_llama)];
llama_sample_repetition_penalties(ctx_llama, &candidates_p,
embd_inp.data() + std::max(0, n_past - repeat_last_n),
repeat_last_n, repeat_penalty, 0.0, 0.0f);
logits[llama_token_nl(model_llama)] = nl_logit;
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx_llama, &candidates_p);
} else {
// Temperature sampling
llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
llama_sample_temp (ctx_llama, &candidates_p, temp);
id = llama_sample_token(ctx_llama, &candidates_p);
}
}
if (id != llama_token_eos(model_llama)) { if (id != llama_token_eos(model_llama)) {
// add it to the context // add it to the context
@ -775,14 +797,8 @@ int main(int argc, char ** argv) {
whisper_print_timings(ctx_wsp); whisper_print_timings(ctx_wsp);
whisper_free(ctx_wsp); whisper_free(ctx_wsp);
llama_perf_sampler_print(smpl); llama_print_timings(ctx_llama);
llama_perf_context_print(ctx_llama);
llama_sampler_free(smpl);
llama_batch_free(batch);
llama_free(ctx_llama); llama_free(ctx_llama);
llama_backend_free();
return 0; return 0;
} }
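The hunk above swaps the hand-rolled candidates/penalty loop for the llama_sampler chain API: the chain is built once and each generation step collapses to a single llama_sampler_sample call. Condensed from the calls shown in the diff:

const float top_k = 5;
const float top_p = 0.80f;
const float temp  = 0.30f;
const int   seed  = 0;

auto sparams = llama_sampler_chain_default_params();
llama_sampler * smpl = llama_sampler_chain_init(sparams);
llama_sampler_chain_add(smpl, llama_sampler_init_top_k(top_k));
llama_sampler_chain_add(smpl, llama_sampler_init_top_p(top_p, 1));
llama_sampler_chain_add(smpl, llama_sampler_init_temp (temp));
llama_sampler_chain_add(smpl, llama_sampler_init_dist (seed));

// per generation step:
const llama_token id = llama_sampler_sample(smpl, ctx_llama, -1);

// at shutdown:
llama_sampler_free(smpl);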

View File

@ -7,7 +7,7 @@
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1 const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1
{0x000000, 0x0080}, {0x000000, 0x0080},
{0x000020, 0x0008}, {0x000020, 0x0008},
{0x000021, 0x0020}, {0x000021, 0x0020},
@ -2311,8 +2311,7 @@ const std::unordered_set<uint32_t> unicode_set_whitespace = {
0x003000, 0x003000,
}; };
// list is always in ascending order, to enable binary search const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {
const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase = {
{0x000041, 0x000061}, {0x000041, 0x000061},
{0x000042, 0x000062}, {0x000042, 0x000062},
{0x000043, 0x000063}, {0x000043, 0x000063},
@ -3748,8 +3747,7 @@ const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase
{0x01E921, 0x01E943}, {0x01E921, 0x01E943},
}; };
// list is always in ascending order, to enable binary search const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {
const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase = {
{0x000061, 0x000041}, {0x000061, 0x000041},
{0x000062, 0x000042}, {0x000062, 0x000042},
{0x000063, 0x000043}, {0x000063, 0x000043},
@ -5202,7 +5200,7 @@ const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase
{0x01E943, 0x01E921}, {0x01E943, 0x01E921},
}; };
const std::initializer_list<range_nfd> unicode_ranges_nfd = { // start, last, nfd const std::vector<range_nfd> unicode_ranges_nfd = { // start, last, nfd
{0x000000, 0x000000, 0x000000}, {0x000000, 0x000000, 0x000000},
{0x0000C0, 0x0000C5, 0x000041}, {0x0000C0, 0x0000C5, 0x000041},
{0x0000C7, 0x0000C7, 0x000043}, {0x0000C7, 0x0000C7, 0x000043},
@ -7032,3 +7030,4 @@ const std::initializer_list<range_nfd> unicode_ranges_nfd = { // start, last, n
{0x02FA1C, 0x02FA1C, 0x009F3B}, {0x02FA1C, 0x02FA1C, 0x009F3B},
{0x02FA1D, 0x02FA1D, 0x02A600}, {0x02FA1D, 0x02FA1D, 0x02A600},
}; };
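The recurring comment "list is always in ascending order, to enable binary search" is the invariant these tables rely on: keeping the pairs sorted by code point lets lookups use std::lower_bound over the flat array instead of a hash map. A sketch of that lookup (mirroring the unicode_tolower implementation shown further below; needs <algorithm>):

static uint32_t map_lookup(const std::initializer_list<std::pair<uint32_t, uint32_t>> & table, uint32_t cp) {
    auto it = std::lower_bound(table.begin(), table.end(), cp,
        [](const std::pair<uint32_t, uint32_t> & p, uint32_t value) {
            return p.first < value;
        });
    return (it != table.end() && it->first == cp) ? it->second : cp; // fall back to cp when unmapped
}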

View File

@ -13,8 +13,8 @@ struct range_nfd {
static const uint32_t MAX_CODEPOINTS = 0x110000; static const uint32_t MAX_CODEPOINTS = 0x110000;
extern const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags; extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
extern const std::unordered_set<uint32_t> unicode_set_whitespace; extern const std::unordered_set<uint32_t> unicode_set_whitespace;
extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase; extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase; extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
extern const std::initializer_list<range_nfd> unicode_ranges_nfd; extern const std::vector<range_nfd> unicode_ranges_nfd;

View File

@ -1,11 +1,6 @@
#if defined(_MSC_VER)
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif
#include "unicode.h" #include "unicode.h"
#include "unicode-data.h" #include "unicode-data.h"
#include <algorithm>
#include <cassert> #include <cassert>
#include <cstddef> #include <cstddef>
#include <cstdint> #include <cstdint>
@ -20,12 +15,6 @@
#include <locale> #include <locale>
#include <codecvt> #include <codecvt>
size_t unicode_len_utf8(char src) {
const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
uint8_t highbits = static_cast<uint8_t>(src) >> 4;
return lookup[highbits];
}
static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) { static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
std::string result; std::string result;
for (size_t i = 0; i < cps.size(); ++i) { for (size_t i = 0; i < cps.size(); ++i) {
@ -34,7 +23,7 @@ static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
return result; return result;
} }
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) { static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
assert(offset < utf8.size()); assert(offset < utf8.size());
if (!(utf8[offset + 0] & 0x80)) { if (!(utf8[offset + 0] & 0x80)) {
auto result = utf8[offset + 0]; auto result = utf8[offset + 0];
@ -123,11 +112,11 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
static std::vector<codepoint_flags> unicode_cpt_flags_array() { static std::vector<codepoint_flags> unicode_cpt_flags_array() {
std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED); std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
assert (unicode_ranges_flags.begin()[0].first == 0); assert (unicode_ranges_flags.front().first == 0);
assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS); assert (unicode_ranges_flags.back().first == MAX_CODEPOINTS);
for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) { for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) {
const auto range_ini = unicode_ranges_flags.begin()[i-1]; // codepoint_ini, flags const auto range_ini = unicode_ranges_flags[i-1]; // codepoint_ini, flags
const auto range_end = unicode_ranges_flags.begin()[i]; // codepoint_end, flags const auto range_end = unicode_ranges_flags[i]; // codepoint_end, flags
for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) { for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) {
cpt_flags[cpt] = range_ini.second; cpt_flags[cpt] = range_ini.second;
} }
@ -243,7 +232,8 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
}; };
auto _get_flags = [&] (const size_t pos) -> codepoint_flags { auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{}; static const codepoint_flags undef(codepoint_flags::UNDEFINED);
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
}; };
size_t _prev_end = offset_ini; size_t _prev_end = offset_ini;
@ -305,9 +295,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
continue; continue;
} }
// regex: <space>?[^\s\p{L}\p{N}]+ // regex: <space>?[^\s\p{L}\p{N}]+
if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) { if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
pos += (cpt == ' '); pos += (cpt == ' ');
while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) { while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
flags2 = _get_flags(++pos); flags2 = _get_flags(++pos);
} }
_add_token(pos); _add_token(pos);
@ -361,7 +351,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
}; };
auto _get_flags = [&] (const size_t pos) -> codepoint_flags { auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{}; static const codepoint_flags undef(codepoint_flags::UNDEFINED);
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
}; };
size_t _prev_end = offset_ini; size_t _prev_end = offset_ini;
@ -403,8 +394,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
} }
} }
// regex: [^\r\n\p{L}\p{N}]?\p{L}+ // regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) { if (!(cpt == '\r' || cpt == '\n' || /*flags.is_letter |*/ flags.is_number)) {
if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters
pos++; pos++;
while (_get_flags(pos).is_letter) { while (_get_flags(pos).is_letter) {
@ -430,9 +421,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
// regex: <space>?[^\s\p{L}\p{N}]+[\r\n]* // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags); auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags.as_uint()) { if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
pos += (cpt == ' '); pos += (cpt == ' ');
while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) { while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
flags2 = _get_flags(++pos); flags2 = _get_flags(++pos);
} }
uint32_t cpt2 = _get_cpt(pos); uint32_t cpt2 = _get_cpt(pos);
@ -597,7 +588,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
std::vector<uint32_t> result(cpts.size()); std::vector<uint32_t> result(cpts.size());
for (size_t i = 0; i < cpts.size(); ++i) { for (size_t i = 0; i < cpts.size(); ++i) {
const uint32_t cpt = cpts[i]; const uint32_t cpt = cpts[i];
auto it = std::upper_bound(unicode_ranges_nfd.begin(), unicode_ranges_nfd.end(), cpt, comp) - 1; auto it = std::upper_bound(unicode_ranges_nfd.cbegin(), unicode_ranges_nfd.cend(), cpt, comp) - 1;
result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt; result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt;
} }
return result; return result;
@ -639,15 +630,8 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
} }
uint32_t unicode_tolower(uint32_t cp) { uint32_t unicode_tolower(uint32_t cp) {
// binary search auto it = unicode_map_lowercase.find(cp);
auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cp, return it == unicode_map_lowercase.end() ? cp : it->second;
[](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
return pair.first < value;
});
if (it != unicode_map_lowercase.end() && it->first == cp) {
return it->second;
}
return cp; // Return the original code point if no lowercase mapping is found
} }
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) { std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {

View File

@ -4,8 +4,6 @@
#include <string> #include <string>
#include <vector> #include <vector>
// TODO: prefix all symbols with "llama_"
struct codepoint_flags { struct codepoint_flags {
enum { enum {
UNDEFINED = 0x0001, UNDEFINED = 0x0001,
@ -48,10 +46,8 @@ struct codepoint_flags {
} }
}; };
size_t unicode_len_utf8(char src);
std::string unicode_cpt_to_utf8(uint32_t cp); std::string unicode_cpt_to_utf8(uint32_t cp);
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8); std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts); std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);

View File

@ -21,7 +21,7 @@ help()
echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]" echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]"
echo "options:" echo "options:"
echo "-s Step in seconds (default is $step)." echo "-s Step in seconds (default is $step)."
echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large-v3' 'large-v3-turbo' (default is '$model')." echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large-v3' (default is '$model')."
echo "-t Number of threads to use." echo "-t Number of threads to use."
echo "-h Print this help page." echo "-h Print this help page."
echo echo

View File

@ -7,9 +7,8 @@ set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../../)
set(SOURCE_FILES set(SOURCE_FILES
${WHISPER_LIB_DIR}/ggml/src/ggml.c ${WHISPER_LIB_DIR}/ggml/src/ggml.c
${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c ${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.c
${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c ${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
${WHISPER_LIB_DIR}/src/whisper.cpp ${WHISPER_LIB_DIR}/src/whisper.cpp
${CMAKE_SOURCE_DIR}/jni.c ${CMAKE_SOURCE_DIR}/jni.c

View File

@ -19,16 +19,9 @@ if (NOT GGML_HOME)
SOURCE_FILES SOURCE_FILES
${SOURCE_FILES} ${SOURCE_FILES}
${WHISPER_LIB_DIR}/ggml/src/ggml.c ${WHISPER_LIB_DIR}/ggml/src/ggml.c
${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c ${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.c
${WHISPER_LIB_DIR}/ggml/src/ggml-backend-reg.cpp
${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c ${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
${WHISPER_LIB_DIR}/ggml/src/ggml-threading.cpp
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu.c
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu.cpp
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu-quants.c
) )
endif() endif()

View File

@ -7,7 +7,6 @@
objects = { objects = {
/* Begin PBXBuildFile section */ /* Begin PBXBuildFile section */
18133C802C64E342005CEAAC /* ggml-aarch64.c in Sources */ = {isa = PBXBuildFile; fileRef = 18133C7F2C64E342005CEAAC /* ggml-aarch64.c */; };
1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 184447182AB211A2007D6BFE /* ggml-alloc.c */; }; 1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 184447182AB211A2007D6BFE /* ggml-alloc.c */; };
1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 1844471B2AB21655007D6BFE /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-framework Foundation -framework Metal -framework MetalKit -fno-objc-arc"; }; }; 1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 1844471B2AB21655007D6BFE /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-framework Foundation -framework Metal -framework MetalKit -fno-objc-arc"; }; };
18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C7A29052BDF00BD2A04 /* AppDelegate.m */; }; 18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C7A29052BDF00BD2A04 /* AppDelegate.m */; };
@ -22,14 +21,8 @@
18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; }; 18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; };
18A276062C2A98A5001C8D37 /* ggml-metal.metal in Copy Files */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; }; 18A276062C2A98A5001C8D37 /* ggml-metal.metal in Copy Files */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
18A2760B2C2A9B43001C8D37 /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; }; 18A2760B2C2A9B43001C8D37 /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.cpp */; }; 18ABE15A2AF556340044A204 /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.c */; };
18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; }; 18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; };
18E864A92CE73C1E0094B8B3 /* ggml-cpu.c in Sources */ = {isa = PBXBuildFile; fileRef = 18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */; };
18F8C0BC2CEDF4DC00CAD607 /* ggml-threading.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0BB2CEDF4DC00CAD607 /* ggml-threading.cpp */; };
18F8C0BE2CEDF50700CAD607 /* ggml-cpu.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0BD2CEDF50700CAD607 /* ggml-cpu.cpp */; };
18F8C0C42CEDF52700CAD607 /* ggml-cpu-aarch64.c in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.c */; };
18F8C0C52CEDF52700CAD607 /* ggml-cpu-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */; };
18F8C0C72CEDF7AB00CAD607 /* ggml-backend-reg.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */; };
7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; }; 7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; };
7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */; }; 7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */; };
7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE3424A2A0C3FA20015A058 /* whisper-decoder-impl.m */; }; 7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE3424A2A0C3FA20015A058 /* whisper-decoder-impl.m */; };
@ -51,12 +44,10 @@
/* End PBXCopyFilesBuildPhase section */ /* End PBXCopyFilesBuildPhase section */
/* Begin PBXFileReference section */ /* Begin PBXFileReference section */
18133C7E2C64E342005CEAAC /* ggml-aarch64.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-aarch64.h"; path = "../../../ggml/src/ggml-aarch64.h"; sourceTree = "<group>"; };
18133C7F2C64E342005CEAAC /* ggml-aarch64.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-aarch64.c"; path = "../../../ggml/src/ggml-aarch64.c"; sourceTree = "<group>"; };
184447182AB211A2007D6BFE /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../../ggml/src/ggml-alloc.c"; sourceTree = "<group>"; }; 184447182AB211A2007D6BFE /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../../ggml/src/ggml-alloc.c"; sourceTree = "<group>"; };
184447192AB211A2007D6BFE /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../../ggml/include/ggml-alloc.h"; sourceTree = "<group>"; }; 184447192AB211A2007D6BFE /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../../ggml/include/ggml-alloc.h"; sourceTree = "<group>"; };
1844471B2AB21655007D6BFE /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../../ggml/src/ggml-metal/ggml-metal.m"; sourceTree = "<group>"; }; 1844471B2AB21655007D6BFE /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../../ggml/src/ggml-metal.m"; sourceTree = "<group>"; };
1844471D2AB2195F007D6BFE /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../../ggml/src/ggml-metal/ggml-metal.metal"; sourceTree = "<group>"; }; 1844471D2AB2195F007D6BFE /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../../ggml/src/ggml-metal.metal"; sourceTree = "<group>"; };
18627C7629052BDF00BD2A04 /* whisper.objc.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = whisper.objc.app; sourceTree = BUILT_PRODUCTS_DIR; }; 18627C7629052BDF00BD2A04 /* whisper.objc.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = whisper.objc.app; sourceTree = BUILT_PRODUCTS_DIR; };
18627C7929052BDF00BD2A04 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; }; 18627C7929052BDF00BD2A04 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
18627C7A29052BDF00BD2A04 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = "<group>"; }; 18627C7A29052BDF00BD2A04 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = "<group>"; };
@ -79,20 +70,9 @@
18ABE1542AF556340044A204 /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../../ggml/src/ggml-quants.h"; sourceTree = "<group>"; }; 18ABE1542AF556340044A204 /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../../ggml/src/ggml-quants.h"; sourceTree = "<group>"; };
18ABE1552AF556340044A204 /* ggml-backend.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../../ggml/include/ggml-backend.h"; sourceTree = "<group>"; }; 18ABE1552AF556340044A204 /* ggml-backend.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../../ggml/include/ggml-backend.h"; sourceTree = "<group>"; };
18ABE1562AF556340044A204 /* ggml-backend-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend-impl.h"; path = "../../../ggml/src/ggml-backend-impl.h"; sourceTree = "<group>"; }; 18ABE1562AF556340044A204 /* ggml-backend-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend-impl.h"; path = "../../../ggml/src/ggml-backend-impl.h"; sourceTree = "<group>"; };
18ABE1572AF556340044A204 /* ggml-backend.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.cpp; fileEncoding = 4; name = "ggml-backend.cpp"; path = "../../../ggml/src/ggml-backend.cpp"; sourceTree = "<group>"; }; 18ABE1572AF556340044A204 /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../../ggml/src/ggml-backend.c"; sourceTree = "<group>"; };
18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; }; 18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; }; 18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu.c"; sourceTree = "<group>"; };
18E864AA2CE73C580094B8B3 /* ggml-cpu.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu.h"; path = "../../../ggml/include/ggml-cpu.h"; sourceTree = "<group>"; };
18F8C0BA2CEDF4DC00CAD607 /* ggml-threading.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-threading.h"; path = "../../../ggml/src/ggml-threading.h"; sourceTree = "<group>"; };
18F8C0BB2CEDF4DC00CAD607 /* ggml-threading.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-threading.cpp"; path = "../../../ggml/src/ggml-threading.cpp"; sourceTree = "<group>"; };
18F8C0BD2CEDF50700CAD607 /* ggml-cpu.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-cpu.cpp"; path = "../../../ggml/src/ggml-cpu/ggml-cpu.cpp"; sourceTree = "<group>"; };
18F8C0BF2CEDF52700CAD607 /* ggml-cpu-aarch64.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-aarch64.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-aarch64.h"; sourceTree = "<group>"; };
18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu-aarch64.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-aarch64.c"; sourceTree = "<group>"; };
18F8C0C12CEDF52700CAD607 /* ggml-cpu-impl.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-impl.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-impl.h"; sourceTree = "<group>"; };
18F8C0C22CEDF52700CAD607 /* ggml-cpu-quants.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-quants.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-quants.h"; sourceTree = "<group>"; };
18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu-quants.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-quants.c"; sourceTree = "<group>"; };
18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-backend-reg.cpp"; path = "../../../ggml/src/ggml-backend-reg.cpp"; sourceTree = "<group>"; };
7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-encoder-impl.m"; sourceTree = "<group>"; }; 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-encoder-impl.m"; sourceTree = "<group>"; };
7FE342462A0C3FA20015A058 /* whisper-encoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "whisper-encoder.h"; sourceTree = "<group>"; }; 7FE342462A0C3FA20015A058 /* whisper-encoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "whisper-encoder.h"; sourceTree = "<group>"; };
7FE342472A0C3FA20015A058 /* whisper-encoder.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = "whisper-encoder.mm"; sourceTree = "<group>"; }; 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = "whisper-encoder.mm"; sourceTree = "<group>"; };
@ -132,23 +112,10 @@
18627C7829052BDF00BD2A04 /* whisper.objc */ = { 18627C7829052BDF00BD2A04 /* whisper.objc */ = {
isa = PBXGroup; isa = PBXGroup;
children = ( children = (
18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */,
18F8C0BF2CEDF52700CAD607 /* ggml-cpu-aarch64.h */,
18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.c */,
18F8C0C12CEDF52700CAD607 /* ggml-cpu-impl.h */,
18F8C0C22CEDF52700CAD607 /* ggml-cpu-quants.h */,
18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */,
18F8C0BD2CEDF50700CAD607 /* ggml-cpu.cpp */,
18F8C0BA2CEDF4DC00CAD607 /* ggml-threading.h */,
18F8C0BB2CEDF4DC00CAD607 /* ggml-threading.cpp */,
18E864AA2CE73C580094B8B3 /* ggml-cpu.h */,
18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */,
18133C7F2C64E342005CEAAC /* ggml-aarch64.c */,
18133C7E2C64E342005CEAAC /* ggml-aarch64.h */,
18A275FF2C2A9563001C8D37 /* ggml-common.h */, 18A275FF2C2A9563001C8D37 /* ggml-common.h */,
18A275FE2C2A94DE001C8D37 /* ggml-metal.h */, 18A275FE2C2A94DE001C8D37 /* ggml-metal.h */,
18ABE1562AF556340044A204 /* ggml-backend-impl.h */, 18ABE1562AF556340044A204 /* ggml-backend-impl.h */,
18ABE1572AF556340044A204 /* ggml-backend.cpp */, 18ABE1572AF556340044A204 /* ggml-backend.c */,
18ABE1552AF556340044A204 /* ggml-backend.h */, 18ABE1552AF556340044A204 /* ggml-backend.h */,
18ABE1582AF556340044A204 /* ggml-impl.h */, 18ABE1582AF556340044A204 /* ggml-impl.h */,
18ABE1592AF556340044A204 /* ggml-quants.c */, 18ABE1592AF556340044A204 /* ggml-quants.c */,
@ -269,22 +236,15 @@
files = ( files = (
18627C8129052BDF00BD2A04 /* ViewController.m in Sources */, 18627C8129052BDF00BD2A04 /* ViewController.m in Sources */,
18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */, 18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */,
18133C802C64E342005CEAAC /* ggml-aarch64.c in Sources */,
7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */, 7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */,
18627C9429052C4900BD2A04 /* whisper.cpp in Sources */, 18627C9429052C4900BD2A04 /* whisper.cpp in Sources */,
18627C9629052C5800BD2A04 /* ggml.c in Sources */, 18627C9629052C5800BD2A04 /* ggml.c in Sources */,
18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */, 18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */,
7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */, 7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */,
18F8C0C72CEDF7AB00CAD607 /* ggml-backend-reg.cpp in Sources */,
18F8C0BE2CEDF50700CAD607 /* ggml-cpu.cpp in Sources */,
1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */, 1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */,
18F8C0C42CEDF52700CAD607 /* ggml-cpu-aarch64.c in Sources */, 18ABE15A2AF556340044A204 /* ggml-backend.c in Sources */,
18F8C0C52CEDF52700CAD607 /* ggml-cpu-quants.c in Sources */,
18E864A92CE73C1E0094B8B3 /* ggml-cpu.c in Sources */,
18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */,
18627C8C29052BE000BD2A04 /* main.m in Sources */, 18627C8C29052BE000BD2A04 /* main.m in Sources */,
18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */, 18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */,
18F8C0BC2CEDF4DC00CAD607 /* ggml-threading.cpp in Sources */,
1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */, 1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */,
7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */, 7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */,
); );
@ -363,8 +323,6 @@
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES; GCC_WARN_UNUSED_VARIABLE = YES;
GENERATE_INFOPLIST_FILE = YES;
HEADER_SEARCH_PATHS = ../../../ggml/src/;
IPHONEOS_DEPLOYMENT_TARGET = 16.0; IPHONEOS_DEPLOYMENT_TARGET = 16.0;
MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE; MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
MTL_FAST_MATH = YES; MTL_FAST_MATH = YES;
@ -418,8 +376,6 @@
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE; GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES; GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES; GCC_WARN_UNUSED_VARIABLE = YES;
GENERATE_INFOPLIST_FILE = YES;
HEADER_SEARCH_PATHS = ../../../ggml/src/;
IPHONEOS_DEPLOYMENT_TARGET = 16.0; IPHONEOS_DEPLOYMENT_TARGET = 16.0;
MTL_ENABLE_DEBUG_INFO = NO; MTL_ENABLE_DEBUG_INFO = NO;
MTL_FAST_MATH = YES; MTL_FAST_MATH = YES;
@ -432,6 +388,64 @@
}; };
name = Release; name = Release;
}; };
18627C9029052BE000BD2A04 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_TEAM = P8JZH34X63;
GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_FILE = whisper.objc/Info.plist;
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen;
INFOPLIST_KEY_UIMainStoryboardFile = Main;
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
LD_RUNPATH_SEARCH_PATHS = (
"$(inherited)",
"@executable_path/Frameworks",
);
MARKETING_VERSION = 1.0;
MTL_HEADER_SEARCH_PATHS = "";
PRODUCT_BUNDLE_IDENTIFIER = "com.ggerganov.whisper-objc";
PRODUCT_NAME = "$(TARGET_NAME)";
SWIFT_EMIT_LOC_STRINGS = YES;
TARGETED_DEVICE_FAMILY = "1,2";
};
name = Debug;
};
18627C9129052BE000BD2A04 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_TEAM = P8JZH34X63;
GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_FILE = whisper.objc/Info.plist;
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen;
INFOPLIST_KEY_UIMainStoryboardFile = Main;
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
LD_RUNPATH_SEARCH_PATHS = (
"$(inherited)",
"@executable_path/Frameworks",
);
MARKETING_VERSION = 1.0;
MTL_HEADER_SEARCH_PATHS = "";
PRODUCT_BUNDLE_IDENTIFIER = "com.ggerganov.whisper-objc";
PRODUCT_NAME = "$(TARGET_NAME)";
SWIFT_EMIT_LOC_STRINGS = YES;
TARGETED_DEVICE_FAMILY = "1,2";
};
name = Release;
};
/* End XCBuildConfiguration section */ /* End XCBuildConfiguration section */
/* Begin XCConfigurationList section */ /* Begin XCConfigurationList section */
@ -444,6 +458,15 @@
defaultConfigurationIsVisible = 0; defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release; defaultConfigurationName = Release;
}; };
18627C8F29052BE000BD2A04 /* Build configuration list for PBXNativeTarget "whisper.objc" */ = {
isa = XCConfigurationList;
buildConfigurations = (
18627C9029052BE000BD2A04 /* Debug */,
18627C9129052BE000BD2A04 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
/* End XCConfigurationList section */ /* End XCConfigurationList section */
}; };
rootObject = 18627C6E29052BDF00BD2A04 /* Project object */; rootObject = 18627C6E29052BDF00BD2A04 /* Project object */;

View File

@ -1,5 +1,4 @@
import Foundation import Foundation
import UIKit
import whisper import whisper
enum WhisperError: Error { enum WhisperError: Error {
@ -56,91 +55,11 @@ actor WhisperContext {
return transcription return transcription
} }
static func benchMemcpy(nThreads: Int32) async -> String {
return String.init(cString: whisper_bench_memcpy_str(nThreads))
}
static func benchGgmlMulMat(nThreads: Int32) async -> String {
return String.init(cString: whisper_bench_ggml_mul_mat_str(nThreads))
}
private func systemInfo() -> String {
var info = ""
if (ggml_cpu_has_neon() != 0) { info += "NEON " }
return String(info.dropLast())
}
func benchFull(modelName: String, nThreads: Int32) async -> String {
let nMels = whisper_model_n_mels(context)
if (whisper_set_mel(context, nil, 0, nMels) != 0) {
return "error: failed to set mel"
}
// heat encoder
if (whisper_encode(context, 0, nThreads) != 0) {
return "error: failed to encode"
}
var tokens = [whisper_token](repeating: 0, count: 512)
// prompt heat
if (whisper_decode(context, &tokens, 256, 0, nThreads) != 0) {
return "error: failed to decode"
}
// text-generation heat
if (whisper_decode(context, &tokens, 1, 256, nThreads) != 0) {
return "error: failed to decode"
}
whisper_reset_timings(context)
// actual run
if (whisper_encode(context, 0, nThreads) != 0) {
return "error: failed to encode"
}
// text-generation
for i in 0..<256 {
if (whisper_decode(context, &tokens, 1, Int32(i), nThreads) != 0) {
return "error: failed to decode"
}
}
// batched decoding
for _ in 0..<64 {
if (whisper_decode(context, &tokens, 5, 0, nThreads) != 0) {
return "error: failed to decode"
}
}
// prompt processing
for _ in 0..<16 {
if (whisper_decode(context, &tokens, 256, 0, nThreads) != 0) {
return "error: failed to decode"
}
}
whisper_print_timings(context)
let deviceModel = await UIDevice.current.model
let systemName = await UIDevice.current.systemName
let systemInfo = self.systemInfo()
let timings: whisper_timings = whisper_get_timings(context).pointee
let encodeMs = String(format: "%.2f", timings.encode_ms)
let decodeMs = String(format: "%.2f", timings.decode_ms)
let batchdMs = String(format: "%.2f", timings.batchd_ms)
let promptMs = String(format: "%.2f", timings.prompt_ms)
return "| \(deviceModel) | \(systemName) | \(systemInfo) | \(modelName) | \(nThreads) | 1 | \(encodeMs) | \(decodeMs) | \(batchdMs) | \(promptMs) | <todo> |"
}
static func createContext(path: String) throws -> WhisperContext { static func createContext(path: String) throws -> WhisperContext {
var params = whisper_context_default_params() var params = whisper_context_default_params()
#if targetEnvironment(simulator) #if targetEnvironment(simulator)
params.use_gpu = false params.use_gpu = false
print("Running on the simulator, using CPU") print("Running on the simulator, using CPU")
#else
params.flash_attn = true // Enabled by default for Metal
#endif #endif
let context = whisper_init_from_file_with_params(path, params) let context = whisper_init_from_file_with_params(path, params)
if let context { if let context {

View File

@ -1,17 +0,0 @@
import Foundation
struct Model: Identifiable {
var id = UUID()
var name: String
var info: String
var url: String
var filename: String
var fileURL: URL {
FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent(filename)
}
func fileExists() -> Bool {
FileManager.default.fileExists(atPath: fileURL.path)
}
}

View File

@ -14,7 +14,7 @@ class WhisperState: NSObject, ObservableObject, AVAudioRecorderDelegate {
private var recordedFile: URL? = nil private var recordedFile: URL? = nil
private var audioPlayer: AVAudioPlayer? private var audioPlayer: AVAudioPlayer?
private var builtInModelUrl: URL? { private var modelUrl: URL? {
Bundle.main.url(forResource: "ggml-base.en", withExtension: "bin", subdirectory: "models") Bundle.main.url(forResource: "ggml-base.en", withExtension: "bin", subdirectory: "models")
} }
@ -28,59 +28,23 @@ class WhisperState: NSObject, ObservableObject, AVAudioRecorderDelegate {
override init() { override init() {
super.init() super.init()
loadModel()
}
func loadModel(path: URL? = nil, log: Bool = true) {
do { do {
whisperContext = nil try loadModel()
if (log) { messageLog += "Loading model...\n" }
let modelUrl = path ?? builtInModelUrl
if let modelUrl {
whisperContext = try WhisperContext.createContext(path: modelUrl.path())
if (log) { messageLog += "Loaded model \(modelUrl.lastPathComponent)\n" }
} else {
if (log) { messageLog += "Could not locate model\n" }
}
canTranscribe = true canTranscribe = true
} catch { } catch {
print(error.localizedDescription) print(error.localizedDescription)
if (log) { messageLog += "\(error.localizedDescription)\n" } messageLog += "\(error.localizedDescription)\n"
} }
} }
func benchCurrentModel() async { private func loadModel() throws {
if whisperContext == nil { messageLog += "Loading model...\n"
messageLog += "Cannot bench without loaded model\n" if let modelUrl {
return whisperContext = try WhisperContext.createContext(path: modelUrl.path())
messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
} else {
messageLog += "Could not locate model\n"
} }
messageLog += "Running benchmark for loaded model\n"
let result = await whisperContext?.benchFull(modelName: "<current>", nThreads: Int32(min(4, cpuCount())))
if (result != nil) { messageLog += result! + "\n" }
}
func bench(models: [Model]) async {
let nThreads = Int32(min(4, cpuCount()))
// messageLog += "Running memcpy benchmark\n"
// messageLog += await WhisperContext.benchMemcpy(nThreads: nThreads) + "\n"
//
// messageLog += "Running ggml_mul_mat benchmark with \(nThreads) threads\n"
// messageLog += await WhisperContext.benchGgmlMulMat(nThreads: nThreads) + "\n"
messageLog += "Running benchmark for all downloaded models\n"
messageLog += "| CPU | OS | Config | Model | Th | FA | Enc. | Dec. | Bch5 | PP | Commit |\n"
messageLog += "| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |\n"
for model in models {
loadModel(path: model.fileURL, log: false)
if whisperContext == nil {
messageLog += "Cannot bench without loaded model\n"
break
}
let result = await whisperContext?.benchFull(modelName: model.name, nThreads: nThreads)
if (result != nil) { messageLog += result! + "\n" }
}
messageLog += "Benchmarking completed\n"
} }
func transcribeSample() async { func transcribeSample() async {
@ -196,8 +160,3 @@ class WhisperState: NSObject, ObservableObject, AVAudioRecorderDelegate {
isRecording = false isRecording = false
} }
} }
fileprivate func cpuCount() -> Int {
ProcessInfo.processInfo.processorCount
}

View File

@ -1,6 +1,5 @@
import SwiftUI import SwiftUI
import AVFoundation import AVFoundation
import Foundation
struct ContentView: View { struct ContentView: View {
@StateObject var whisperState = WhisperState() @StateObject var whisperState = WhisperState()
@ -30,125 +29,15 @@ struct ContentView: View {
Text(verbatim: whisperState.messageLog) Text(verbatim: whisperState.messageLog)
.frame(maxWidth: .infinity, alignment: .leading) .frame(maxWidth: .infinity, alignment: .leading)
} }
.font(.footnote)
.padding()
.background(Color.gray.opacity(0.1))
.cornerRadius(10)
HStack {
Button("Clear Logs", action: {
whisperState.messageLog = ""
})
.font(.footnote)
.buttonStyle(.bordered)
Button("Copy Logs", action: {
UIPasteboard.general.string = whisperState.messageLog
})
.font(.footnote)
.buttonStyle(.bordered)
Button("Bench", action: {
Task {
await whisperState.benchCurrentModel()
}
})
.font(.footnote)
.buttonStyle(.bordered)
.disabled(!whisperState.canTranscribe)
Button("Bench All", action: {
Task {
await whisperState.bench(models: ModelsView.getDownloadedModels())
}
})
.font(.footnote)
.buttonStyle(.bordered)
.disabled(!whisperState.canTranscribe)
}
NavigationLink(destination: ModelsView(whisperState: whisperState)) {
Text("View Models")
}
.font(.footnote)
.padding()
} }
.navigationTitle("Whisper SwiftUI Demo") .navigationTitle("Whisper SwiftUI Demo")
.padding() .padding()
} }
} }
struct ModelsView: View {
@ObservedObject var whisperState: WhisperState
@Environment(\.dismiss) var dismiss
private static let models: [Model] = [
Model(name: "tiny", info: "(F16, 75 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin", filename: "tiny.bin"),
Model(name: "tiny-q5_1", info: "(31 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny-q5_1.bin", filename: "tiny-q5_1.bin"),
Model(name: "tiny-q8_0", info: "(42 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny-q8_0.bin", filename: "tiny-q8_0.bin"),
Model(name: "tiny.en", info: "(F16, 75 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en.bin", filename: "tiny.en.bin"),
Model(name: "tiny.en-q5_1", info: "(31 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en-q5_1.bin", filename: "tiny.en-q5_1.bin"),
Model(name: "tiny.en-q8_0", info: "(42 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en-q8_0.bin", filename: "tiny.en-q8_0.bin"),
Model(name: "base", info: "(F16, 142 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin", filename: "base.bin"),
Model(name: "base-q5_1", info: "(57 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base-q5_1.bin", filename: "base-q5_1.bin"),
Model(name: "base-q8_0", info: "(78 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base-q8_0.bin", filename: "base-q8_0.bin"),
Model(name: "base.en", info: "(F16, 142 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin", filename: "base.en.bin"),
Model(name: "base.en-q5_1", info: "(57 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en-q5_1.bin", filename: "base.en-q5_1.bin"),
Model(name: "base.en-q8_0", info: "(78 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en-q8_0.bin", filename: "base.en-q8_0.bin"),
Model(name: "small", info: "(F16, 466 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin", filename: "small.bin"),
Model(name: "small-q5_1", info: "(181 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small-q5_1.bin", filename: "small-q5_1.bin"),
Model(name: "small-q8_0", info: "(252 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small-q8_0.bin", filename: "small-q8_0.bin"),
Model(name: "small.en", info: "(F16, 466 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en.bin", filename: "small.en.bin"),
Model(name: "small.en-q5_1", info: "(181 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en-q5_1.bin", filename: "small.en-q5_1.bin"),
Model(name: "small.en-q8_0", info: "(252 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en-q8_0.bin", filename: "small.en-q8_0.bin"),
Model(name: "medium", info: "(F16, 1.5 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.bin", filename: "medium.bin"),
Model(name: "medium-q5_0", info: "(514 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium-q5_0.bin", filename: "medium-q5_0.bin"),
Model(name: "medium-q8_0", info: "(785 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium-q8_0.bin", filename: "medium-q8_0.bin"),
Model(name: "medium.en", info: "(F16, 1.5 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.en.bin", filename: "medium.en.bin"),
Model(name: "medium.en-q5_0", info: "(514 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.en-q5_0.bin", filename: "medium.en-q5_0.bin"),
Model(name: "medium.en-q8_0", info: "(785 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.en-q8_0.bin", filename: "medium.en-q8_0.bin"),
Model(name: "large-v1", info: "(F16, 2.9 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large.bin", filename: "large.bin"),
Model(name: "large-v2", info: "(F16, 2.9 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v2.bin", filename: "large-v2.bin"),
Model(name: "large-v2-q5_0", info: "(1.1 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v2-q5_0.bin", filename: "large-v2-q5_0.bin"),
Model(name: "large-v2-q8_0", info: "(1.5 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v2-q8_0.bin", filename: "large-v2-q8_0.bin"),
Model(name: "large-v3", info: "(F16, 2.9 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3.bin", filename: "large-v3.bin"),
Model(name: "large-v3-q5_0", info: "(1.1 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-q5_0.bin", filename: "large-v3-q5_0.bin"),
Model(name: "large-v3-turbo", info: "(F16, 1.5 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo.bin", filename: "large-v3-turbo.bin"),
Model(name: "large-v3-turbo-q5_0", info: "(547 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo-q5_0.bin", filename: "large-v3-turbo-q5_0.bin"),
Model(name: "large-v3-turbo-q8_0", info: "(834 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo-q8_0.bin", filename: "large-v3-turbo-q8_0.bin"),
]
static func getDownloadedModels() -> [Model] {
// Filter models that have been downloaded
return models.filter {
FileManager.default.fileExists(atPath: $0.fileURL.path())
}
}
func loadModel(model: Model) {
Task {
dismiss()
whisperState.loadModel(path: model.fileURL)
}
}
var body: some View {
List {
Section(header: Text("Models")) {
ForEach(ModelsView.models) { model in
DownloadButton(model: model)
.onLoad(perform: loadModel)
}
}
}
.listStyle(GroupedListStyle())
.navigationBarTitle("Models", displayMode: .inline).toolbar {}
}
}
} }
//struct ContentView_Previews: PreviewProvider { struct ContentView_Previews: PreviewProvider {
// static var previews: some View { static var previews: some View {
// ContentView() ContentView()
// } }
//} }

View File

@ -1,102 +0,0 @@
import SwiftUI
struct DownloadButton: View {
private var model: Model
@State private var status: String
@State private var downloadTask: URLSessionDownloadTask?
@State private var progress = 0.0
@State private var observation: NSKeyValueObservation?
private var onLoad: ((_ model: Model) -> Void)?
init(model: Model) {
self.model = model
status = model.fileExists() ? "downloaded" : "download"
}
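// SwiftUI-style modifier: returns a copy of the button with the load callback attached.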
func onLoad(perform action: @escaping (_ model: Model) -> Void) -> DownloadButton {
var button = self
button.onLoad = action
return button
}
private func download() {
status = "downloading"
print("Downloading model \(model.name) from \(model.url)")
guard let url = URL(string: model.url) else { return }
downloadTask = URLSession.shared.downloadTask(with: url) { temporaryURL, response, error in
if let error = error {
print("Error: \(error.localizedDescription)")
return
}
guard let response = response as? HTTPURLResponse, (200...299).contains(response.statusCode) else {
print("Server error!")
return
}
do {
if let temporaryURL = temporaryURL {
try FileManager.default.copyItem(at: temporaryURL, to: model.fileURL)
print("Writing to \(model.filename) completed")
status = "downloaded"
}
} catch let err {
print("Error: \(err.localizedDescription)")
}
}
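// Track download progress via KVO on the task's Progress object to drive the button label.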
observation = downloadTask?.progress.observe(\.fractionCompleted) { progress, _ in
self.progress = progress.fractionCompleted
}
downloadTask?.resume()
}
var body: some View {
VStack {
Button(action: {
if (status == "download") {
download()
} else if (status == "downloading") {
downloadTask?.cancel()
status = "download"
} else if (status == "downloaded") {
if !model.fileExists() {
download()
}
onLoad?(model)
}
}) {
let title = "\(model.name) \(model.info)"
if (status == "download") {
Text("Download \(title)")
} else if (status == "downloading") {
Text("\(title) (Downloading \(Int(progress * 100))%)")
} else if (status == "downloaded") {
Text("Load \(title)")
} else {
Text("Unknown status")
}
}.swipeActions {
if (status == "downloaded") {
Button("Delete") {
do {
try FileManager.default.removeItem(at: model.fileURL)
} catch {
print("Error deleting file: \(error)")
}
status = "download"
}
.tint(.red)
}
}
}
.onDisappear() {
downloadTask?.cancel()
}
}
}

View File

@ -17,8 +17,6 @@
0AAC5D9F29539CD0003032C3 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 0AAC5D9E29539CD0003032C3 /* Assets.xcassets */; }; 0AAC5D9F29539CD0003032C3 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 0AAC5D9E29539CD0003032C3 /* Assets.xcassets */; };
0AAC5DCE2953A05C003032C3 /* WhisperState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DCD2953A05C003032C3 /* WhisperState.swift */; }; 0AAC5DCE2953A05C003032C3 /* WhisperState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DCD2953A05C003032C3 /* WhisperState.swift */; };
0AAC5DD12953A394003032C3 /* LibWhisper.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DD02953A394003032C3 /* LibWhisper.swift */; }; 0AAC5DD12953A394003032C3 /* LibWhisper.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DD02953A394003032C3 /* LibWhisper.swift */; };
7F79E0EE2CE0A78000ACD7BF /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7F79E0ED2CE0A78000ACD7BF /* DownloadButton.swift */; };
7F79E0F02CE0C6F700ACD7BF /* Model.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7F79E0EF2CE0C6F700ACD7BF /* Model.swift */; };
E3F92DC52AFA8E3800A6A9D4 /* whisper in Frameworks */ = {isa = PBXBuildFile; productRef = E3F92DC42AFA8E3800A6A9D4 /* whisper */; }; E3F92DC52AFA8E3800A6A9D4 /* whisper in Frameworks */ = {isa = PBXBuildFile; productRef = E3F92DC42AFA8E3800A6A9D4 /* whisper */; };
/* End PBXBuildFile section */ /* End PBXBuildFile section */
@ -35,8 +33,6 @@
0AAC5DA029539CD0003032C3 /* WhisperCppDemo.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = WhisperCppDemo.entitlements; sourceTree = "<group>"; }; 0AAC5DA029539CD0003032C3 /* WhisperCppDemo.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = WhisperCppDemo.entitlements; sourceTree = "<group>"; };
0AAC5DCD2953A05C003032C3 /* WhisperState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WhisperState.swift; sourceTree = "<group>"; }; 0AAC5DCD2953A05C003032C3 /* WhisperState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WhisperState.swift; sourceTree = "<group>"; };
0AAC5DD02953A394003032C3 /* LibWhisper.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibWhisper.swift; sourceTree = "<group>"; }; 0AAC5DD02953A394003032C3 /* LibWhisper.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibWhisper.swift; sourceTree = "<group>"; };
7F79E0ED2CE0A78000ACD7BF /* DownloadButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DownloadButton.swift; sourceTree = "<group>"; };
7F79E0EF2CE0C6F700ACD7BF /* Model.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Model.swift; sourceTree = "<group>"; };
E3F92DC22AFA8DD800A6A9D4 /* whisper.cpp */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = whisper.cpp; path = ../..; sourceTree = "<group>"; }; E3F92DC22AFA8DD800A6A9D4 /* whisper.cpp */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = whisper.cpp; path = ../..; sourceTree = "<group>"; };
/* End PBXFileReference section */ /* End PBXFileReference section */
@ -56,7 +52,6 @@
isa = PBXGroup; isa = PBXGroup;
children = ( children = (
0AAC5DCD2953A05C003032C3 /* WhisperState.swift */, 0AAC5DCD2953A05C003032C3 /* WhisperState.swift */,
7F79E0EF2CE0C6F700ACD7BF /* Model.swift */,
); );
path = Models; path = Models;
sourceTree = "<group>"; sourceTree = "<group>";
@ -124,7 +119,6 @@
isa = PBXGroup; isa = PBXGroup;
children = ( children = (
0AAC5D9C29539CCF003032C3 /* ContentView.swift */, 0AAC5D9C29539CCF003032C3 /* ContentView.swift */,
7F79E0ED2CE0A78000ACD7BF /* DownloadButton.swift */,
); );
path = UI; path = UI;
sourceTree = "<group>"; sourceTree = "<group>";
@ -226,9 +220,7 @@
0AAC5DCE2953A05C003032C3 /* WhisperState.swift in Sources */, 0AAC5DCE2953A05C003032C3 /* WhisperState.swift in Sources */,
0AAC5DD12953A394003032C3 /* LibWhisper.swift in Sources */, 0AAC5DD12953A394003032C3 /* LibWhisper.swift in Sources */,
0AA7514C2953B569001EE061 /* RiffWaveUtils.swift in Sources */, 0AA7514C2953B569001EE061 /* RiffWaveUtils.swift in Sources */,
7F79E0EE2CE0A78000ACD7BF /* DownloadButton.swift in Sources */,
0AA7514E2953D958001EE061 /* Recorder.swift in Sources */, 0AA7514E2953D958001EE061 /* Recorder.swift in Sources */,
7F79E0F02CE0C6F700ACD7BF /* Model.swift in Sources */,
); );
runOnlyForDeploymentPostprocessing = 0; runOnlyForDeploymentPostprocessing = 0;
}; };
@ -378,9 +370,7 @@
PRODUCT_BUNDLE_IDENTIFIER = com.whispercppdemo.WhisperCppDemo; PRODUCT_BUNDLE_IDENTIFIER = com.whispercppdemo.WhisperCppDemo;
PRODUCT_NAME = "$(TARGET_NAME)"; PRODUCT_NAME = "$(TARGET_NAME)";
SDKROOT = auto; SDKROOT = auto;
SUPPORTED_PLATFORMS = "iphoneos iphonesimulator"; SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx";
SUPPORTS_MACCATALYST = NO;
SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = YES;
SWIFT_EMIT_LOC_STRINGS = YES; SWIFT_EMIT_LOC_STRINGS = YES;
SWIFT_OPTIMIZATION_LEVEL = "-Onone"; SWIFT_OPTIMIZATION_LEVEL = "-Onone";
SWIFT_VERSION = 5.0; SWIFT_VERSION = 5.0;
@ -425,9 +415,7 @@
PRODUCT_BUNDLE_IDENTIFIER = com.whispercppdemo.WhisperCppDemo; PRODUCT_BUNDLE_IDENTIFIER = com.whispercppdemo.WhisperCppDemo;
PRODUCT_NAME = "$(TARGET_NAME)"; PRODUCT_NAME = "$(TARGET_NAME)";
SDKROOT = auto; SDKROOT = auto;
SUPPORTED_PLATFORMS = "iphoneos iphonesimulator"; SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx";
SUPPORTS_MACCATALYST = NO;
SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = YES;
SWIFT_EMIT_LOC_STRINGS = YES; SWIFT_EMIT_LOC_STRINGS = YES;
SWIFT_VERSION = 5.0; SWIFT_VERSION = 5.0;
TARGETED_DEVICE_FAMILY = "1,2"; TARGETED_DEVICE_FAMILY = "1,2";

View File

@ -50,24 +50,9 @@ else()
set(GGML_BLAS_VENDOR_DEFAULT "Generic") set(GGML_BLAS_VENDOR_DEFAULT "Generic")
endif() endif()
if (CMAKE_CROSSCOMPILING)
set(GGML_NATIVE_DEFAULT OFF)
else()
set(GGML_NATIVE_DEFAULT ON)
endif()
# defaults
if (NOT GGML_LLAMAFILE_DEFAULT)
set(GGML_LLAMAFILE_DEFAULT OFF)
endif()
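# CUDA graphs stay off unless the embedding project sets GGML_CUDA_GRAPHS_DEFAULT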
if (NOT GGML_CUDA_GRAPHS_DEFAULT)
set(GGML_CUDA_GRAPHS_DEFAULT OFF)
endif()
# general # general
option(GGML_STATIC "ggml: static link libraries" OFF) option(GGML_STATIC "ggml: static link libraries" OFF)
option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT}) option(GGML_NATIVE "ggml: enable -march=native flag" ON)
option(GGML_LTO "ggml: enable link time optimization" OFF) option(GGML_LTO "ggml: enable link time optimization" OFF)
option(GGML_CCACHE "ggml: use ccache if available" ON) option(GGML_CCACHE "ggml: use ccache if available" ON)
@ -85,14 +70,13 @@ option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF)
option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF) option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
# instruction set specific # instruction set specific
if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT) if (GGML_NATIVE)
set(INS_ENB OFF) set(INS_ENB OFF)
else() else()
set(INS_ENB ON) set(INS_ENB ON)
endif() endif()
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF) option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
option(GGML_AVX "ggml: enable AVX" ${INS_ENB}) option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB}) option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
@ -100,9 +84,6 @@ option(GGML_AVX512 "ggml: enable AVX512" OFF)
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF) option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF) option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF) option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
option(GGML_FMA "ggml: enable FMA" ${INS_ENB}) option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
if (NOT MSVC) if (NOT MSVC)
option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512 option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
@ -117,40 +98,39 @@ endif()
# ggml core # ggml core
set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism") set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
option(GGML_CPU "ggml: enable CPU backend" ON)
# 3rd party libs / backends # 3rd party libs / backends
option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON) option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT}) option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
"ggml: BLAS library vendor") "ggml: BLAS library vendor")
option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT}) option(GGML_LLAMAFILE "ggml: use ggml SGEMM" OFF)
option(GGML_CUDA "ggml: use CUDA" OFF) option(GGML_CUDA "ggml: use CUDA" OFF)
option(GGML_MUSA "ggml: use MUSA" OFF) option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF) option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF) set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF) option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
"ggml: iters./thread per block for Q2_K/Q6_K")
set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"ggml: max. batch size for using peer access") "ggml: max. batch size for using peer access")
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF) option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF) option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF) option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
option(GGML_HIP "ggml: use HIP" OFF) option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF)
option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF) option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
option(GGML_VULKAN "ggml: use Vulkan" OFF) option(GGML_VULKAN "ggml: use Vulkan" OFF)
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF) option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF) option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF) option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" OFF)
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF) option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
option(GGML_KOMPUTE "ggml: use Kompute" OFF) option(GGML_KOMPUTE "ggml: use Kompute" OFF)
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT}) option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF) option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF) option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL}) option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
@ -159,13 +139,10 @@ set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)") set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
option(GGML_OPENMP "ggml: use OpenMP" ON) option(GGML_OPENMP "ggml: use OpenMP" ON)
option(GGML_RPC "ggml: use RPC" OFF) option(GGML_RPC "ggml: use RPC" OFF)
option(GGML_AMX "ggml: use AMX" OFF)
option(GGML_SYCL "ggml: use SYCL" OFF) option(GGML_SYCL "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF) option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING set (GGML_SYCL_TARGET "INTEL" CACHE STRING
"ggml: sycl target device") "ggml: sycl target device")
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
"ggml: sycl device architecture")
# extra artifacts # extra artifacts
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE}) option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
@ -215,34 +192,27 @@ endif ()
include(GNUInstallDirs) include(GNUInstallDirs)
include(CMakePackageConfigHelpers) include(CMakePackageConfigHelpers)
# all public headers
set(GGML_PUBLIC_HEADERS set(GGML_PUBLIC_HEADERS
include/ggml.h include/ggml.h
include/ggml-cpu.h
include/ggml-alloc.h include/ggml-alloc.h
include/ggml-backend.h include/ggml-backend.h
include/ggml-blas.h "${GGML_HEADERS_CUDA}"
include/ggml-cann.h "${GGML_HEADERS_METAL}"
include/ggml-cuda.h "${GGML_HEADERS_EXTRA}")
include/ggml-kompute.h
include/ggml-opt.h
include/ggml-metal.h
include/ggml-rpc.h
include/ggml-sycl.h
include/ggml-vulkan.h)
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
#if (GGML_METAL) #if (GGML_METAL)
# set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal") # set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
#endif() #endif()
install(TARGETS ggml LIBRARY PUBLIC_HEADER) install(TARGETS ggml PUBLIC_HEADER)
install(TARGETS ggml-base LIBRARY)
if (BUILD_SHARED_LIBS)
install(TARGETS ggml LIBRARY)
endif()
# FIXME: this should be done in the backend cmake files
if (GGML_METAL) if (GGML_METAL)
# FIXME: does this need to be installed with GGML_METAL_EMBED_LIBRARY?
install( install(
FILES src/ggml-metal/ggml-metal.metal FILES src/ggml-metal.metal
PERMISSIONS PERMISSIONS
OWNER_READ OWNER_READ
OWNER_WRITE OWNER_WRITE

View File

@ -0,0 +1,220 @@
#!/usr/bin/env python
import logging
import argparse
import asyncio
import os
from tempfile import gettempdir
logger = logging.getLogger("ggml-vk-generate-shaders")
GLSLC = "glslc"
type_names = [
"f32",
"f16",
"q4_0",
"q4_1",
"q5_0",
"q5_1",
"q8_0",
"q2_k",
"q3_k",
"q4_k",
"q5_k",
"q6_k",
]
ASYNCIO_CONCURRENCY = 64
input_dir = "vulkan-shaders"
output_dir = gettempdir()
lock = asyncio.Lock()
shader_fnames = []
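# Compile one GLSL compute shader to SPIR-V with glslc, passing `defines` as -D macros;
# successful outputs are appended to shader_fnames under the async lock.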
async def string_to_spv(name, in_fname, defines, fp16=True):
name = f"{name}{'_fp32' if not fp16 else ''}"
out_fname = os.path.join(output_dir, f"{name}.spv")
in_path = os.path.join(input_dir, in_fname)
cmd = [GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname]
cmd.extend([f"-D{key}={value}" for key, value in defines.items()])
proc = await asyncio.create_subprocess_exec(*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
stdout, stderr = await proc.communicate()
stdout = stdout.decode()
error = stderr.decode()
if proc.returncode:
cmd = " ".join(cmd)
logger.error(f"cannot compile {name}\n\n{cmd}\n\n{error}")
return
async with lock:
shader_fnames.append((name, out_fname))
def matmul_shaders(tasks, fp16, matmul_id):
if fp16:
load_vec = "8"
aligned_b_type_f32 = "mat2x4"
aligned_b_type_f16 = "f16mat2x4"
else:
load_vec = "4"
aligned_b_type_f32 = "vec4"
aligned_b_type_f16 = "f16vec4"
base_dict = {"FLOAT_TYPE": "float" if not fp16 else "float16_t"}
shader_name = "matmul"
if matmul_id:
base_dict["MUL_MAT_ID"] = "1"
shader_name = "matmul_id"
if fp16:
base_dict["FLOAT16"] = "1"
# Shaders with f16 B_TYPE
tasks.append(string_to_spv(f"{shader_name}_f32_f16", "mul_mm.comp", base_dict | {"DATA_A_F32": "1", "B_TYPE": "float16_t", "D_TYPE": "float"}, fp16))
tasks.append(string_to_spv(f"{shader_name}_f32_f16_aligned", "mul_mm.comp", base_dict | {"DATA_A_F32": "1", "LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "B_TYPE": aligned_b_type_f16, "D_TYPE": "float"}, fp16))
tasks.append(string_to_spv(f"{shader_name}_f16", "mul_mm.comp", base_dict | {"DATA_A_F16": "1", "B_TYPE": "float16_t", "D_TYPE": "float"}, fp16))
tasks.append(string_to_spv(f"{shader_name}_f16_aligned", "mul_mm.comp", base_dict | {"DATA_A_F16": "1", "LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "B_TYPE": aligned_b_type_f16, "D_TYPE": "float"}, fp16))
for tname in type_names:
data_a_key = f"DATA_A_{tname.upper()}"
load_vec_a = load_vec if tname in ("f32", "f16") else "2"
tasks.append(string_to_spv(f"{shader_name}_{tname}_f32", "mul_mm.comp", base_dict | {data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
tasks.append(string_to_spv(f"{shader_name}_{tname}_f32_aligned", "mul_mm.comp", base_dict | {data_a_key: "2", "LOAD_VEC_A": load_vec_a, "LOAD_VEC_B": load_vec, "B_TYPE": aligned_b_type_f32, "D_TYPE": "float"}, fp16))
async def main():
logger.info("ggml_vulkan: Generating and compiling shaders to SPIR-V")
tasks = []
for fp16 in (False, True):
# MUL_MAT
matmul_shaders(tasks, fp16, False)
# MUL_MAT_ID
matmul_shaders(tasks, fp16, True)
for tname in type_names:
base_dict = {"FLOAT_TYPE": "float"}
# mul mat vec
data_a_key = f"DATA_A_{tname.upper()}"
shader = f"mul_mat_vec_{tname}.comp" if tname.endswith("_k") else "mul_mat_vec.comp"
tasks.append(string_to_spv(f"mul_mat_vec_{tname}_f32_f32", shader, base_dict | {data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv(f"mul_mat_vec_{tname}_f16_f32", shader, base_dict | {data_a_key: "1", "B_TYPE": "float16_t", "D_TYPE": "float"}))
tasks.append(string_to_spv(f"mul_mat_vec_id_{tname}_f32", shader, base_dict | {"MUL_MAT_ID": "1", data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"}))
# Dequant shaders
if tname != "f16":
tasks.append(string_to_spv(f"dequant_{tname}", f"dequant_{tname}.comp", base_dict | {data_a_key: "1", "D_TYPE": "float16_t"}))
# get_rows
if not tname.endswith("_k"):
shader = "get_rows.comp" if tname in ("f32", "f16") else "get_rows_quant.comp"
if tname == "f16":
tasks.append(string_to_spv(f"get_rows_{tname}", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float16_t", "OPTIMIZATION_ERROR_WORKAROUND": "1"}))
else:
tasks.append(string_to_spv(f"get_rows_{tname}", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv(f"get_rows_{tname}_f32", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float"}))
tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
# Norms
tasks.append(string_to_spv("norm_f32", "norm.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rms_norm_f32", "rms_norm.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("cpy_f32_f32", "copy.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("cpy_f32_f16", "copy.comp", {"A_TYPE": "float", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv("cpy_f16_f16", "copy.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t", "OPTIMIZATION_ERROR_WORKAROUND": "1"}))
tasks.append(string_to_spv("add_f32", "add.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {}))
tasks.append(string_to_spv("mul_f32", "mul.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("div_f32", "div.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("scale_f32", "scale.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("sqr_f32", "square.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("clamp_f32", "clamp.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("gelu_f32", "gelu.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("silu_f32", "silu.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("relu_f32", "relu.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("soft_max_f32", "soft_max.comp", base_dict | {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("soft_max_f32_f16", "soft_max.comp", base_dict | {"A_TYPE": "float", "B_TYPE": "float16_t", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_norm_f32", "rope_norm.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_norm_f16", "rope_norm.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv("rope_neox_f32", "rope_neox.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_neox_f16", "rope_neox.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv("argsort_f32", "argsort.comp", {"A_TYPE": "float"}))
tasks.append(string_to_spv("sum_rows_f32", "sum_rows.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"}))
# Helper to decorate tasks with semaphore acquisition.
async def withSemaphore(sem, task):
async with sem:
return await task
# Run tasks concurrently guarded by a concurrency limit.
sem = asyncio.Semaphore(ASYNCIO_CONCURRENCY)
await asyncio.gather(*(withSemaphore(sem, task) for task in tasks))
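# Embed every compiled .spv blob into a single C++ header as a byte array plus a length constant.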
with open("ggml-vulkan-shaders.hpp", "w") as f:
f.write("#include <cstdint>\n\n")
for name, path in sorted(shader_fnames):
with open(path, "rb") as spv:
counter = 0
newline_counter = 0
f.write(f"unsigned char {name}_data[] = {{\n")
for val in spv.read():
f.write(f"0x{val:02x},")
newline_counter += 1
counter += 1
if newline_counter >= 12:
newline_counter = 0
f.write("\n")
f.write("\n};\n")
f.write(f"const uint64_t {name}_len = {counter};\n\n")
os.remove(path)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="GGML Vulkan Shader Generator")
parser.add_argument("--glslc", help="Path to glslc")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
if args.glslc:
GLSLC = args.glslc
asyncio.run(main())
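The header this script writes embeds each compiled SPIR-V binary as an `unsigned char {name}_data[]` array plus a `const uint64_t {name}_len` constant. A minimal consumer sketch, assuming a valid `VkDevice` and using `add_f32` (one of the shaders generated above); the function name is illustrative:

#include <cstdint>
#include <vulkan/vulkan.h>
#include "ggml-vulkan-shaders.hpp" // the header emitted by this script

// Wrap one embedded SPIR-V blob in a VkShaderModule. `device` is assumed to
// be a valid, already-created VkDevice; add_f32 is one of the shaders
// generated above.
VkShaderModule load_add_f32(VkDevice device) {
    VkShaderModuleCreateInfo info = {};
    info.sType    = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
    info.codeSize = add_f32_len;                     // length is in bytes
    info.pCode    = (const uint32_t *) add_f32_data; // SPIR-V word stream
    VkShaderModule module = VK_NULL_HANDLE;
    if (vkCreateShaderModule(device, &info, nullptr, &module) != VK_SUCCESS) {
        return VK_NULL_HANDLE;
    }
    return module;
}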

View File

@ -7,8 +7,8 @@ extern "C" {
#endif #endif
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t; typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t; typedef struct ggml_backend * ggml_backend_t;
// Tensor allocator // Tensor allocator
struct ggml_tallocr { struct ggml_tallocr {
@ -24,7 +24,7 @@ GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, st
// Graph allocator // Graph allocator
/* /*
Example usage: Example usage:
ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
// optional: create a worst-case graph and reserve the buffers to avoid reallocations // optional: create a worst-case graph and reserve the buffers to avoid reallocations
ggml_gallocr_reserve(galloc, build_graph(max_batch)); ggml_gallocr_reserve(galloc, build_graph(max_batch));
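The usage comment cuts off after the reserve step; a sketch of the full lifecycle, using only functions declared in this header and ggml-backend.h (`build_graph` is the hypothetical user helper named in the comment):

#include "ggml-alloc.h"
#include "ggml-backend.h"

// Hypothetical user helper from the comment above: builds a compute graph
// for a given batch size.
struct ggml_cgraph * build_graph(int n_batch);

// Sketch of the full graph-allocator lifecycle.
void example(int max_batch, int cur_batch) {
    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

    // optional: reserve worst-case buffers once to avoid reallocations later
    ggml_gallocr_reserve(galloc, build_graph(max_batch));

    // per iteration: allocate the actual graph (cheap when the reservation fits)
    struct ggml_cgraph * graph = build_graph(cur_batch);
    ggml_gallocr_alloc_graph(galloc, graph);

    // ... evaluate the graph on a backend, read back results ...

    ggml_gallocr_free(galloc);
}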

View File

@ -1,25 +0,0 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
// buffer_type API
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend);
// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void);
GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void);
#ifdef __cplusplus
}
#endif
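For reference, a minimal bring-up sketch for the AMX backend API declared in this header; `ggml_backend_free` comes from ggml-backend.h (which this header includes), and the thread count is an arbitrary example value:

#include "ggml-amx.h"

// Minimal bring-up sketch for the AMX backend API declared above.
void try_amx(void) {
    ggml_backend_t backend = ggml_backend_amx_init();
    if (backend == nullptr) {
        return; // AMX backend unavailable on this build/CPU
    }
    if (ggml_backend_is_amx(backend)) {
        ggml_backend_amx_set_n_threads(backend, 4); // assumption: 4 worker threads
    }
    // ... dispatch supported ops through the backend ...
    ggml_backend_free(backend);
}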

Some files were not shown because too many files have changed in this diff.