mirror of https://github.com/ggerganov/whisper.cpp.git
synced 2025-08-18 00:30:34 +02:00
Compare commits: gg/disable...gg/reduce- (300 commits)
@@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all
 
 RUN apt-get update && \
-    apt-get install -y build-essential git cmake
+    apt-get install -y build-essential git cmake libsdl2-dev
 
 WORKDIR /app

@@ -17,7 +17,7 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 ENV GGML_CUDA=1
 
 RUN apt-get update && \
-    apt-get install -y build-essential \
+    apt-get install -y build-essential libsdl2-dev \
     && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
 
 # Ref: https://stackoverflow.com/a/53464012

@@ -12,7 +12,7 @@ FROM ubuntu:22.04 AS runtime
 WORKDIR /app
 
 RUN apt-get update && \
-    apt-get install -y curl ffmpeg \
+    apt-get install -y curl ffmpeg libsdl2-dev \
    && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
 
 COPY --from=build /app /app
.github/workflows/bindings-go.yml (vendored, 6 changed lines)

@@ -13,10 +13,10 @@ jobs:
   ubuntu-latest:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/setup-go@v3
+      - uses: actions/setup-go@v5
         with:
-          go-version: '^1.19'
+          go-version: '^1.23'
-      - uses: actions/checkout@v1
+      - uses: actions/checkout@v4
       - run: |
           cd bindings/go
           make test
.github/workflows/bindings-ruby.yml (vendored, 65 changed lines)

@@ -3,20 +3,73 @@ on:
   push:
     paths:
       - bindings/ruby/**
-      - whisper.h
+      - src/whisper.cpp
+      - include/whisper.h
+      - ggml/src/ggml.c
+      - ggml/src/ggml-impl.h
+      - ggml/src/ggml-aarch64.h
+      - ggml/src/ggml-aarch64.c
+      - ggml/src/ggml-alloc.c
+      - ggml/src/ggml-backend-impl.h
+      - ggml/src/ggml-backend.cpp
+      - ggml/src/ggml-common.h
+      - ggml/src/ggml-quants.h
+      - ggml/src/ggml-quants.c
+      - ggml/src/ggml-cpu-impl.h
+      - ggml/src/ggml-metal.m
+      - ggml/src/ggml-metal.metal
+      - ggml/src/ggml-blas.cpp
+      - ggml/include/ggml.h
+      - ggml/include/ggml-alloc.h
+      - ggml/include/ggml-backend.h
+      - ggml/include/ggml-cuda.h
+      - ggml/include/ggml-kompute.h
+      - ggml/include/ggml-metal.h
+      - ggml/include/ggml-sycl.h
+      - ggml/include/ggml-vulkan.h
+      - ggml/include/ggml-blas.h
+      - scripts/get-flags.mk
+      - examples/dr_wav.h
   pull_request:
     paths:
       - bindings/ruby/**
-      - whisper.h
+      - src/whisper.cpp
+      - include/whisper.h
+      - ggml/src/ggml.c
+      - ggml/src/ggml-impl.h
+      - ggml/src/ggml-aarch64.h
+      - ggml/src/ggml-aarch64.c
+      - ggml/src/ggml-alloc.c
+      - ggml/src/ggml-backend-impl.h
+      - ggml/src/ggml-backend.cpp
+      - ggml/src/ggml-common.h
+      - ggml/src/ggml-quants.h
+      - ggml/src/ggml-quants.c
+      - ggml/src/ggml-cpu-impl.h
+      - ggml/src/ggml-metal.m
+      - ggml/src/ggml-metal.metal
+      - ggml/src/ggml-blas.cpp
+      - ggml/include/ggml.h
+      - ggml/include/ggml-alloc.h
+      - ggml/include/ggml-backend.h
+      - ggml/include/ggml-cuda.h
+      - ggml/include/ggml-kompute.h
+      - ggml/include/ggml-metal.h
+      - ggml/include/ggml-sycl.h
+      - ggml/include/ggml-vulkan.h
+      - ggml/include/ggml-blas.h
+      - scripts/get-flags.mk
+      - examples/dr_wav.h
 
 jobs:
   ubuntu-latest:
     runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: bindings/ruby
     steps:
       - uses: ruby/setup-ruby@v1
         with:
           ruby-version: '3.0'
-      - uses: actions/checkout@v1
+      - uses: actions/checkout@v4
-      - run: |
-          cd bindings/ruby/ext
-          ruby extconf.rb && make
+      - run: rake test
.github/workflows/build.yml (vendored, 136 changed lines)

@@ -59,7 +59,7 @@ jobs:
         uses: cross-platform-actions/action@v0.24.0
         with:
           operating_system: freebsd
-          version: '13.2'
+          version: '13.3'
         run: |
           sudo pkg update
           sudo pkg install -y gmake sdl2

@@ -586,73 +586,75 @@ jobs:
       cd whisper/examples/whisper.android
       ./gradlew assembleRelease --no-daemon -PGGML_HOME=$PATH_TO_GGML
 
-  android_java:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v4
-
-      - name: set up JDK 11
-        uses: actions/setup-java@v4
-        with:
-          java-version: '11'
-          distribution: 'temurin'
-          cache: gradle
-
-      - name: Setup Android SDK
-        uses: android-actions/setup-android@v3
-        with:
-          cmdline-tools-version: 9.0
-
-      - name: Build
-        run: |
-          cd examples/whisper.android.java
-          chmod +x ./gradlew
-          ./gradlew assembleRelease
-
-  java:
-    needs: [ 'windows' ]
-    runs-on: windows-latest
-    steps:
-      - uses: actions/checkout@v4
-
-      - name: Install Java
-        uses: actions/setup-java@v4
-        with:
-          distribution: zulu
-          java-version: 20
-
-      - name: Download Windows lib
-        uses: actions/download-artifact@v4
-        with:
-          name: win32-x86-64_whisper.dll
-          path: bindings/java/build/generated/resources/main/win32-x86-64
-
-      - name: Build
-        run: |
-          models\download-ggml-model.cmd tiny.en
-          cd bindings/java
-          chmod +x ./gradlew
-          ./gradlew build
-
-      - name: Upload jar
-        uses: actions/upload-artifact@v4
-        with:
-          name: whispercpp.jar
-          path: bindings/java/build/libs/whispercpp-*.jar
-
-      - name: Publish package
-        if: ${{ github.ref == 'refs/heads/master' }}
-        uses: gradle/gradle-build-action@v2.4.2
-        with:
-          arguments: publish
-          build-root-directory: bindings/java
-        env:
-          MAVEN_USERNAME: ${{ secrets.JIRA_USER }}
-          MAVEN_PASSWORD: ${{ secrets.JIRA_PASS }}
-          PGP_SECRET: ${{ secrets.GPG_PRIVATE_KEY }}
-          PGP_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
+  # TODO: disable because of following fail: https://github.com/ggerganov/whisper.cpp/actions/runs/11019444420/job/30627193602
+  # android_java:
+  #   runs-on: ubuntu-latest
+  #
+  #   steps:
+  #     - name: Clone
+  #       uses: actions/checkout@v4
+  #
+  #     - name: set up JDK 11
+  #       uses: actions/setup-java@v4
+  #       with:
+  #         java-version: '11'
+  #         distribution: 'temurin'
+  #         cache: gradle
+  #
+  #     - name: Setup Android SDK
+  #       uses: android-actions/setup-android@v3
+  #       with:
+  #         cmdline-tools-version: 9.0
+  #
+  #     - name: Build
+  #       run: |
+  #         cd examples/whisper.android.java
+  #         chmod +x ./gradlew
+  #         ./gradlew assembleRelease
+
+  # TODO: disabled because of following fail: https://github.com/ggerganov/whisper.cpp/actions/runs/9686220096/job/26735899598
+  # java:
+  #   needs: [ 'windows' ]
+  #   runs-on: windows-latest
+  #   steps:
+  #     - uses: actions/checkout@v4
+  #
+  #     - name: Install Java
+  #       uses: actions/setup-java@v4
+  #       with:
+  #         distribution: zulu
+  #         java-version: 20
+  #
+  #     - name: Download Windows lib
+  #       uses: actions/download-artifact@v4
+  #       with:
+  #         name: win32-x86-64_whisper.dll
+  #         path: bindings/java/build/generated/resources/main/win32-x86-64
+  #
+  #     - name: Build
+  #       run: |
+  #         models\download-ggml-model.cmd tiny.en
+  #         cd bindings/java
+  #         chmod +x ./gradlew
+  #         ./gradlew build
+  #
+  #     - name: Upload jar
+  #       uses: actions/upload-artifact@v4
+  #       with:
+  #         name: whispercpp.jar
+  #         path: bindings/java/build/libs/whispercpp-*.jar
+  #
+  #     - name: Publish package
+  #       if: ${{ github.ref == 'refs/heads/master' }}
+  #       uses: gradle/gradle-build-action@v2.4.2
+  #       with:
+  #         arguments: publish
+  #         build-root-directory: bindings/java
+  #       env:
+  #         MAVEN_USERNAME: ${{ secrets.JIRA_USER }}
+  #         MAVEN_PASSWORD: ${{ secrets.JIRA_PASS }}
+  #         PGP_SECRET: ${{ secrets.GPG_PRIVATE_KEY }}
+  #         PGP_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
 
   quantize:
     runs-on: ubuntu-latest
.github/workflows/docker.yml (vendored, 4 changed lines)

@@ -18,7 +18,9 @@ jobs:
       matrix:
         config:
           - { tag: "main", dockerfile: ".devops/main.Dockerfile", platform: "linux/amd64,linux/arm64" }
-          - { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }
+          #TODO: the cuda image keeps failing - disable for now
+          # https://github.com/ggerganov/whisper.cpp/actions/runs/11019444428/job/30602020339
+          #- { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }
 
     steps:
       - name: Check out the repo
.gitignore (vendored, 2 changed lines)

@@ -3,11 +3,13 @@
 .cache/
 .coreml/
 .test/
+.venv/
 .vs/
 .vscode/
 .DS_Store
 .vimspector.json
 /CMakeSettings.json
+/talk-llama.dSYM/
 
 build/
 build-*/
CMakeLists.txt

@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories.
 project("whisper.cpp" C CXX)
-project("whisper.cpp" VERSION 1.6.2)
+project("whisper.cpp" VERSION 1.7.1)
 include(CheckIncludeFileCXX)
 
 set(SOVERSION 1)

@@ -120,7 +120,10 @@ whisper_option_depr(WARNING WHISPER_SYCL_F16 GGML_SYCL_F16)
 # build the library
 #
 
-add_subdirectory(ggml)
+if (NOT TARGET ggml)
+    add_subdirectory(ggml)
+    # ... otherwise assume ggml is added by a parent CMakeLists.txt
+endif()
 add_subdirectory(src)
 
 #

@@ -161,18 +164,6 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake
               ${CMAKE_CURRENT_BINARY_DIR}/whisper-version.cmake
         DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/whisper)
 
-install(
-    FILES convert-hf-to-gguf.py
-    PERMISSIONS
-        OWNER_READ
-        OWNER_WRITE
-        OWNER_EXECUTE
-        GROUP_READ
-        GROUP_EXECUTE
-        WORLD_READ
-        WORLD_EXECUTE
-    DESTINATION ${CMAKE_INSTALL_BINDIR})
-
 configure_file(cmake/whisper.pc.in
                "${CMAKE_CURRENT_BINARY_DIR}/whisper.pc"
                @ONLY)
Makefile (104 changed lines)

@@ -3,12 +3,11 @@ BUILD_TARGETS = \
 	main \
 	bench \
 	quantize \
-	server \
-	tests/test-c.o
+	server
 
 # Binaries only useful for tests
 TEST_TARGETS = \
-	tests/test-backend-ops
+	tests/test-c.o
 
 # Deprecation aliases
 ifdef WHISPER_CUBLAS

@@ -135,14 +134,18 @@ ifdef GGML_RPC
 	BUILD_TARGETS += rpc-server
 endif
 
+ifdef GGML_VULKAN
+	BUILD_TARGETS += vulkan-shaders-gen
+endif
+
 ifeq ($(shell sdl2-config --cflags --libs 2>/dev/null),)
 else
 	BUILD_TARGETS += \
 		command \
 		stream \
 		lsp \
-		talk \
 		talk-llama
+		# talk (TODO: disabled)
 endif
 
 default: $(BUILD_TARGETS)

@@ -251,7 +254,10 @@ ifdef WHISPER_DEBUG
 	MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
 endif
 else
 	MK_CPPFLAGS  += -DNDEBUG
+	MK_CFLAGS    += -O3
+	MK_CXXFLAGS  += -O3
+	MK_NVCCFLAGS += -O3
 endif
 
 ifdef WHISPER_SANITIZE_THREAD

@@ -501,16 +507,15 @@ ifdef GGML_CUDA
 	CUDA_PATH ?= /usr/local/cuda
 endif
 
-	MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
-	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcufft -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
+	#MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
+	#MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcufft -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
+	MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
 	MK_NVCCFLAGS += -use_fast_math
 
 	OBJ_GGML += ggml/src/ggml-cuda.o
 	OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
 	OBJ_GGML += $(OBJ_CUDA_TMPL)
-
-	OBJ_WHISPER += src/whisper-mel-cuda.o
 
 ifdef WHISPER_FATAL_WARNINGS
 	MK_NVCCFLAGS += -Werror all-warnings
 endif # WHISPER_FATAL_WARNINGS

@@ -619,16 +624,12 @@ ggml/src/ggml-cuda.o: \
 	ggml/src/ggml-common.h \
 	$(wildcard ggml/src/ggml-cuda/*.cuh)
 	$(NVCC_COMPILE)
 
-src/whisper-mel-cuda.o: src/whisper-mel-cuda.cu src/whisper-mel-cuda.hpp
-	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
-
 endif # GGML_CUDA
 
 ifdef GGML_VULKAN
 	MK_CPPFLAGS += -DGGML_USE_VULKAN
-	MK_LDFLAGS  += -lvulkan
-	OBJ_GGML    += ggml/src/ggml-vulkan.o
+	MK_LDFLAGS  += $(shell pkg-config --libs vulkan)
+	OBJ_GGML    += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o
 
 ifdef GGML_VULKAN_CHECK_RESULTS
 	MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS

@@ -642,6 +643,10 @@ ifdef GGML_VULKAN_MEMORY_DEBUG
 	MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG
 endif
 
+ifdef GGML_VULKAN_PERF
+	MK_CPPFLAGS += -DGGML_VULKAN_PERF
+endif
+
 ifdef GGML_VULKAN_VALIDATE
 	MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE
 endif

@@ -650,10 +655,28 @@ ifdef GGML_VULKAN_RUN_TESTS
 	MK_CPPFLAGS += -DGGML_VULKAN_RUN_TESTS
 endif
 
-ggml/src/ggml-vulkan.o: \
-	ggml/src/ggml-vulkan.cpp \
-	ggml/include/ggml-vulkan.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+GLSLC_CMD = glslc
+_ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen
+_ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp
+_ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp
+_ggml_vk_input_dir = ggml/src/vulkan-shaders
+_ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp)
+
+ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
+	$(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@
+
+$(_ggml_vk_header): $(_ggml_vk_source)
+
+$(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen
+	$(_ggml_vk_genshaders_cmd) \
+		--glslc      $(GLSLC_CMD) \
+		--input-dir  $(_ggml_vk_input_dir) \
+		--target-hpp $(_ggml_vk_header) \
+		--target-cpp $(_ggml_vk_source)
+
+vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
+	$(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
 
 endif # GGML_VULKAN
 
 ifdef GGML_HIPBLAS

@@ -780,7 +803,8 @@ OBJ_GGML += \
 	ggml/src/ggml.o \
 	ggml/src/ggml-alloc.o \
 	ggml/src/ggml-backend.o \
-	ggml/src/ggml-quants.o
+	ggml/src/ggml-quants.o \
+	ggml/src/ggml-aarch64.o
 
 OBJ_WHISPER += \
 	src/whisper.o

@@ -899,10 +923,10 @@ ggml/src/ggml-alloc.o: \
 	$(CC) $(CFLAGS) -c $< -o $@
 
 ggml/src/ggml-backend.o: \
-	ggml/src/ggml-backend.c \
+	ggml/src/ggml-backend.cpp \
 	ggml/include/ggml.h \
 	ggml/include/ggml-backend.h
-	$(CC) $(CFLAGS) -c $< -o $@
+	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 ggml/src/ggml-quants.o: \
 	ggml/src/ggml-quants.c \

@@ -911,6 +935,13 @@ ggml/src/ggml-quants.o: \
 	ggml/src/ggml-common.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
+ggml/src/ggml-aarch64.o: \
+	ggml/src/ggml-aarch64.c \
+	ggml/include/ggml.h \
+	ggml/src/ggml-aarch64.h \
+	ggml/src/ggml-common.h
+	$(CC) $(CFLAGS) -c $< -o $@
+
 ggml/src/ggml-blas.o: \
 	ggml/src/ggml-blas.cpp \
 	ggml/include/ggml-blas.h

@@ -943,7 +974,6 @@ $(LIB_GGML_S): \
 
 src/whisper.o: \
 	src/whisper.cpp \
-	src/whisper-mel.hpp \
 	include/whisper.h \
 	ggml/include/ggml.h \
 	ggml/include/ggml-alloc.h \

@@ -958,7 +988,8 @@ $(LIB_WHISPER): \
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 $(LIB_WHISPER_S): \
-	$(OBJ_WHISPER)
+	$(OBJ_WHISPER) \
+	$(OBJ_GGML)
 	ar rcs $(LIB_WHISPER_S) $^
 
 # common

@@ -1035,9 +1066,6 @@ main: examples/main/main.cpp \
 	$(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-	@echo
-	@echo '==== Run ./llama-cli -h for help. ===='
-	@echo
 
 bench: examples/bench/bench.cpp \
 	$(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON)

@@ -1069,12 +1097,14 @@ lsp: examples/lsp/lsp.cpp \
 	$(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LDFLAGS_SDL)
 
-talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp \
-	$(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON) $(OBJ_SDL)
-	$(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LDFLAGS_SDL)
+# TODO: disabled until update
+# https://github.com/ggerganov/whisper.cpp/issues/1818
+#talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp \
+#	$(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON) $(OBJ_SDL)
+#	$(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $(call GET_OBJ_FILE, $<)
+#	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LDFLAGS_SDL)
 
-talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/unicode.cpp examples/talk-llama/unicode-data.cpp \
+talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/llama-vocab.cpp examples/talk-llama/llama-grammar.cpp examples/talk-llama/llama-sampling.cpp examples/talk-llama/unicode.cpp examples/talk-llama/unicode-data.cpp \
 	$(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON) $(OBJ_SDL)
 	$(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LDFLAGS_SDL)

@@ -1088,11 +1118,6 @@ tests: $(TEST_TARGETS)
 tests/test-c.o: tests/test-c.c include/whisper.h
 	$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
 
-tests/test-backend-ops: tests/test-backend-ops.cpp \
-	$(OBJ_GGML)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 #
 # Audio samples
 #

@@ -1138,8 +1163,9 @@ samples:
 .PHONY: large-v1
 .PHONY: large-v2
 .PHONY: large-v3
+.PHONY: large-v3-turbo
 
-tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3: main
+tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3 large-v3-turbo: main
 	bash ./models/download-ggml-model.sh $@
 	@echo ""
 	@echo "==============================================="
Package.swift

@@ -32,8 +32,9 @@ let package = Package(
             sources: [
                 "ggml/src/ggml.c",
                 "src/whisper.cpp",
+                "ggml/src/ggml-aarch64.c",
                 "ggml/src/ggml-alloc.c",
-                "ggml/src/ggml-backend.c",
+                "ggml/src/ggml-backend.cpp",
                 "ggml/src/ggml-quants.c",
                 "ggml/src/ggml-metal.m"
             ],
README.md (111 changed lines)

@@ -7,21 +7,23 @@
 [](https://conan.io/center/whisper-cpp)
 [](https://www.npmjs.com/package/whisper.cpp/)
 
-Stable: [v1.6.2](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.6.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+Stable: [v1.7.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.7.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
 
 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
 
 - Plain C/C++ implementation without dependencies
-- Apple Silicon first-class citizen - optimized via ARM NEON, Accelerate framework, Metal and [Core ML](https://github.com/ggerganov/whisper.cpp#core-ml-support)
+- Apple Silicon first-class citizen - optimized via ARM NEON, Accelerate framework, Metal and [Core ML](#core-ml-support)
 - AVX intrinsics support for x86 architectures
 - VSX intrinsics support for POWER architectures
 - Mixed F16 / F32 precision
-- [4-bit and 5-bit integer quantization support](https://github.com/ggerganov/whisper.cpp#quantization)
+- [4-bit and 5-bit integer quantization support](#quantization)
 - Zero memory allocations at runtime
+- [Vulkan support](#vulkan-gpu-support)
 - Support for CPU-only inference
-- [Efficient GPU support for NVIDIA](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
+- [Efficient GPU support for NVIDIA](#nvidia-gpu-support)
-- [OpenVINO Support](https://github.com/ggerganov/whisper.cpp#openvino-support)
+- [OpenVINO Support](#openvino-support)
-- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
+- [Ascend NPU Support](#ascend-npu-support)
+- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/include/whisper.h)
 
 Supported platforms:

@@ -33,9 +35,9 @@ Supported platforms:
 - [x] [WebAssembly](examples/whisper.wasm)
 - [x] Windows ([MSVC](https://github.com/ggerganov/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggerganov/whisper.cpp/issues/168)]
 - [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/discussions/166)
-- [x] [docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)
+- [x] [Docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)
 
-The entire high-level implementation of the model is contained in [whisper.h](whisper.h) and [whisper.cpp](whisper.cpp).
+The entire high-level implementation of the model is contained in [whisper.h](include/whisper.h) and [whisper.cpp](src/whisper.cpp).
 The rest of the code is part of the [`ggml`](https://github.com/ggerganov/ggml) machine learning library.
 
 Having such a lightweight implementation of the model allows to easily integrate it in different platforms and applications.

@@ -55,8 +57,8 @@ Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)
 ## Implementation details
 
-- The core tensor operations are implemented in C ([ggml.h](ggml.h) / [ggml.c](ggml.c))
+- The core tensor operations are implemented in C ([ggml.h](ggml/include/ggml.h) / [ggml.c](ggml/src/ggml.c))
-- The transformer model and the high-level C-style API are implemented in C++ ([whisper.h](whisper.h) / [whisper.cpp](whisper.cpp))
+- The transformer model and the high-level C-style API are implemented in C++ ([whisper.h](include/whisper.h) / [whisper.cpp](src/whisper.cpp))
 - Sample usage is demonstrated in [main.cpp](examples/main)
 - Sample real-time audio transcription from the microphone is demonstrated in [stream.cpp](examples/stream)
 - Various other examples are available in the [examples](examples) folder

@@ -71,17 +73,23 @@ First clone the repository:
 git clone https://github.com/ggerganov/whisper.cpp.git
 ```
 
+Navigate into the directory:
+
+```
+cd whisper.cpp
+```
+
 Then, download one of the Whisper [models](models/README.md) converted in [`ggml` format](#ggml-format). For example:
 
 ```bash
-bash ./models/download-ggml-model.sh base.en
+sh ./models/download-ggml-model.sh base.en
 ```
 
 Now build the [main](examples/main) example and transcribe an audio file like this:
 
 ```bash
 # build the main example
-make
+make -j
 
 # transcribe an audio file
 ./main -f samples/jfk.wav

@@ -92,7 +100,7 @@ make
 For a quick demo, simply run `make base.en`:
 
 ```text
-$ make base.en
+$ make -j base.en
 
 cc -I. -O3 -std=c11 -pthread -DGGML_USE_ACCELERATE -c ggml.c -o ggml.o
 c++ -I. -I./examples -O3 -std=c++11 -pthread -c whisper.cpp -o whisper.o

@@ -145,7 +153,7 @@ options:
   -ng, --no-gpu [false ] disable GPU
 
 
-bash ./models/download-ggml-model.sh base.en
+sh ./models/download-ggml-model.sh base.en
 Downloading ggml model base.en ...
 ggml-base.en.bin 100%[========================>] 141.11M  6.34MB/s  in 24s
 Done! Model 'base.en' saved in 'models/ggml-base.en.bin'

@@ -216,7 +224,7 @@ ffmpeg -i input.mp3 -ar 16000 -ac 1 -c:a pcm_s16le output.wav
 If you want some extra audio samples to play with, simply run:
 
 ```
-make samples
+make -j samples
 ```
 
 This will download a few more audio files from Wikipedia and convert them to 16-bit WAV format via `ffmpeg`.

@@ -224,17 +232,18 @@ This will download a few more audio files from Wikipedia and convert them to 16-
 You can download and run the other models as follows:
 
 ```
-make tiny.en
-make tiny
-make base.en
-make base
-make small.en
-make small
-make medium.en
-make medium
-make large-v1
-make large-v2
-make large-v3
+make -j tiny.en
+make -j tiny
+make -j base.en
+make -j base
+make -j small.en
+make -j small
+make -j medium.en
+make -j medium
+make -j large-v1
+make -j large-v2
+make -j large-v3
+make -j large-v3-turbo
 ```
 
 ## Memory usage

@@ -256,7 +265,7 @@ Here are the steps for creating and using a quantized model:
 ```bash
 # quantize a model with Q5_0 method
-make quantize
+make -j quantize
 ./quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0
 
 # run the examples as usual, specifying the quantized model file

@@ -421,6 +430,16 @@ make clean
 GGML_CUDA=1 make -j
 ```
 
+## Vulkan GPU support
+Cross-vendor solution which allows you to accelerate workloads on your GPU.
+First, make sure your graphics card driver provides support for the Vulkan API.
+
+Now build `whisper.cpp` with Vulkan support:
+```
+make clean
+make GGML_VULKAN=1 -j
+```
+
 ## BLAS CPU support via OpenBLAS
 
 Encoder processing can be accelerated on the CPU via OpenBLAS.

@@ -448,6 +467,39 @@ cmake -DWHISPER_MKL=ON ..
 WHISPER_MKL=1 make -j
 ```
 
+## Ascend NPU support
+
+Ascend NPU provides inference acceleration via [`CANN`](https://www.hiascend.com/en/software/cann) and AI cores.
+
+First, check if your Ascend NPU device is supported:
+
+**Verified devices**
+
+| Ascend NPU                    | Status  |
+|:-----------------------------:|:-------:|
+| Atlas 300T A2                 | Support |
+
+Then, make sure you have installed the [`CANN toolkit`](https://www.hiascend.com/en/software/cann/community). The latest version of CANN is recommended.
+
+Now build `whisper.cpp` with CANN support:
+
+```
+mkdir build
+cd build
+cmake .. -D GGML_CANN=on
+make -j
+```
+
+Run the inference examples as usual, for example:
+
+```
+./build/bin/main -f samples/jfk.wav -m models/ggml-base.en.bin -t 8
+```
+
+*Notes:*
+
+- If you have trouble with your Ascend NPU device, please create an issue with the **[CANN]** prefix/tag.
+- If you run successfully with your Ascend NPU device, please help update the table `Verified devices`.
+
 ## Docker
 
 ### Prerequisites

@@ -584,7 +636,7 @@ The [stream](examples/stream) tool samples the audio every half a second and run
 More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
 
 ```bash
-make stream
+make stream -j
 ./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
 ```

@@ -751,7 +803,7 @@ took to execute it. The results are summarized in the following Github issue:
 [Benchmark results](https://github.com/ggerganov/whisper.cpp/issues/89)
 
-Additionally a script to run whisper.cpp with different models and audio files is provided [bench.py](bench.py).
+Additionally a script to run whisper.cpp with different models and audio files is provided [bench.py](scripts/bench.py).
 
 You can run it with the following command, by default it will run against any standard model in the models folder.

@@ -798,6 +850,7 @@ For more details, see the conversion script [models/convert-pt-to-ggml.py](model
 - [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
 - [AIWintermuteAI/whispercpp](https://github.com/AIWintermuteAI/whispercpp) (Updated fork of aarnphm/whispercpp)
 - [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
+- [abdeladim-s/pywhispercpp](https://github.com/abdeladim-s/pywhispercpp) (Pybind11)
 - [x] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
 - [x] Unity: [macoron/whisper.unity](https://github.com/Macoron/whisper.unity)
bindings/go/Makefile

@@ -14,9 +14,14 @@ GGML_METAL_PATH_RESOURCES := $(abspath ../..)
 BUILD_DIR := build
 MODELS_DIR := models
 EXAMPLES_DIR := $(wildcard examples/*)
-INCLUDE_PATH := $(abspath ../..)
+INCLUDE_PATH := $(abspath ../../include):$(abspath ../../ggml/include)
 LIBRARY_PATH := $(abspath ../..)
 
+ifeq ($(GGML_CUDA),1)
+	LIBRARY_PATH := $(LIBRARY_PATH):$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib/
+	BUILD_FLAGS := -ldflags "-extldflags '-lcudart -lcuda -lcublas'"
+endif
+
 ifeq ($(UNAME_S),Darwin)
 	EXT_LDFLAGS := -framework Foundation -framework Metal -framework MetalKit
 endif
bindings/go/README.md

@@ -62,6 +62,12 @@ This will compile a static `libwhisper.a` in a `build` folder, download a model
 make examples
 ```
 
+To build using cuda support add `GGML_CUDA=1`:
+
+```bash
+GGML_CUDA=1 make examples
+```
+
 The examples are placed in the `build` directory. Once built, you can download all the models with the following command:
 
 ```bash
bindings/go/examples/go-model-download/main.go

@@ -24,7 +24,7 @@ const (
 
 var (
 	// The models which will be downloaded, if no model is specified as an argument
-	modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large-v3"}
+	modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large-v3", "large-v3-turbo"}
 )
 
 var (
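For orientation, here is a minimal Go sketch of where such a model name resolves. It assumes the Hugging Face endpoint used by `models/download-ggml-model.sh`; the `modelURL` helper is hypothetical and is not part of the example's actual code:

```go
package main

import "fmt"

// modelURL is a hypothetical helper: it maps a ggml model name from the
// list above to the Hugging Face location that the download scripts in
// this repository fetch from (assumption, not the example's real code).
func modelURL(name string) string {
	return "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/" + name + ".bin"
}

func main() {
	fmt.Println(modelURL("ggml-base.en"))
	// https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin
}
```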
bindings/go/go.mod

@@ -1,10 +1,10 @@
 module github.com/ggerganov/whisper.cpp/bindings/go
 
-go 1.19
+go 1.23
 
 require (
 	github.com/go-audio/wav v1.1.0
-	github.com/stretchr/testify v1.8.1
+	github.com/stretchr/testify v1.9.0
 )
 
 require (
bindings/go/go.sum

@@ -1,4 +1,3 @@
-github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=

@@ -9,15 +8,9 @@ github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
 github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
-github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
-github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
-github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
-github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
-github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
-github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
-github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
+github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
+github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
-gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
bindings/go/params.go

@@ -119,6 +119,28 @@ func (p *Params) SetAudioCtx(n int) {
 	p.audio_ctx = C.int(n)
 }
 
+func (p *Params) SetMaxContext(n int) {
+	p.n_max_text_ctx = C.int(n)
+}
+
+func (p *Params) SetBeamSize(n int) {
+	p.beam_search.beam_size = C.int(n)
+}
+
+func (p *Params) SetEntropyThold(t float32) {
+	p.entropy_thold = C.float(t)
+}
+
+func (p *Params) SetTemperature(t float32) {
+	p.temperature = C.float(t)
+}
+
+// Sets the fallback temperature incrementation
+// Pass -1.0 to disable this feature
+func (p *Params) SetTemperatureFallback(t float32) {
+	p.temperature_inc = C.float(t)
+}
+
 // Set initial prompt
 func (p *Params) SetInitialPrompt(prompt string) {
 	p.initial_prompt = C.CString(prompt)

@@ -149,6 +171,10 @@ func (p *Params) String() string {
 	str += fmt.Sprintf(" duration_ms=%d", p.duration_ms)
 	str += fmt.Sprintf(" audio_ctx=%d", p.audio_ctx)
 	str += fmt.Sprintf(" initial_prompt=%s", C.GoString(p.initial_prompt))
+	str += fmt.Sprintf(" entropy_thold=%f", p.entropy_thold)
+	str += fmt.Sprintf(" temperature=%f", p.temperature)
+	str += fmt.Sprintf(" temperature_inc=%f", p.temperature_inc)
+	str += fmt.Sprintf(" beam_size=%d", p.beam_search.beam_size)
 	if p.translate {
 		str += " translate"
 	}
bindings/go/pkg/whisper/context.go

@@ -125,6 +125,32 @@ func (context *context) SetAudioCtx(n uint) {
 	context.params.SetAudioCtx(int(n))
 }
 
+// Set maximum number of text context tokens to store
+func (context *context) SetMaxContext(n int) {
+	context.params.SetMaxContext(n)
+}
+
+// Set Beam Size
+func (context *context) SetBeamSize(n int) {
+	context.params.SetBeamSize(n)
+}
+
+// Set Entropy threshold
+func (context *context) SetEntropyThold(t float32) {
+	context.params.SetEntropyThold(t)
+}
+
+// Set Temperature
+func (context *context) SetTemperature(t float32) {
+	context.params.SetTemperature(t)
+}
+
+// Set the fallback temperature incrementation
+// Pass -1.0 to disable this feature
+func (context *context) SetTemperatureFallback(t float32) {
+	context.params.SetTemperatureFallback(t)
+}
+
 // Set initial prompt
 func (context *context) SetInitialPrompt(prompt string) {
 	context.params.SetInitialPrompt(prompt)
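Together with params.go above, this exposes whisper.cpp's decoding controls to Go callers. Below is a minimal usage sketch, assuming the bindings API shown in this change set: `whisper.New`, `model.NewContext` and `context.Process` appear in the test file that follows, the setters are also assumed to be added to the `Context` interface (the truncated hunk at the end of this diff), and the model path and parameter values are illustrative:

```go
package main

import (
	"log"

	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
)

func main() {
	model, err := whisper.New("models/ggml-base.en.bin") // illustrative path
	if err != nil {
		log.Fatal(err)
	}
	defer model.Close()

	context, err := model.NewContext()
	if err != nil {
		log.Fatal(err)
	}

	// The setters added in this change:
	context.SetMaxContext(64)           // max text-context tokens kept between chunks
	context.SetBeamSize(5)              // beam-search width
	context.SetEntropyThold(2.4)        // entropy threshold for decoder fallback
	context.SetTemperature(0.0)         // initial sampling temperature
	context.SetTemperatureFallback(0.2) // increment on fallback; -1.0 disables

	// samples must be 16 kHz mono float32 PCM, e.g. decoded with
	// go-audio/wav as in TestProcess below.
	var samples []float32
	if err := context.Process(samples, nil, nil); err != nil {
		log.Fatal(err)
	}
}
```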
bindings/go/pkg/whisper/context_test.go

@@ -4,52 +4,90 @@ import (
 	"os"
 	"testing"
 
-	// Packages
-	whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
+	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
+	"github.com/go-audio/wav"
 	assert "github.com/stretchr/testify/assert"
 )
 
-const (
-	ModelPath  = "../../models/ggml-tiny.bin"
-	SamplePath = "../../samples/jfk.wav"
-)
-
-func Test_Whisper_000(t *testing.T) {
-	assert := assert.New(t)
-	if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
-		t.Skip("Skipping test, model not found:", ModelPath)
-	}
-	if _, err := os.Stat(SamplePath); os.IsNotExist(err) {
-		t.Skip("Skipping test, sample not found:", SamplePath)
-	}
-
-	// Load model
-	model, err := whisper.New(ModelPath)
-	assert.NoError(err)
-	assert.NotNil(model)
-	assert.NoError(model.Close())
-
-	t.Log("languages=", model.Languages())
-}
-
-func Test_Whisper_001(t *testing.T) {
+func TestSetLanguage(t *testing.T) {
 	assert := assert.New(t)
-	if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
-		t.Skip("Skipping test, model not found:", ModelPath)
-	}
-	if _, err := os.Stat(SamplePath); os.IsNotExist(err) {
-		t.Skip("Skipping test, sample not found:", SamplePath)
-	}
 
-	// Load model
 	model, err := whisper.New(ModelPath)
 	assert.NoError(err)
 	assert.NotNil(model)
 	defer model.Close()
 
-	// Get context for decoding
-	ctx, err := model.NewContext()
+	context, err := model.NewContext()
 	assert.NoError(err)
-	assert.NotNil(ctx)
+
+	// This returns an error since
+	// the model 'models/ggml-small.en.bin'
+	// that is loaded is not multilingual
+	err = context.SetLanguage("en")
+	assert.Error(err)
+}
+
+func TestContextModelIsMultilingual(t *testing.T) {
+	assert := assert.New(t)
+
+	model, err := whisper.New(ModelPath)
+	assert.NoError(err)
+	assert.NotNil(model)
+	defer model.Close()
+
+	context, err := model.NewContext()
+	assert.NoError(err)
+
+	isMultilingual := context.IsMultilingual()
+
+	// This returns false since
+	// the model 'models/ggml-small.en.bin'
+	// that is loaded is not multilingual
+	assert.False(isMultilingual)
+}
+
+func TestLanguage(t *testing.T) {
+	assert := assert.New(t)
+
+	model, err := whisper.New(ModelPath)
+	assert.NoError(err)
+	assert.NotNil(model)
+	defer model.Close()
+
+	context, err := model.NewContext()
+	assert.NoError(err)
+
+	// This always returns en since
+	// the model 'models/ggml-small.en.bin'
+	// that is loaded is not multilingual
+	expectedLanguage := "en"
+	actualLanguage := context.Language()
+	assert.Equal(expectedLanguage, actualLanguage)
+}
+
+func TestProcess(t *testing.T) {
+	assert := assert.New(t)
+
+	fh, err := os.Open(SamplePath)
+	assert.NoError(err)
+	defer fh.Close()
+
+	// Decode the WAV file - load the full buffer
+	dec := wav.NewDecoder(fh)
+	buf, err := dec.FullPCMBuffer()
+	assert.NoError(err)
+	assert.Equal(uint16(1), dec.NumChans)
+
+	data := buf.AsFloat32Buffer().Data
+
+	model, err := whisper.New(ModelPath)
+	assert.NoError(err)
+	assert.NotNil(model)
+	defer model.Close()
+
+	context, err := model.NewContext()
+	assert.NoError(err)
+
+	err = context.Process(data, nil, nil)
+	assert.NoError(err)
 }
@@ -38,17 +38,22 @@ type Context interface {
 	IsMultilingual() bool // Return true if the model is multilingual.
 	Language() string     // Get language
 
 	SetOffset(time.Duration)        // Set offset
 	SetDuration(time.Duration)      // Set duration
 	SetThreads(uint)                // Set number of threads to use
 	SetSplitOnWord(bool)            // Set split on word flag
 	SetTokenThreshold(float32)      // Set timestamp token probability threshold
 	SetTokenSumThreshold(float32)   // Set timestamp token sum probability threshold
 	SetMaxSegmentLength(uint)       // Set max segment length in characters
 	SetTokenTimestamps(bool)        // Set token timestamps flag
 	SetMaxTokensPerSegment(uint)    // Set max tokens per segment (0 = no limit)
 	SetAudioCtx(uint)               // Set audio encoder context
-	SetInitialPrompt(prompt string) // Set initial prompt
+	SetMaxContext(n int)              // Set maximum number of text context tokens to store
+	SetBeamSize(n int)                // Set Beam Size
+	SetEntropyThold(t float32)        // Set Entropy threshold
+	SetInitialPrompt(prompt string)   // Set initial prompt
+	SetTemperature(t float32)         // Set temperature
+	SetTemperatureFallback(t float32) // Set temperature incrementation
 
 	// Process mono audio data and return any errors.
 	// If defined, newly generated segments are passed to the
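Because these knobs now live on the `Context` interface, callers can write tuning helpers against the interface rather than against the concrete context type. A hypothetical sketch (the profile values are assumptions, not defaults taken from the diff):

```go
package tuning

import (
	whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
)

// ApplyAccuracyProfile tunes any whisper.Context implementation toward
// slower but more careful decoding using the setters added above.
func ApplyAccuracyProfile(ctx whisper.Context) {
	ctx.SetBeamSize(8)              // widen the beam search
	ctx.SetEntropyThold(2.8)        // tolerate higher entropy before falling back
	ctx.SetTemperature(0.0)         // deterministic first pass
	ctx.SetTemperatureFallback(0.2) // retry step when decoding falls back
}
```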
bindings/go/pkg/whisper/model_test.go (new file, 91 lines)
@@ -0,0 +1,91 @@
package whisper_test

import (
	"testing"

	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
	assert "github.com/stretchr/testify/assert"
)

func TestNew(t *testing.T) {
	assert := assert.New(t)
	t.Run("valid model path", func(t *testing.T) {
		model, err := whisper.New(ModelPath)
		assert.NoError(err)
		assert.NotNil(model)
		defer model.Close()
	})

	t.Run("invalid model path", func(t *testing.T) {
		invalidModelPath := "invalid-model-path.bin"
		model, err := whisper.New(invalidModelPath)
		assert.Error(err)
		assert.Nil(model)
	})
}

func TestClose(t *testing.T) {
	assert := assert.New(t)

	model, err := whisper.New(ModelPath)
	assert.NoError(err)
	assert.NotNil(model)

	err = model.Close()
	assert.NoError(err)
}

func TestNewContext(t *testing.T) {
	assert := assert.New(t)

	model, err := whisper.New(ModelPath)
	assert.NoError(err)
	assert.NotNil(model)
	defer model.Close()

	context, err := model.NewContext()
	assert.NoError(err)
	assert.NotNil(context)
}

func TestIsMultilingual(t *testing.T) {
	assert := assert.New(t)

	model, err := whisper.New(ModelPath)
	assert.NoError(err)
	assert.NotNil(model)
	defer model.Close()

	isMultilingual := model.IsMultilingual()

	// This returns false since
	// the model 'models/ggml-small.en.bin'
	// that is loaded is not multilingual
	assert.False(isMultilingual)
}

func TestLanguages(t *testing.T) {
	assert := assert.New(t)

	model, err := whisper.New(ModelPath)
	assert.NoError(err)
	assert.NotNil(model)
	defer model.Close()

	expectedLanguages := []string{
		"en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl",
		"ca", "nl", "ar", "sv", "it", "id", "hi", "fi", "vi", "he", "uk",
		"el", "ms", "cs", "ro", "da", "hu", "ta", "no", "th", "ur", "hr",
		"bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn",
		"sr", "az", "sl", "kn", "et", "mk", "br", "eu", "is", "hy", "ne",
		"mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si", "km", "sn",
		"yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi",
		"lo", "uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my",
		"bo", "tl", "mg", "as", "tt", "haw", "ln", "ha", "ba", "jw", "su",
	}

	actualLanguages := model.Languages()

	assert.Equal(expectedLanguages, actualLanguages)
}
bindings/go/pkg/whisper/util_test.go (new file, 6 lines)
@@ -0,0 +1,6 @@
package whisper_test

const (
	ModelPath  = "../../models/ggml-small.en.bin"
	SamplePath = "../../samples/jfk.wav"
)
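These constants are shared by `context_test.go` and `model_test.go`, and now point at `ggml-small.en.bin` instead of the old `ggml-tiny.bin`. The old tests' skip-if-missing guard did not survive the rewrite; if wanted, it could be restored once in this shared file, along the lines of this sketch (`requireTestData` is a hypothetical helper, not part of the diff):

```go
package whisper_test

import (
	"os"
	"testing"
)

// requireTestData skips the calling test when the model or sample file is
// absent, mirroring the os.Stat guard from the removed Test_Whisper_* tests.
func requireTestData(t *testing.T) {
	t.Helper()
	for _, path := range []string{ModelPath, SamplePath} {
		if _, err := os.Stat(path); os.IsNotExist(err) {
			t.Skip("Skipping test, file not found:", path)
		}
	}
}
```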
@@ -9,7 +9,7 @@ import (
 // CGO
 
 /*
-#cgo LDFLAGS: -lwhisper -lm -lstdc++
+#cgo LDFLAGS: -lwhisper -lm -lstdc++ -fopenmp
 #cgo darwin LDFLAGS: -framework Accelerate -framework Metal -framework Foundation -framework CoreGraphics
 #include <whisper.h>
 #include <stdlib.h>
@@ -1,6 +1,6 @@
 {
   "name": "whisper.cpp",
-  "version": "1.6.2",
+  "version": "1.7.1",
   "description": "Whisper speech recognition",
   "main": "whisper.js",
   "scripts": {
bindings/ruby/.gitignore (vendored, new file, 3 lines)
@@ -0,0 +1,3 @@
LICENSE
pkg/
lib/whisper.*
bindings/ruby/README.md (new file, 111 lines)
@@ -0,0 +1,111 @@
whispercpp
==========



Ruby bindings for [whisper.cpp][], an interface to an automatic speech recognition model.

Installation
------------

Install the gem and add it to the application's Gemfile by executing:

    $ bundle add whispercpp

If bundler is not being used to manage dependencies, install the gem by executing:

    $ gem install whispercpp

Usage
-----

```ruby
require "whisper"

whisper = Whisper::Context.new("path/to/model.bin")

params = Whisper::Params.new
params.language = "en"
params.offset = 10_000
params.duration = 60_000
params.max_text_tokens = 300
params.translate = true
params.print_timestamps = false
params.prompt = "Initial prompt here."

whisper.transcribe("path/to/audio.wav", params) do |whole_text|
  puts whole_text
end
```

### Preparing a model ###

Use the script to download model file(s):

```bash
git clone https://github.com/ggerganov/whisper.cpp.git
cd whisper.cpp
sh ./models/download-ggml-model.sh base.en
```

There are several types of models. See the [models][] page for details.

### Preparing an audio file ###

Currently, whisper.cpp accepts only 16-bit WAV files.

### API ###

Once `Whisper::Context#transcribe` has been called, you can retrieve segments with `#each_segment`:

```ruby
def format_time(time_ms)
  sec, decimal_part = time_ms.divmod(1000)
  min, sec = sec.divmod(60)
  hour, min = min.divmod(60)
  "%02d:%02d:%02d.%03d" % [hour, min, sec, decimal_part]
end

whisper.transcribe("path/to/audio.wav", params)

whisper.each_segment.with_index do |segment, index|
  line = "[%{nth}: %{st} --> %{ed}] %{text}" % {
    nth: index + 1,
    st: format_time(segment.start_time),
    ed: format_time(segment.end_time),
    text: segment.text
  }
  line << " (speaker turned)" if segment.speaker_next_turn?
  puts line
end
```

You can also add a hook to the params that is called on each new segment:

```ruby
def format_time(time_ms)
  sec, decimal_part = time_ms.divmod(1000)
  min, sec = sec.divmod(60)
  hour, min = min.divmod(60)
  "%02d:%02d:%02d.%03d" % [hour, min, sec, decimal_part]
end

# Add hook before calling #transcribe
params.on_new_segment do |segment|
  line = "[%{st} --> %{ed}] %{text}" % {
    st: format_time(segment.start_time),
    ed: format_time(segment.end_time),
    text: segment.text
  }
  line << " (speaker turned)" if segment.speaker_next_turn?
  puts line
end

whisper.transcribe("path/to/audio.wav", params)
```

[whisper.cpp]: https://github.com/ggerganov/whisper.cpp
[models]: https://github.com/ggerganov/whisper.cpp/tree/master/models
@@ -1,12 +1,59 @@ (bindings/ruby/Rakefile)
 require 'rake/clean'
-require 'rubygems/package'
+require "bundler/gem_tasks"
+require "pathname"
+require "yaml"
+require "rake/testtask"
 
-desc 'Build gem'
-task :package do
-  spec_source = File.read File.join(File.dirname(__FILE__),'whispercpp.gemspec')
-  spec = nil
-  # see: http://gist.github.com/16215
-  Thread.new { spec = eval("#{spec_source}") }.join
-  spec.validate
-  Gem::Package.build(spec)
+extsources = YAML.load_file("extsources.yaml")
+SOURCES = FileList[]
+extsources.each do |src|
+  basename = src.pathmap("%f")
+  dest = basename == "LICENSE" ? basename : basename.pathmap("ext/%f")
+  file src
+  file dest => src do |t|
+    cp t.source, t.name
+  end
+  SOURCES.include dest
+end
+CLEAN.include SOURCES
+CLEAN.include FileList[
+  "ext/*.o",
+  "ext/*.metal",
+  "ext/whisper.{so,bundle,dll}",
+  "ext/depend"
+]
+
+task build: SOURCES + FileList[
+  "ext/extconf.rb",
+  "ext/ruby_whisper.h",
+  "ext/ruby_whisper.cpp",
+  "whispercpp.gemspec",
+]
+
+directory "pkg"
+CLOBBER.include "pkg"
+
+TEST_MODEL = "../../models/ggml-base.en.bin"
+LIB_NAME = "whisper".ext(RbConfig::CONFIG["DLEXT"])
+LIB_FILE = File.join("lib", LIB_NAME)
+
+directory "lib"
+task LIB_FILE => SOURCES + ["lib"] do |t|
+  Dir.chdir "ext" do
+    sh "ruby extconf.rb"
+    sh "make"
+  end
+  mv "ext/#{LIB_NAME}", t.name
+end
+CLEAN.include LIB_FILE
+
+Rake::TestTask.new do |t|
+  t.test_files = FileList["tests/test_*.rb"]
+end
+task test: [TEST_MODEL, LIB_FILE]
+
+file TEST_MODEL do
+  Dir.chdir "../.." do
+    sh "./models/download-ggml-model.sh base.en"
+  end
 end
bindings/ruby/ext/.gitignore (vendored, 28 changed lines)
@@ -3,7 +3,33 @@ ggml.c
 ggml.h
 ggml-alloc.c
 ggml-alloc.h
-whisper.bundle
+ggml-aarch64.c
+ggml-aarch64.h
+ggml-backend.cpp
+ggml-backend-impl.h
+ggml-backend.c
+ggml-backend.h
+ggml-common.h
+ggml-cpu-impl.h
+ggml-metal.m
+ggml-metal.metal
+ggml-metal-embed.metal
+ggml-blas.cpp
+ggml-cuda.h
+ggml-impl.h
+ggml-kompute.h
+ggml-metal.h
+ggml-opencl.h
+ggml-quants.c
+ggml-quants.h
+ggml-sycl.h
+ggml-vulkan.h
+ggml-blas.h
+get-flags.mk
 whisper.cpp
 whisper.h
 dr_wav.h
+depend
+whisper.bundle
+whisper.so
+whisper.dll
@@ -1,20 +1,4 @@ (bindings/ruby/ext/extconf.rb)
 require 'mkmf'
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.cpp')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper-mel.hpp')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.c')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-impl.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.c')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend-impl.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.c')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-common.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.c')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','examples','dr_wav.h')} .")
-
 
 # need to use c++ compiler flags
 $CXXFLAGS << ' -std=c++11'
@@ -28,4 +12,219 @@ if enable_config('march-tune-native', false)
   $CXXFLAGS << ' -march=native -mtune=native'
 end
 
-create_makefile('whisper')
+def with_disabling_unsupported_files
+  disabled_files = []
+
+  unless $GGML_METAL
+    disabled_files << 'ggml-metal.h' << 'ggml-metal.m'
+  end
+
+  unless $GGML_METAL_EMBED_LIBRARY
+    disabled_files << 'ggml-metal.metal'
+  end
+
+  unless $OBJ_ALL&.include? 'ggml-blas.o'
+    disabled_files << 'ggml-blas.h' << 'ggml-blas.cpp'
+  end
+
+  disabled_files.filter! {|file| File.exist? file}
+
+  disabled_files.each do |file|
+    File.rename file, "#{file}.disabled"
+  end
+
+  yield
+
+  disabled_files.each do |file|
+    File.rename "#{file}.disabled", file
+  end
+end
+
+if ENV['WHISPER_METAL']
+  $GGML_METAL ||= true
+  $DEPRECATE_WARNING ||= true
+end
+
+$UNAME_S = `uname -s`.chomp
+$UNAME_P = `uname -p`.chomp
+$UNAME_M = `uname -m`.chomp
+
+if $UNAME_S == 'Darwin'
+  unless ENV['GGML_NO_METAL']
+    $GGML_METAL ||= true
+  end
+  $GGML_NO_OPENMP ||= true
+end
+
+if $GGML_METAL
+  $GGML_METAL_EMBED_LIBRARY = true
+end
+
+$MK_CPPFLAGS = ''
+$MK_CFLAGS = '-std=c11 -fPIC'
+$MK_CXXFLAGS = '-std=c++11 -fPIC'
+$MK_NVCCFLAGS = '-std=c++11'
+$MK_LDFLAGS = ''
+
+$OBJ_GGML = ''
+$OBJ_WHISPER = ''
+$OBJ_COMMON = ''
+$OBJ_SDL = ''
+
+$MK_CPPFLAGS << ' -D_XOPEN_SOURCE=600'
+
+if $UNAME_S == 'Linux'
+  $MK_CPPFLAGS << ' -D_GNU_SOURCE'
+end
+
+if $UNAME_S == 'Darwin'
+  $MK_CPPFLAGS << ' -D_DARWIN_C_SOURCE'
+end
+
+if ENV['WHISPER_DEBUG']
+  $MK_CFLAGS << ' -O0 -g'
+  $MK_CXXFLAGS << ' -O0 -g'
+  $MK_LDFLAGS << ' -g'
+  $MK_NVCCFLAGS << ' -O0 -g'
+else
+  $MK_CPPFLAGS << ' -DNDEBUG'
+  $MK_CFLAGS << ' -O3'
+  $MK_CXXFLAGS << ' -O3'
+  $MK_NVCCFLAGS << ' -O3'
+end
+
+$WARN_FLAGS =
+  ' -Wall' <<
+  ' -Wextra' <<
+  ' -Wpedantic' <<
+  ' -Wcast-qual' <<
+  ' -Wno-unused-function'
+
+$MK_CFLAGS <<
+  $WARN_FLAGS <<
+  ' -Wshadow' <<
+  ' -Wstrict-prototypes' <<
+  ' -Wpointer-arith' <<
+  ' -Wmissing-prototypes' <<
+  ' -Werror=implicit-int' <<
+  ' -Werror=implicit-function-declaration'
+
+$MK_CXXFLAGS <<
+  $WARN_FLAGS <<
+  ' -Wmissing-declarations' <<
+  ' -Wmissing-noreturn'
+
+unless `#{cc_command} #{$LDFLAGS} -Wl,-v 2>&1`.chomp.include? 'dyld-1015.7'
+  $MK_CPPFLAGS << ' -DHAVE_BUGGY_APPLE_LINKER'
+end
+
+if %w[Linux Darwin FreeBSD NetBSD OpenBSD Haiku].include? $UNAME_S
+  $MK_CFLAGS << ' -pthread'
+  $MK_CXXFLAGS << ' -pthread'
+end
+
+unless $_WIN32
+  $DSO_EXT = '.so'
+else
+  $DSO_EXT = '.dll'
+end
+
+unless ENV['RISCV']
+  if %w[x86_64 i686 amd64].include? $UNAME_M
+    $HOST_CXXFLAGS ||= ''
+
+    $MK_CFLAGS << ' -march=native -mtune=native'
+    $HOST_CXXFLAGS << ' -march=native -mtune=native'
+  end
+
+  if $UNAME_M.match? /aarch64.*/
+    $MK_CFLAGS << ' -mcpu=native'
+    $MK_CXXFLAGS << ' -mcpu=native'
+  end
+else
+  $MK_CFLAGS << ' -march=rv64gcv -mabi=lp64d'
+  $MK_CXXFLAGS << ' -march=rv64gcv -mabi=lp64d'
+end
+
+unless ENV['GGML_NO_ACCELERATE']
+  if $UNAME_S == 'Darwin'
+    $MK_CPPFLAGS << ' -DGGML_USE_ACCELERATE -DGGML_USE_BLAS'
+    $MK_CPPFLAGS << ' -DACCELERATE_NEW_LAPACK'
+    $MK_CPPFLAGS << ' -DACCELERATE_LAPACK_ILP64'
+    $MK_LDFLAGS << ' -framework Accelerate'
+    $OBJ_GGML << ' ggml-blas.o'
+  end
+end
+
+if ENV['GGML_OPENBLAS']
+  $MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas`.chomp}"
+  $MK_CFLAGS << " #{`pkg-config --cflags-only-other openblas`.chomp}"
+  $MK_LDFLAGS << " #{`pkg-config --libs openblas`}"
+  $OBJ_GGML << ' ggml-blas.o'
+end
+
+if ENV['GGML_OPENBLAS64']
+  $MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas64`.chomp}"
+  $MK_CFLAGS << " #{`pkg-config --cflags-only-other openblas64`.chomp}"
+  $MK_LDFLAGS << " #{`pkg-config --libs openblas64`}"
+  $OBJ_GGML << ' ggml-blas.o'
+end
+
+if $GGML_METAL
+  $MK_CPPFLAGS << ' -DGGML_USE_METAL'
+  $MK_LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
+  $OBJ_GGML << ' ggml-metal.o'
+
+  if ENV['GGML_METAL_NDEBUG']
+    $MK_CPPFLAGS << ' -DGGML_METAL_NDEBUG'
+  end
+
+  if $GGML_METAL_EMBED_LIBRARY
+    $MK_CPPFLAGS << ' -DGGML_METAL_EMBED_LIBRARY'
+    $OBJ_GGML << ' ggml-metal-embed.o'
+  end
+end
+
+$OBJ_GGML <<
+  ' ggml.o' <<
+  ' ggml-alloc.o' <<
+  ' ggml-backend.o' <<
+  ' ggml-quants.o' <<
+  ' ggml-aarch64.o'
+
+$OBJ_WHISPER <<
+  ' whisper.o'
+
+$OBJ_ALL = "#{$OBJ_GGML} #{$OBJ_WHISPER} #{$OBJ_COMMON} #{$OBJ_SDL}"
+
+$CPPFLAGS = "#{$MK_CPPFLAGS} #{$CPPFLAGS}"
+$CFLAGS = "#{$CPPFLAGS} #{$MK_CFLAGS} #{$GF_CFLAGS} #{$CFLAGS}"
+$BASE_CXXFLAGS = "#{$MK_CXXFLAGS} #{$CXXFLAGS}"
+$CXXFLAGS = "#{$BASE_CXXFLAGS} #{$HOST_CXXFLAGS} #{$GF_CXXFLAGS} #{$CPPFLAGS}"
+$NVCCFLAGS = "#{$MK_NVCCFLAGS} #{$NVCCFLAGS}"
+$LDFLAGS = "#{$MK_LDFLAGS} #{$LDFLAGS}"
+
+if $GGML_METAL_EMBED_LIBRARY
+  File.write 'depend', "$(OBJS): $(OBJS) ggml-metal-embed.o\n"
+end
+
+with_disabling_unsupported_files do
+  create_makefile('whisper')
+end
+
+File.open 'Makefile', 'a' do |file|
+  file.puts 'include get-flags.mk'
+
+  if $GGML_METAL
+    if $GGML_METAL_EMBED_LIBRARY
+      # mkmf determines object files to compile dependent on existing *.{c,cpp,m} files
+      # but ggml-metal-embed.c doesn't exist on creating Makefile.
+      file.puts "objs := $(OBJS)"
+      file.puts "OBJS = $(objs) 'ggml-metal-embed.o'"
+
+      file.puts 'include metal-embed.mk'
+    end
+  end
+end
@@ -1,141 +0,0 @@ (deleted file)
#pragma once

// ggml-backend internal header

#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

    //
    // Backend buffer
    //

    // buffer type
    typedef void * ggml_backend_buffer_type_context_t;

    struct ggml_backend_buffer_type_i {
        const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
        ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft); // allocation max size
        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
        bool                  (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
        // check if tensor data is in host memory
        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
        bool                  (*GGML_CALL is_host)         (ggml_backend_buffer_type_t buft);
    };

    struct ggml_backend_buffer_type {
        struct ggml_backend_buffer_type_i iface;
        ggml_backend_buffer_type_context_t context;
    };

    // buffer
    typedef void * ggml_backend_buffer_context_t;

    struct ggml_backend_buffer_i {
        const char * (*GGML_CALL get_name)   (ggml_backend_buffer_t buffer);
        void         (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
        void *       (*GGML_CALL get_base)   (ggml_backend_buffer_t buffer);
        void         (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
        void         (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
        void         (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
        bool         (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
        void         (*GGML_CALL clear)      (ggml_backend_buffer_t buffer, uint8_t value);
        void         (*GGML_CALL reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
    };

    struct ggml_backend_buffer {
        struct ggml_backend_buffer_i  iface;
        ggml_backend_buffer_type_t    buft;
        ggml_backend_buffer_context_t context;
        size_t size;
        enum ggml_backend_buffer_usage usage;
    };

    GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
                   ggml_backend_buffer_type_t    buft,
            struct ggml_backend_buffer_i         iface,
                   ggml_backend_buffer_context_t context,
                   size_t                        size);

    // do not use directly, use ggml_backend_tensor_copy instead
    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);

    // buffer that contains a collection of buffers
    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
    GGML_CALL bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
    GGML_CALL void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);

    //
    // Backend
    //

    typedef void * ggml_backend_context_t;

    struct ggml_backend_i {
        const char * (*GGML_CALL get_name)(ggml_backend_t backend);

        void (*GGML_CALL free)(ggml_backend_t backend);

        // buffer allocation
        ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);

        // (optional) asynchronous tensor data access
        void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
        void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);

        // (optional) complete all pending operations
        void (*GGML_CALL synchronize)(ggml_backend_t backend);

        // compute graph with a plan (not used currently)
        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
        void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);

        // compute graph with a plan
        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
        // compute graph without a plan (async)
        enum ggml_status (*GGML_CALL graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);

        // check if the backend supports an operation
        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);

        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
        // these should be expensive operations with large batch sizes that may benefit from running on this backend
        // even if the weight has to be copied from the CPU temporarily
        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);

        // (optional) event synchronization
        ggml_backend_event_t (*GGML_CALL event_new)        (ggml_backend_t backend);
        void                 (*GGML_CALL event_free)       (ggml_backend_event_t event);
        void                 (*GGML_CALL event_record)     (ggml_backend_event_t event);
        void                 (*GGML_CALL event_wait)       (ggml_backend_t backend, ggml_backend_event_t event);
        void                 (*GGML_CALL event_synchronize)(ggml_backend_event_t event);
    };

    struct ggml_backend {
        ggml_guid_t guid;

        struct ggml_backend_i  iface;
        ggml_backend_context_t context;
    };

    struct ggml_backend_event {
        ggml_backend_t backend;
        void * context;
    };

    //
    // Backend registry
    //

    typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);

    GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);

#ifdef __cplusplus
}
#endif
File diff suppressed because it is too large
@@ -1,233 +0,0 @@ (deleted file)
#pragma once

#include "ggml.h"
#include "ggml-alloc.h"

#ifdef __cplusplus
extern "C" {
#endif

    typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
    typedef struct ggml_backend_event * ggml_backend_event_t;
    typedef struct ggml_backend * ggml_backend_t;
    typedef void * ggml_backend_graph_plan_t;

    //
    // Backend buffer
    //

    // buffer type
    GGML_API           const char *          ggml_backend_buft_name            (ggml_backend_buffer_type_t buft);
    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer    (ggml_backend_buffer_type_t buft, size_t size);
    GGML_API           size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
    GGML_API           size_t                ggml_backend_buft_get_max_size    (ggml_backend_buffer_type_t buft);
    GGML_API GGML_CALL size_t                ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
    GGML_API           bool                  ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
    GGML_API           bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);

    // buffer
    enum ggml_backend_buffer_usage {
        GGML_BACKEND_BUFFER_USAGE_ANY = 0,
        GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
    };

    GGML_API           const char *               ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
    GGML_API           void                       ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
    GGML_API           void *                     ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
    GGML_API           size_t                     ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
    GGML_API GGML_CALL void                       ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
    GGML_API           size_t                     ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
    GGML_API           size_t                     ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
    GGML_API           size_t                     ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
    GGML_API           void                       ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
    GGML_API           bool                       ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
    GGML_API           void                       ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
    GGML_API           ggml_backend_buffer_type_t ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
    GGML_API           void                       ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);

    //
    // Backend
    //

    GGML_API ggml_guid_t  ggml_backend_guid(ggml_backend_t backend);
    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
    GGML_API void         ggml_backend_free(ggml_backend_t backend);

    GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
    GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
    GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);
    GGML_API size_t                     ggml_backend_get_max_size(ggml_backend_t backend);

    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);

    GGML_API GGML_CALL void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);

    GGML_API void ggml_backend_synchronize(ggml_backend_t backend);

    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
    GGML_API void                      ggml_backend_graph_plan_free  (ggml_backend_t backend, ggml_backend_graph_plan_t plan);

    GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
    GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
    GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
    GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
    GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);

    // tensor copy between different backends
    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);

    // asynchronous copy
    // the copy is performed after all the currently queued operations in backend_src
    // backend_dst will wait for the copy to complete before performing other operations
    // automatic fallback to sync copy if async is not supported
    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);

    // events
    GGML_API ggml_backend_event_t ggml_backend_event_new        (ggml_backend_t backend);
    GGML_API void                 ggml_backend_event_free       (ggml_backend_event_t event);
    GGML_API void                 ggml_backend_event_record     (ggml_backend_event_t event);
    GGML_API void                 ggml_backend_event_synchronize(ggml_backend_event_t event);
    GGML_API void                 ggml_backend_event_wait       (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event

    //
    // CPU backend
    //

    GGML_API ggml_backend_t ggml_backend_cpu_init(void);

    GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
    GGML_API           void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
    GGML_API           void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);

    // Create a backend buffer from an existing pointer
    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);

    GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);

#ifdef GGML_USE_CPU_HBM
    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif

    //
    // Backend registry
    //

    // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way

    GGML_API size_t                     ggml_backend_reg_get_count(void);
    GGML_API size_t                     ggml_backend_reg_find_by_name(const char * name);
    GGML_API ggml_backend_t             ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
    GGML_API const char *               ggml_backend_reg_get_name(size_t i);
    GGML_API ggml_backend_t             ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
    GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
    GGML_API ggml_backend_buffer_t      ggml_backend_reg_alloc_buffer(size_t i, size_t size);

    //
    // Backend scheduler
    //

    // The backend scheduler allows for multiple backends to be used together
    // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
    // The backends are selected based on:
    // - the backend that supports the operation
    // - the location of the pre-allocated tensors (e.g. the weights)
    /*
      Example usage:

        // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
        // preferrably to run on the same backend as the buffer
        ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);

        // initialize buffers from a max size graph (optional)
        reserve_graph = build_graph(sched, max_batch_size);

        // manually assign nodes to a backend (optional, should not be needed in most cases)
        struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
        ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);

        ggml_backend_sched_reserve(sched, reserve_graph);

        // compute
        graph = build_graph(sched);
        ggml_backend_sched_graph_compute(sched, graph);

        // if there are graph inputs:
        ggml_backend_sched_reset(sched);
        ggml_backend_sched_alloc_graph(sched, graph);
        ggml_backend_tensor_set(input_tensor, ...);
        ggml_backend_sched_graph_compute(sched, graph);
    }
    */

    struct ggml_backend_sched;
    typedef struct ggml_backend_sched * ggml_backend_sched_t;

    // when ask == true, the scheduler wants to know if the user wants to observe this node
    // this allows the scheduler to batch nodes together in order to evaluate them in a single call
    //
    // when ask == false, the scheduler is passing the node tensor to the user for observation
    // if the user returns false, the scheduler will cancel the graph compute
    //
    typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);

    // Initialize a backend scheduler
    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
    GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);

    // Initialize backend buffers from a measure graph
    GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);

    // Get the number of splits of the last graph
    GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
    GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);

    GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);

    GGML_API void           ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
    GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);

    // Allocate and compute graph on the backend scheduler
    GGML_API bool             ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
    GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
    GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
    GGML_API void             ggml_backend_sched_synchronize(ggml_backend_sched_t sched);

    // Reset all assignments and allocators - must be called before changing the node backends
    GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);

    // Set a callback to be called for each resulting node during graph compute
    GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);

    //
    // Utils
    //

    struct ggml_backend_graph_copy {
        ggml_backend_buffer_t buffer;
        struct ggml_context * ctx_allocated;
        struct ggml_context * ctx_unallocated;
        struct ggml_cgraph * graph;
    };

    // Copy a graph to a different backend
    GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
    GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);

    typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

    // Compare the output of two backends
    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);

    // Tensor initialization
    GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
    GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);

#ifdef __cplusplus
}
#endif
File diff suppressed because it is too large
@@ -1,43 +0,0 @@ (deleted file)
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef GGML_USE_HIPBLAS
#define GGML_CUDA_NAME "ROCm"
#define GGML_CUBLAS_NAME "hipBLAS"
#else
#define GGML_CUDA_NAME "CUDA"
#define GGML_CUBLAS_NAME "cuBLAS"
#endif

#ifdef __cplusplus
extern "C" {
#endif

#define GGML_CUDA_MAX_DEVICES 16

// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);

GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);

// device buffer
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);

// split tensor buffer that splits matrices by rows across multiple devices
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);

// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

GGML_API GGML_CALL int  ggml_backend_cuda_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);

GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);

#ifdef __cplusplus
}
#endif
@@ -1,272 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include "ggml.h"
|
|
||||||
|
|
||||||
// GGML internal header
|
|
||||||
|
|
||||||
#include <assert.h>
|
|
||||||
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
|
||||||
#include <stddef.h>
|
|
||||||
#include <stdbool.h>
|
|
||||||
#include <string.h> // memcpy
|
|
||||||
#include <math.h> // fabsf
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// static_assert should be a #define, but if it's not,
|
|
||||||
// fall back to the _Static_assert C11 keyword.
|
|
||||||
// if C99 - static_assert is noop
|
|
||||||
// ref: https://stackoverflow.com/a/53923785/4039976
|
|
||||||
#ifndef __cplusplus
|
|
||||||
#ifndef static_assert
|
|
||||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
|
||||||
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
|
||||||
#else
|
|
||||||
#define static_assert(cond, msg) struct global_scope_noop_trick
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
|
||||||
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
|
|
||||||
#ifndef __FMA__
|
|
||||||
#define __FMA__
|
|
||||||
#endif
|
|
||||||
#ifndef __F16C__
|
|
||||||
#define __F16C__
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
|
|
||||||
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
|
|
||||||
#ifndef __SSE3__
|
|
||||||
#define __SSE3__
|
|
||||||
#endif
|
|
||||||
#ifndef __SSSE3__
|
|
||||||
#define __SSSE3__
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// 16-bit float
|
|
||||||
// on Arm, we use __fp16
|
|
||||||
// on x86, we use uint16_t
|
|
||||||
#if defined(__ARM_NEON) && !defined(_MSC_VER)
|
|
||||||
|
|
||||||
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
|
||||||
//
|
|
||||||
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
|
||||||
//
|
|
||||||
#include <arm_neon.h>
|
|
||||||
|
|
||||||
typedef __fp16 ggml_fp16_internal_t;
|
|
||||||
|
|
||||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
||||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
||||||
|
|
||||||
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
||||||
|
|
||||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
|
||||||
ggml_fp16_internal_t tmp;
|
|
||||||
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
|
|
||||||
return (float)tmp;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
|
||||||
ggml_fp16_t res;
|
|
||||||
ggml_fp16_internal_t tmp = f;
|
|
||||||
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
typedef uint16_t ggml_fp16_internal_t;
|
|
||||||
|
|
||||||
#ifdef __wasm_simd128__
|
|
||||||
#include <wasm_simd128.h>
|
|
||||||
#else
|
|
||||||
#ifdef __POWER9_VECTOR__
|
|
||||||
#include <altivec.h>
|
|
||||||
#undef bool
|
|
||||||
#define bool _Bool
|
|
||||||
#else
|
|
||||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
|
||||||
#include <intrin.h>
|
|
||||||
#else
|
|
||||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
|
|
||||||
#if !defined(__riscv)
|
|
||||||
#include <immintrin.h>
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __riscv_v_intrinsic
|
|
||||||
#include <riscv_vector.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifdef __F16C__
|
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
|
|
||||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
|
|
||||||
#else
|
|
||||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
|
|
||||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#elif defined(__POWER9_VECTOR__)
|
|
||||||
|
|
||||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
|
||||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
|
||||||
/* the inline asm below is about 12% faster than the lookup method */
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)

static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    register float f;
    register double d;
    __asm__(
        "mtfprd %0,%2\n"
        "xscvhpdp %0,%0\n"
        "frsp %1,%0\n" :
        /* temp */ "=d"(d),
        /* out */  "=f"(f):
        /* in */   "r"(h));
    return f;
}

static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
    register double d;
    register ggml_fp16_t r;
    __asm__( /* xscvdphp can work on double or single precision */
        "xscvdphp %0,%2\n"
        "mffprd %1,%0\n" :
        /* temp */ "=d"(d),
        /* out */  "=r"(r):
        /* in */   "f"(f));
    return r;
}

#else

// FP16 <-> FP32
// ref: https://github.com/Maratyszcza/FP16

static inline float fp32_from_bits(uint32_t w) {
    union {
        uint32_t as_bits;
        float as_value;
    } fp32;
    fp32.as_bits = w;
    return fp32.as_value;
}

static inline uint32_t fp32_to_bits(float f) {
    union {
        float as_value;
        uint32_t as_bits;
    } fp32;
    fp32.as_value = f;
    return fp32.as_bits;
}

static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    const uint32_t w = (uint32_t) h << 16;
    const uint32_t sign = w & UINT32_C(0x80000000);
    const uint32_t two_w = w + w;

    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
    const float exp_scale = 0x1.0p-112f;
#else
    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;

    const uint32_t magic_mask = UINT32_C(126) << 23;
    const float magic_bias = 0.5f;
    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;

    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
    const uint32_t result = sign |
        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
    return fp32_from_bits(result);
}

static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
    const float scale_to_inf = 0x1.0p+112f;
    const float scale_to_zero = 0x1.0p-110f;
#else
    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;

    const uint32_t w = fp32_to_bits(f);
    const uint32_t shl1_w = w + w;
    const uint32_t sign = w & UINT32_C(0x80000000);
    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
    if (bias < UINT32_C(0x71000000)) {
        bias = UINT32_C(0x71000000);
    }

    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
    const uint32_t bits = fp32_to_bits(base);
    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
    const uint32_t nonsign = exp_bits + mantissa_bits;
    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}

#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)

#endif // __F16C__

#endif // __ARM_NEON

// precomputed f32 table for f16 (256 KB)
// defined in ggml.c, initialized in ggml_init()
extern float ggml_table_f32_f16[1 << 16];

// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
#if !defined(GGML_FP16_TO_FP32)
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
    uint16_t s;
    memcpy(&s, &f, sizeof(uint16_t));
    return ggml_table_f32_f16[s];
}

#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#endif

#if !defined(GGML_FP32_TO_FP16)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif
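Taken together, these paths mean the conversion macros compile down to native instructions where available (F16C, NEON, POWER9) and fall back to the portable bit manipulation or the 256 KB lookup table elsewhere. A minimal round-trip sketch, assuming this header is included and ggml_init() has already populated ggml_table_f32_f16 (the demo function name is illustrative):

#include <stdio.h>

static void fp16_roundtrip_demo(void) {
    const float x = 3.14159f;
    const ggml_fp16_t h = GGML_FP32_TO_FP16(x); // narrow to half precision
    const float y = GGML_FP16_TO_FP32(h);       // widen back to float
    // fp16 keeps roughly 3 significant decimal digits, so a small error is expected
    printf("x = %.6f  roundtrip = %.6f  err = %g\n", x, y, (double)(x - y));
}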

#define GGML_HASHTABLE_FULL ((size_t)-1)
#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)

struct ggml_hash_set ggml_hash_set_new(size_t size);

bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key);

// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
size_t ggml_hash_find (const struct ggml_hash_set hash_set, struct ggml_tensor * key);

// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml_tensor * key);

// return index, asserts if table is full
size_t ggml_hash_find_or_insert( struct ggml_hash_set hash_set, struct ggml_tensor * key);

#ifdef __cplusplus
}
#endif
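A short usage sketch of the conventions documented above (hypothetical caller; `t` stands in for a tensor taken from a real graph):

struct ggml_hash_set set = ggml_hash_set_new(64);

size_t idx = ggml_hash_insert(set, t);
if (idx == GGML_HASHTABLE_ALREADY_EXISTS) {
    // the key was inserted earlier; find() reports its current slot
    idx = ggml_hash_find(set, t);
}
// find() returns GGML_HASHTABLE_FULL only when no free slot remains,
// while find_or_insert() asserts instead of reporting a full table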

ggml-kompute.h (deleted)
@@ -1,46 +0,0 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

struct ggml_vk_device {
    int index;
    int type; // same as VkPhysicalDeviceType
    size_t heapSize;
    const char * name;
    const char * vendor;
    int subgroupSize;
    uint64_t bufferAlignment;
    uint64_t maxAlloc;
};

struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
bool ggml_vk_has_vulkan(void);
bool ggml_vk_has_device(void);
struct ggml_vk_device ggml_vk_current_device(void);

//
// backend API
//

// forward declaration
typedef struct ggml_backend * ggml_backend_t;

GGML_API ggml_backend_t ggml_backend_kompute_init(int device);

GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);

GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);

#ifdef __cplusplus
}
#endif
ggml-metal.h (deleted)
@@ -1,66 +0,0 @@
// An interface that allows computing a ggml_cgraph with Metal
//
// This is a fully functional interface that extends ggml with GPU support for Apple devices.
// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
//
// How does it work?
//
// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
//
// You only need to make sure that all memory buffers that you used during the graph creation
// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
// used during the graph evaluation to determine the arguments of the compute kernels.
//
// Synchronization between device and host memory (for example for input and output tensors)
// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
//

#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#include <stddef.h>
#include <stdbool.h>

// max memory buffers that can be mapped to the device
#define GGML_METAL_MAX_BUFFERS 64

struct ggml_tensor;
struct ggml_cgraph;

#ifdef __cplusplus
extern "C" {
#endif

//
// backend API
// user-code should use only these functions
//

GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);

GGML_API ggml_backend_t ggml_backend_metal_init(void);

GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);

GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);

GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);

GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);

// helper to check if the device supports a specific family
// ideally, the user code should be doing these checks
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);

// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);

#ifdef __cplusplus
}
#endif
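The workflow from the header comment reduces to a few calls through the generic backend API; a hedged sketch, with error handling omitted and `gf` assumed to be a ggml_cgraph built elsewhere:

ggml_backend_t backend = ggml_backend_metal_init();
if (backend != NULL && ggml_backend_is_metal(backend)) {
    // the same graph that ggml_graph_compute() would run on the CPU
    ggml_backend_graph_compute(backend, gf);
    ggml_backend_free(backend);
}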
ggml-opencl.h (deleted)
@@ -1,36 +0,0 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

GGML_API void ggml_cl_init(void);

GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

// GGML_API void * ggml_cl_host_malloc(size_t size);
// GGML_API void ggml_cl_host_free(void * ptr);

GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);

GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);

// backend API

// GGML_API ggml_backend_t ggml_backend_opencl_init(void);

// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);

GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
// GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);

#ifdef __cplusplus
}
#endif
(File diff suppressed because it is too large.)
ggml-quants.h (deleted)
@@ -1,133 +0,0 @@
#pragma once

#define GGML_COMMON_DECL_C
#include "ggml-common.h"

#include "ggml.h"

// GGML internal header

#ifdef __cplusplus
extern "C" {
#endif

// Quantization
void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);

void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);

void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_s_reference  (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_s_reference  (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);

void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_s  (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);

// Dequantization
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_xs (const block_iq2_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_s  (const block_iq2_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq1_s  (const block_iq1_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq1_m  (const block_iq1_m   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq4_nl (const block_iq4_nl  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq4_xs (const block_iq4_xs  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq3_s  (const block_iq3_s   * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

// Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_m_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_s_q8_K  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);

// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq2_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq1_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq1_m  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq3_s  (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);

void iq2xs_init_impl(enum ggml_type type);
void iq2xs_free_impl(enum ggml_type type);
void iq3xs_init_impl(int grid_size);
void iq3xs_free_impl(int grid_size);

#ifdef __cplusplus
}
#endif
ggml-sycl.h (deleted)
@@ -1,49 +0,0 @@
//
//  MIT license
//  Copyright (C) 2024 Intel Corporation
//  SPDX-License-Identifier: MIT
//

#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

#define GGML_SYCL_MAX_DEVICES 48
#define GGML_SYCL_NAME "SYCL"

// backend API
GGML_API ggml_backend_t ggml_backend_sycl_init(int device);

// device buffer
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);

// split tensor buffer that splits matrices by rows across multiple devices
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);

// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);

GGML_API void ggml_backend_sycl_print_sycl_devices(void);
GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);

// TODO: these are temporary
// ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index);
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();

// SYCL doesn't support registering host memory, keep here for reference
// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
// GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);

#ifdef __cplusplus
}
#endif
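A hedged sketch of the device-enumeration calls declared above (the buffer size is illustrative):

const int n = ggml_backend_sycl_get_device_count();
for (int i = 0; i < n; ++i) {
    char desc[256];
    size_t free_mem, total_mem;
    ggml_sycl_get_device_description(i, desc, sizeof(desc));
    ggml_backend_sycl_get_device_memory(i, &free_mem, &total_mem);
    printf("SYCL device %d: %s (%zu of %zu bytes free)\n", i, desc, free_mem, total_mem);
}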
ggml-vulkan.h (deleted)
@@ -1,29 +0,0 @@
#pragma once

#include "ggml.h"
#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

#define GGML_VK_NAME "Vulkan"
#define GGML_VK_MAX_DEVICES 16

GGML_API void ggml_vk_instance_init(void);

// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);

GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
GGML_API GGML_CALL int ggml_backend_vk_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);

GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);

#ifdef __cplusplus
}
#endif
bindings/ruby/ext/metal-embed.mk (new file, 14 lines)
@@ -0,0 +1,14 @@
ggml-metal-embed.o: \
	ggml-metal.metal \
	ggml-common.h
	@echo "Embedding Metal library"
	@sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal
	$(eval TEMP_ASSEMBLY=$(shell mktemp))
	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
	@echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
	@$(AS) $(TEMP_ASSEMBLY) -o $@
	@rm -f ${TEMP_ASSEMBLY}
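The rule above splices ggml-common.h into the Metal source and then embeds the result between two assembler labels via .incbin. On the C side the embedded blob becomes addressable roughly like this (a sketch; the leading underscore in the assembly maps to the plain C identifier on Apple platforms):

extern const char ggml_metallib_start;
extern const char ggml_metallib_end;

// the embedded ggml-metal-embed.metal source, as a pointer + length pair
static const char * metallib_data(void) { return &ggml_metallib_start; }
static size_t metallib_size(void) {
    return (size_t) (&ggml_metallib_end - &ggml_metallib_start);
}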
(File diff suppressed because it is too large.)
bindings/ruby/ext/ruby_whisper.h
@@ -3,6 +3,13 @@
 #include "whisper.h"
 
+typedef struct {
+  VALUE *context;
+  VALUE user_data;
+  VALUE callback;
+  VALUE callbacks;
+} ruby_whisper_callback_container;
+
 typedef struct {
   struct whisper_context *context;
 } ruby_whisper;
@@ -10,6 +17,9 @@ typedef struct {
 typedef struct {
   struct whisper_full_params params;
   bool diarize;
+  ruby_whisper_callback_container *new_segment_callback_container;
+  ruby_whisper_callback_container *progress_callback_container;
+  ruby_whisper_callback_container *abort_callback_container;
 } ruby_whisper_params;
 
 #endif
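The container holds Ruby VALUEs, so each field has to be marked during garbage collection or the callback could be collected while transcription is still running. A hedged sketch of what such a mark step looks like (rw_container_mark is a hypothetical name, not necessarily the binding's actual function):

static void rw_container_mark(ruby_whisper_callback_container *c) {
    if (c == NULL) return;
    rb_gc_mark(c->user_data); // keep the user-data object alive
    rb_gc_mark(c->callback);  // keep the Proc alive
    rb_gc_mark(c->callbacks); // keep the on_* callback array alive
}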
bindings/ruby/extsources.yaml (new file, 29 lines)
@@ -0,0 +1,29 @@
---
- ../../src/whisper.cpp
- ../../include/whisper.h
- ../../ggml/src/ggml.c
- ../../ggml/src/ggml-impl.h
- ../../ggml/src/ggml-aarch64.h
- ../../ggml/src/ggml-aarch64.c
- ../../ggml/src/ggml-alloc.c
- ../../ggml/src/ggml-backend-impl.h
- ../../ggml/src/ggml-backend.cpp
- ../../ggml/src/ggml-common.h
- ../../ggml/src/ggml-quants.h
- ../../ggml/src/ggml-quants.c
- ../../ggml/src/ggml-cpu-impl.h
- ../../ggml/src/ggml-metal.m
- ../../ggml/src/ggml-metal.metal
- ../../ggml/src/ggml-blas.cpp
- ../../ggml/include/ggml.h
- ../../ggml/include/ggml-alloc.h
- ../../ggml/include/ggml-backend.h
- ../../ggml/include/ggml-cuda.h
- ../../ggml/include/ggml-kompute.h
- ../../ggml/include/ggml-metal.h
- ../../ggml/include/ggml-sycl.h
- ../../ggml/include/ggml-vulkan.h
- ../../ggml/include/ggml-blas.h
- ../../scripts/get-flags.mk
- ../../examples/dr_wav.h
- ../../LICENSE
bindings/ruby/tests/test_callback.rb (new file, 163 lines)
@@ -0,0 +1,163 @@
require "test/unit"
require "whisper"

class TestCallback < Test::Unit::TestCase
  TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))

  def setup
    GC.start
    @params = Whisper::Params.new
    @whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
    @audio = File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav')
  end

  def test_new_segment_callback
    @params.new_segment_callback = ->(context, state, n_new, user_data) {
      assert_kind_of Integer, n_new
      assert n_new > 0
      assert_same @whisper, context

      n_segments = context.full_n_segments
      n_new.times do |i|
        i_segment = n_segments - 1 + i
        start_time = context.full_get_segment_t0(i_segment) * 10
        end_time = context.full_get_segment_t1(i_segment) * 10
        text = context.full_get_segment_text(i_segment)

        assert_kind_of Integer, start_time
        assert start_time >= 0
        assert_kind_of Integer, end_time
        assert end_time > 0
        assert_match /ask not what your country can do for you, ask what you can do for your country/, text if i_segment == 0
      end
    }

    @whisper.transcribe(@audio, @params)
  end

  def test_new_segment_callback_closure
    search_word = "what"
    @params.new_segment_callback = ->(context, state, n_new, user_data) {
      n_segments = context.full_n_segments
      n_new.times do |i|
        i_segment = n_segments - 1 + i
        text = context.full_get_segment_text(i_segment)
        if text.include?(search_word)
          t0 = context.full_get_segment_t0(i_segment)
          t1 = context.full_get_segment_t1(i_segment)
          raise "search word '#{search_word}' found at between #{t0} and #{t1}"
        end
      end
    }

    assert_raise RuntimeError do
      @whisper.transcribe(@audio, @params)
    end
  end

  def test_new_segment_callback_user_data
    udata = Object.new
    @params.new_segment_callback_user_data = udata
    @params.new_segment_callback = ->(context, state, n_new, user_data) {
      assert_same udata, user_data
    }

    @whisper.transcribe(@audio, @params)
  end

  def test_new_segment_callback_user_data_gc
    @params.new_segment_callback_user_data = "My user data"
    @params.new_segment_callback = ->(context, state, n_new, user_data) {
      assert_equal "My user data", user_data
    }
    GC.start

    assert_same @whisper, @whisper.transcribe(@audio, @params)
  end

  def test_progress_callback
    first = nil
    last = nil
    @params.progress_callback = ->(context, state, progress, user_data) {
      assert_kind_of Integer, progress
      assert 0 <= progress && progress <= 100
      assert_same @whisper, context
      first = progress if first.nil?
      last = progress
    }
    @whisper.transcribe(@audio, @params)
    assert_equal 0, first
    assert_equal 100, last
  end

  def test_progress_callback_user_data
    udata = Object.new
    @params.progress_callback_user_data = udata
    @params.progress_callback = ->(context, state, n_new, user_data) {
      assert_same udata, user_data
    }

    @whisper.transcribe(@audio, @params)
  end

  def test_on_progress
    first = nil
    last = nil
    @params.on_progress do |progress|
      assert_kind_of Integer, progress
      assert 0 <= progress && progress <= 100
      first = progress if first.nil?
      last = progress
    end
    @whisper.transcribe(@audio, @params)
    assert_equal 0, first
    assert_equal 100, last
  end

  def test_abort_callback
    i = 0
    @params.abort_callback = ->(user_data) {
      assert_nil user_data
      i += 1
      return false
    }
    @whisper.transcribe(@audio, @params)
    assert i > 0
  end

  def test_abort_callback_abort
    i = 0
    @params.abort_callback = ->(user_data) {
      i += 1
      return i == 3
    }
    @whisper.transcribe(@audio, @params)
    assert_equal 3, i
  end

  def test_abort_callback_user_data
    udata = Object.new
    @params.abort_callback_user_data = udata
    yielded = nil
    @params.abort_callback = ->(user_data) {
      yielded = user_data
    }
    @whisper.transcribe(@audio, @params)
    assert_same udata, yielded
  end

  def test_abort_on
    do_abort = false
    aborted_from_callback = false
    @params.on_new_segment do |segment|
      do_abort = true if segment.text.match? /ask/
    end
    i = 0
    @params.abort_on do
      i += 1
      do_abort
    end
    @whisper.transcribe(@audio, @params)
    assert i > 0
  end
end
bindings/ruby/tests/test_package.rb (new file, 31 lines)
@@ -0,0 +1,31 @@
require 'test/unit'
require 'tempfile'
require 'tmpdir'
require 'shellwords'

class TestPackage < Test::Unit::TestCase
  def test_build
    Tempfile.create do |file|
      assert system("gem", "build", "whispercpp.gemspec", "--output", file.to_path.shellescape, exception: true)
      assert file.size > 0
      assert_path_exist file.to_path
    end
  end

  sub_test_case "Building binary on installation" do
    def setup
      system "rake", "build", exception: true
    end

    def test_install
      match_data = `rake -Tbuild`.match(/(whispercpp-(.+)\.gem)/)
      filename = match_data[1]
      version = match_data[2]
      basename = "whisper.#{RbConfig::CONFIG["DLEXT"]}"
      Dir.mktmpdir do |dir|
        system "gem", "install", "--install-dir", dir.shellescape, "pkg/#{filename.shellescape}", exception: true
        assert_path_exist File.join(dir, "gems/whispercpp-#{version}/lib", basename)
      end
    end
  end
end
bindings/ruby/tests/test_params.rb (new file, 155 lines)
@@ -0,0 +1,155 @@
require 'test/unit'
require 'whisper'

class TestParams < Test::Unit::TestCase
  def setup
    @params = Whisper::Params.new
  end

  def test_language
    @params.language = "en"
    assert_equal @params.language, "en"
    @params.language = "auto"
    assert_equal @params.language, "auto"
  end

  def test_offset
    @params.offset = 10_000
    assert_equal @params.offset, 10_000
    @params.offset = 0
    assert_equal @params.offset, 0
  end

  def test_duration
    @params.duration = 60_000
    assert_equal @params.duration, 60_000
    @params.duration = 0
    assert_equal @params.duration, 0
  end

  def test_max_text_tokens
    @params.max_text_tokens = 300
    assert_equal @params.max_text_tokens, 300
    @params.max_text_tokens = 0
    assert_equal @params.max_text_tokens, 0
  end

  def test_translate
    @params.translate = true
    assert @params.translate
    @params.translate = false
    assert !@params.translate
  end

  def test_no_context
    @params.no_context = true
    assert @params.no_context
    @params.no_context = false
    assert !@params.no_context
  end

  def test_single_segment
    @params.single_segment = true
    assert @params.single_segment
    @params.single_segment = false
    assert !@params.single_segment
  end

  def test_print_special
    @params.print_special = true
    assert @params.print_special
    @params.print_special = false
    assert !@params.print_special
  end

  def test_print_progress
    @params.print_progress = true
    assert @params.print_progress
    @params.print_progress = false
    assert !@params.print_progress
  end

  def test_print_realtime
    @params.print_realtime = true
    assert @params.print_realtime
    @params.print_realtime = false
    assert !@params.print_realtime
  end

  def test_print_timestamps
    @params.print_timestamps = true
    assert @params.print_timestamps
    @params.print_timestamps = false
    assert !@params.print_timestamps
  end

  def test_suppress_blank
    @params.suppress_blank = true
    assert @params.suppress_blank
    @params.suppress_blank = false
    assert !@params.suppress_blank
  end

  def test_suppress_non_speech_tokens
    @params.suppress_non_speech_tokens = true
    assert @params.suppress_non_speech_tokens
    @params.suppress_non_speech_tokens = false
    assert !@params.suppress_non_speech_tokens
  end

  def test_token_timestamps
    @params.token_timestamps = true
    assert @params.token_timestamps
    @params.token_timestamps = false
    assert !@params.token_timestamps
  end

  def test_split_on_word
    @params.split_on_word = true
    assert @params.split_on_word
    @params.split_on_word = false
    assert !@params.split_on_word
  end

  def test_initial_prompt
    assert_nil @params.initial_prompt
    @params.initial_prompt = "You are a polite person."
    assert_equal "You are a polite person.", @params.initial_prompt
  end

  def test_temperature
    assert_equal 0.0, @params.temperature
    @params.temperature = 0.5
    assert_equal 0.5, @params.temperature
  end

  def test_max_initial_ts
    assert_equal 1.0, @params.max_initial_ts
    @params.max_initial_ts = 600.0
    assert_equal 600.0, @params.max_initial_ts
  end

  def test_length_penalty
    assert_equal -1.0, @params.length_penalty
    @params.length_penalty = 0.5
    assert_equal 0.5, @params.length_penalty
  end

  def test_temperature_inc
    assert_in_delta 0.2, @params.temperature_inc
    @params.temperature_inc = 0.5
    assert_in_delta 0.5, @params.temperature_inc
  end

  def test_entropy_thold
    assert_in_delta 2.4, @params.entropy_thold
    @params.entropy_thold = 3.0
    assert_in_delta 3.0, @params.entropy_thold
  end

  def test_logprob_thold
    assert_in_delta -1.0, @params.logprob_thold
    @params.logprob_thold = -0.5
    assert_in_delta -0.5, @params.logprob_thold
  end
end
bindings/ruby/tests/test_segment.rb (new file, 87 lines)
@@ -0,0 +1,87 @@
require "test/unit"
require "whisper"

class TestSegment < Test::Unit::TestCase
  TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))

  class << self
    attr_reader :whisper

    def startup
      @whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
      params = Whisper::Params.new
      params.print_timestamps = false
      jfk = File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav')
      @whisper.transcribe(jfk, params)
    end
  end

  def test_iteration
    whisper.each_segment do |segment|
      assert_instance_of Whisper::Segment, segment
    end
  end

  def test_enumerator
    enum = whisper.each_segment
    assert_instance_of Enumerator, enum
    enum.to_a.each_with_index do |segment, index|
      assert_instance_of Whisper::Segment, segment
      assert_kind_of Integer, index
    end
  end

  def test_start_time
    i = 0
    whisper.each_segment do |segment|
      assert_equal 0, segment.start_time if i == 0
      i += 1
    end
  end

  def test_end_time
    i = 0
    whisper.each_segment do |segment|
      assert_equal whisper.full_get_segment_t1(i) * 10, segment.end_time
      i += 1
    end
  end

  def test_on_new_segment
    params = Whisper::Params.new
    seg = nil
    index = 0
    params.on_new_segment do |segment|
      assert_instance_of Whisper::Segment, segment
      if index == 0
        seg = segment
        assert_equal 0, segment.start_time
        assert_match /ask not what your country can do for you, ask what you can do for your country/, segment.text
      end
      index += 1
    end
    whisper.transcribe(File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav'), params)
    assert_equal 0, seg.start_time
    assert_match /ask not what your country can do for you, ask what you can do for your country/, seg.text
  end

  def test_on_new_segment_twice
    params = Whisper::Params.new
    seg = nil
    params.on_new_segment do |segment|
      seg = segment
      return
    end
    params.on_new_segment do |segment|
      assert_same seg, segment
      return
    end
    whisper.transcribe(File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav'), params)
  end

  private

  def whisper
    self.class.whisper
  end
end
bindings/ruby/tests/test_whisper.rb
@@ -1,122 +1,13 @@
-TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
-EXTDIR = File.join(TOPDIR, 'ext')
-#$LIBDIR = File.join(TOPDIR, 'lib')
-#$:.unshift(LIBDIR)
-$:.unshift(EXTDIR)
-
 require 'whisper'
 require 'test/unit'
 
 class TestWhisper < Test::Unit::TestCase
+  TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
+
   def setup
     @params = Whisper::Params.new
   end
 
-  def test_language
-    @params.language = "en"
-    assert_equal @params.language, "en"
-    @params.language = "auto"
-    assert_equal @params.language, "auto"
-  end
-
-  def test_offset
-    @params.offset = 10_000
-    assert_equal @params.offset, 10_000
-    @params.offset = 0
-    assert_equal @params.offset, 0
-  end
-
-  def test_duration
-    @params.duration = 60_000
-    assert_equal @params.duration, 60_000
-    @params.duration = 0
-    assert_equal @params.duration, 0
-  end
-
-  def test_max_text_tokens
-    @params.max_text_tokens = 300
-    assert_equal @params.max_text_tokens, 300
-    @params.max_text_tokens = 0
-    assert_equal @params.max_text_tokens, 0
-  end
-
-  def test_translate
-    @params.translate = true
-    assert @params.translate
-    @params.translate = false
-    assert !@params.translate
-  end
-
-  def test_no_context
-    @params.no_context = true
-    assert @params.no_context
-    @params.no_context = false
-    assert !@params.no_context
-  end
-
-  def test_single_segment
-    @params.single_segment = true
-    assert @params.single_segment
-    @params.single_segment = false
-    assert !@params.single_segment
-  end
-
-  def test_print_special
-    @params.print_special = true
-    assert @params.print_special
-    @params.print_special = false
-    assert !@params.print_special
-  end
-
-  def test_print_progress
-    @params.print_progress = true
-    assert @params.print_progress
-    @params.print_progress = false
-    assert !@params.print_progress
-  end
-
-  def test_print_realtime
-    @params.print_realtime = true
-    assert @params.print_realtime
-    @params.print_realtime = false
-    assert !@params.print_realtime
-  end
-
-  def test_print_timestamps
-    @params.print_timestamps = true
-    assert @params.print_timestamps
-    @params.print_timestamps = false
-    assert !@params.print_timestamps
-  end
-
-  def test_suppress_blank
-    @params.suppress_blank = true
-    assert @params.suppress_blank
-    @params.suppress_blank = false
-    assert !@params.suppress_blank
-  end
-
-  def test_suppress_non_speech_tokens
-    @params.suppress_non_speech_tokens = true
-    assert @params.suppress_non_speech_tokens
-    @params.suppress_non_speech_tokens = false
-    assert !@params.suppress_non_speech_tokens
-  end
-
-  def test_token_timestamps
-    @params.token_timestamps = true
-    assert @params.token_timestamps
-    @params.token_timestamps = false
-    assert !@params.token_timestamps
-  end
-
-  def test_split_on_word
-    @params.split_on_word = true
-    assert @params.split_on_word
-    @params.split_on_word = false
-    assert !@params.split_on_word
-  end
-
   def test_whisper
     @whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
     params = Whisper::Params.new
@@ -128,4 +19,81 @@ class TestWhisper < Test::Unit::TestCase
     }
   end
 
+  sub_test_case "After transcription" do
+    class << self
+      attr_reader :whisper
+
+      def startup
+        @whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
+        params = Whisper::Params.new
+        params.print_timestamps = false
+        jfk = File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav')
+        @whisper.transcribe(jfk, params)
+      end
+    end
+
+    def whisper
+      self.class.whisper
+    end
+
+    def test_full_n_segments
+      assert_equal 1, whisper.full_n_segments
+    end
+
+    def test_full_lang_id
+      assert_equal 0, whisper.full_lang_id
+    end
+
+    def test_full_get_segment_t0
+      assert_equal 0, whisper.full_get_segment_t0(0)
+      assert_raise IndexError do
+        whisper.full_get_segment_t0(whisper.full_n_segments)
+      end
+      assert_raise IndexError do
+        whisper.full_get_segment_t0(-1)
+      end
+    end
+
+    def test_full_get_segment_t1
+      t1 = whisper.full_get_segment_t1(0)
+      assert_kind_of Integer, t1
+      assert t1 > 0
+      assert_raise IndexError do
+        whisper.full_get_segment_t1(whisper.full_n_segments)
+      end
+    end
+
+    def test_full_get_segment_speaker_turn_next
+      assert_false whisper.full_get_segment_speaker_turn_next(0)
+    end
+
+    def test_full_get_segment_text
+      assert_match /ask not what your country can do for you, ask what you can do for your country/, whisper.full_get_segment_text(0)
+    end
+  end
+
+  def test_lang_max_id
+    assert_kind_of Integer, Whisper.lang_max_id
+  end
+
+  def test_lang_id
+    assert_equal 0, Whisper.lang_id("en")
+    assert_raise ArgumentError do
+      Whisper.lang_id("non existing language")
+    end
+  end
+
+  def test_lang_str
+    assert_equal "en", Whisper.lang_str(0)
+    assert_raise IndexError do
+      Whisper.lang_str(Whisper.lang_max_id + 1)
+    end
+  end
+
+  def test_lang_str_full
+    assert_equal "english", Whisper.lang_str_full(0)
+    assert_raise IndexError do
+      Whisper.lang_str_full(Whisper.lang_max_id + 1)
+    end
+  end
 end
bindings/ruby/whispercpp.gemspec
@@ -1,3 +1,5 @@
+require "yaml"
+
 Gem::Specification.new do |s|
   s.name = "whispercpp"
   s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
@@ -7,10 +9,16 @@ Gem::Specification.new do |s|
   s.email = 'todd.fisher@gmail.com'
   s.extra_rdoc_files = ['LICENSE', 'README.md']
 
-  s.files = ["LICENSE", "README.md", "Rakefile", "ext/extconf.rb", "ext/ggml.c", "ext/ruby_whisper.cpp", "ext/whisper.cpp", "ext/dr_wav.h", "ext/ggml.h", "ext/ruby_whisper.h", "ext/whisper.h"]
+  s.files = `git ls-files . -z`.split("\x0") +
+            YAML.load_file("extsources.yaml").collect {|file|
+              basename = File.basename(file)
+              if s.extra_rdoc_files.include?(basename)
+                basename
+              else
+                File.join("ext", basename)
+              end
+            }
 
   #### Load-time details
   s.require_paths = ['lib','ext']
   s.summary = %q{Ruby whisper.cpp bindings}
   s.test_files = ["tests/test_whisper.rb"]
@@ -13,5 +13,5 @@ set_target_properties(${TARGET}
     PROPERTIES
     EXPORT_COMPILE_COMMANDS ON
     RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
     INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib"
 )
@@ -1,7 +1,7 @@
-set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
-set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
-set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
-set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
+set(WHISPER_VERSION @WHISPER_INSTALL_VERSION@)
+set(WHISPER_BUILD_COMMIT @WHISPER_BUILD_COMMIT@)
+set(WHISPER_BUILD_NUMBER @WHISPER_BUILD_NUMBER@)
+set(WHISPER_SHARED_LIB @BUILD_SHARED_LIBS@)
 
 set(GGML_BLAS @GGML_BLAS@)
 set(GGML_CUDA @GGML_CUDA@)
@@ -11,9 +11,9 @@ set(GGML_ACCELERATE @GGML_ACCELERATE@)
 
 @PACKAGE_INIT@
 
-set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
-set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
-set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
+set_and_check(WHISPER_INCLUDE_DIR "@PACKAGE_WHISPER_INCLUDE_INSTALL_DIR@")
+set_and_check(WHISPER_LIB_DIR "@PACKAGE_WHISPER_LIB_INSTALL_DIR@")
+set_and_check(WHISPER_BIN_DIR "@PACKAGE_WHISPER_BIN_INSTALL_DIR@")
 
 # Ensure transient dependencies satisfied
@@ -43,23 +43,23 @@ if (GGML_HIPBLAS)
     find_package(rocblas REQUIRED)
 endif()
 
-find_library(llama_LIBRARY llama
+find_library(whisper_LIBRARY whisper
     REQUIRED
-    HINTS ${LLAMA_LIB_DIR})
+    HINTS ${WHISPER_LIB_DIR})
 
-set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
-set(_llama_transient_defines "@LLAMA_TRANSIENT_DEFINES@")
+set(_whisper_link_deps "Threads::Threads" "@WHISPER_EXTRA_LIBS@")
+set(_whisper_transient_defines "@WHISPER_TRANSIENT_DEFINES@")
 
-add_library(llama UNKNOWN IMPORTED)
+add_library(whisper UNKNOWN IMPORTED)
 
-set_target_properties(llama
+set_target_properties(whisper
     PROPERTIES
-    INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
-    INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
-    INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
+    INTERFACE_INCLUDE_DIRECTORIES "${WHISPER_INCLUDE_DIR}"
+    INTERFACE_LINK_LIBRARIES "${_whisper_link_deps}"
+    INTERFACE_COMPILE_DEFINITIONS "${_whisper_transient_defines}"
     IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
-    IMPORTED_LOCATION "${llama_LIBRARY}"
+    IMPORTED_LOCATION "${whisper_LIBRARY}"
     INTERFACE_COMPILE_FEATURES cxx_std_11
     POSITION_INDEPENDENT_CODE ON )
 
-check_required_components(Llama)
+check_required_components(whisper)
@@ -1,6 +1,6 @@
 prefix=@CMAKE_INSTALL_PREFIX@
 exec_prefix=${prefix}
-libdir=${exec_prefix}/lib
+libdir=@CMAKE_INSTALL_FULL_LIBDIR@
 includedir=${prefix}/include
 
 Name: whisper
@@ -40,7 +40,7 @@ if (WHISPER_FFMPEG)
     message(STATUS "Found ffmpeg libs: ${FFMPEG_LIBRARIES}")
     message(STATUS "Found ffmpeg headers in: ${FFMPEG_INCLUDE_DIRS}")
     message(STATUS "ffmpeg definitions: ${FFMPEG_DEFINITIONS}")
     message(STATUS "Found avformat ${AVFORMAT_VERSION}")
 
     include_directories(${FFMPEG_INCLUDE_DIRS})
     add_compile_definitions(WHISPER_FFMPEG)
@@ -102,8 +102,8 @@ if (EMSCRIPTEN)
     set_target_properties(libstream PROPERTIES FOLDER "libs")
     add_subdirectory(command.wasm)
     set_target_properties(libcommand PROPERTIES FOLDER "libs")
-    add_subdirectory(talk.wasm)
-    set_target_properties(libtalk PROPERTIES FOLDER "libs")
+    #add_subdirectory(talk.wasm)
+    #set_target_properties(libtalk PROPERTIES FOLDER "libs")
     add_subdirectory(bench.wasm)
     set_target_properties(libbench PROPERTIES FOLDER "libs")
 elseif(CMAKE_JS_VERSION)
@@ -127,8 +127,10 @@ endif (WHISPER_SDL2)
 add_subdirectory(quantize)
 set_target_properties(quantize PROPERTIES FOLDER "examples")
 if (WHISPER_SDL2)
-    add_subdirectory(talk)
-    set_target_properties(talk PROPERTIES FOLDER "examples")
+    # TODO: disabled until update
+    # https://github.com/ggerganov/whisper.cpp/issues/1818
+    #add_subdirectory(talk)
+    #set_target_properties(talk PROPERTIES FOLDER "examples")
     add_subdirectory(talk-llama)
     set_target_properties(talk-llama PROPERTIES FOLDER "examples")
     add_subdirectory(lsp)
@@ -72,6 +72,9 @@ bool ggml_common_quantize_0(
         case GGML_FTYPE_MOSTLY_IQ4_XS:
         case GGML_FTYPE_MOSTLY_IQ1_M:
         case GGML_FTYPE_MOSTLY_BF16:
+        case GGML_FTYPE_MOSTLY_Q4_0_4_4:
+        case GGML_FTYPE_MOSTLY_Q4_0_4_8:
+        case GGML_FTYPE_MOSTLY_Q4_0_8_8:
             {
                 fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
                 return false;
@@ -209,6 +212,11 @@ bool ggml_common_quantize_0(
         case GGML_TYPE_IQ4_XS:
         case GGML_TYPE_IQ1_M:
         case GGML_TYPE_BF16:
+        case GGML_TYPE_Q4_0_4_4:
+        case GGML_TYPE_Q4_0_4_8:
+        case GGML_TYPE_Q4_0_8_8:
+        case GGML_TYPE_TQ1_0:
+        case GGML_TYPE_TQ2_0:
         case GGML_TYPE_COUNT:
             {
                 fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
@@ -147,7 +147,6 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
         case 7: return "He";
         case 8: return "She";
         case 9: return "They";
-        default: return "To";
     }
 
     return "The";
@@ -9,6 +9,7 @@
 #include <thread>
 #include <ctime>
 #include <fstream>
+#include <sstream>
 
 #define COMMON_SAMPLE_RATE 16000
 
@@ -286,12 +287,43 @@ void sam_print_usage(int argc, char ** argv, const sam_params & params);
 // Terminal utils
 //
 
+#define SQR(X) ((X) * (X))
+#define UNCUBE(x) x < 48 ? 0 : x < 115 ? 1 : (x - 35) / 40
+
-// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
-// Lowest is red, middle is yellow, highest is green.
+/**
+ * Quantizes 24-bit RGB to xterm256 code range [16,256).
+ */
+static int rgb2xterm256(int r, int g, int b) {
+    unsigned char cube[] = {0, 0137, 0207, 0257, 0327, 0377};
+    int av, ir, ig, ib, il, qr, qg, qb, ql;
+    av = r * .299 + g * .587 + b * .114 + .5;
+    ql = (il = av > 238 ? 23 : (av - 3) / 10) * 10 + 8;
+    qr = cube[(ir = UNCUBE(r))];
+    qg = cube[(ig = UNCUBE(g))];
+    qb = cube[(ib = UNCUBE(b))];
+    if (SQR(qr - r) + SQR(qg - g) + SQR(qb - b) <=
+        SQR(ql - r) + SQR(ql - g) + SQR(ql - b))
+        return ir * 36 + ig * 6 + ib + 020;
+    return il + 0350;
+}
+
+static std::string set_xterm256_foreground(int r, int g, int b) {
+    int x = rgb2xterm256(r, g, b);
+    std::ostringstream oss;
+    oss << "\033[38;5;" << x << "m";
+    return oss.str();
+}
+
+// Lowest is red, middle is yellow, highest is green. Color scheme from
+// Paul Tol; it is colorblind friendly https://personal.sron.nl/~pault/
 const std::vector<std::string> k_colors = {
-    "\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
-    "\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
+    set_xterm256_foreground(220,   5,  12),
+    set_xterm256_foreground(232,  96,  28),
+    set_xterm256_foreground(241, 147,  45),
+    set_xterm256_foreground(246, 193,  65),
+    set_xterm256_foreground(247, 240,  86),
+    set_xterm256_foreground(144, 201, 135),
+    set_xterm256_foreground( 78, 178, 101),
 };
 
 //
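As a quick illustration of the new helpers above (a sketch, assuming rgb2xterm256, set_xterm256_foreground and k_colors are in scope; the probability-to-bucket mapping here is a simplification made up for the example):

#include <algorithm>
#include <cstdio>

// print a word colored by its confidence p in [0, 1]: low = red, high = green
static void print_colored(const char * word, float p) {
    const int idx = std::min((int) k_colors.size() - 1, (int) (p * (float) k_colors.size()));
    printf("%s%s\033[0m", k_colors[idx].c_str(), word);
}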
examples/dr_wav.h: 4301 lines changed — file diff suppressed because it is too large.
@@ -321,7 +321,7 @@ int ffmpeg_decode_audio(const std::string &ifname, std::vector<uint8_t>& owav_da
         LOG("Couldn't map input file %s\n", ifname.c_str());
         return err;
     }
-    LOG("Mapped input file: %x size: %d\n", ibuf, ibuf_size);
+    LOG("Mapped input file: %s size: %d\n", ibuf, (int) ibuf_size);
     struct audio_buffer inaudio_buf;
     inaudio_buf.ptr = ibuf;
     inaudio_buf.size = ibuf_size;
@@ -48,7 +48,7 @@ if [ -n "$3" ]; then
 fi
 
 # Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" "large-v3-turbo" )
 
 # list available models
 function list_models {
@@ -997,6 +997,7 @@ int main(int argc, char ** argv) {
         if (params.dtw == "large.v1") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V1;
         if (params.dtw == "large.v2") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V2;
         if (params.dtw == "large.v3") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3;
+        if (params.dtw == "large.v3.turbo") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3_TURBO;
 
         if (cparams.dtw_aheads_preset == WHISPER_AHEADS_NONE) {
             fprintf(stderr, "error: unknown DTW preset '%s'\n", params.dtw.c_str());
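For reference, a minimal sketch of enabling the new preset directly through the C API (whisper_context_default_params and whisper_init_from_file_with_params are the usual whisper.h entry points; the model path is a placeholder):

#include "whisper.h"

struct whisper_context * init_turbo_dtw(const char * model_path) {
    struct whisper_context_params cparams = whisper_context_default_params();
    cparams.dtw_token_timestamps = true;                          // enable DTW token timestamps
    cparams.dtw_aheads_preset    = WHISPER_AHEADS_LARGE_V3_TURBO; // preset added above
    return whisper_init_from_file_with_params(model_path, cparams);
}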
@@ -21,7 +21,7 @@ def process_audio(wav_file, model_name="base.en"):
     if not os.path.exists(wav_file):
         raise FileNotFoundError(f"WAV file not found: {wav_file}")
 
-    full_command = f"./main -m {model} -f {wav_file} -np -nt"
+    full_command = f"./main -m {model} -f {wav_file} -nt"
 
     # Execute the command
     process = subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
@@ -34,6 +34,7 @@ struct server_params
     std::string hostname = "127.0.0.1";
     std::string public_path = "examples/server/public";
     std::string request_path = "";
+    std::string inference_path = "/inference";
 
     int32_t port = 8080;
     int32_t read_timeout = 600;
@@ -132,6 +133,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, " --port PORT, [%-7d] Port number for the server\n", sparams.port);
     fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str());
     fprintf(stderr, " --request-path PATH, [%-7s] Request path for all requests\n", sparams.request_path.c_str());
+    fprintf(stderr, " --inference-path PATH, [%-7s] Inference path for all requests\n", sparams.inference_path.c_str());
     fprintf(stderr, " --convert, [%-7s] Convert audio to WAV, requires ffmpeg on the server", sparams.ffmpeg_converter ? "true" : "false");
     fprintf(stderr, "\n");
 }
@@ -182,6 +184,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
     else if ( arg == "--host")           { sparams.hostname = argv[++i]; }
     else if ( arg == "--public")         { sparams.public_path = argv[++i]; }
     else if ( arg == "--request-path")   { sparams.request_path = argv[++i]; }
+    else if ( arg == "--inference-path") { sparams.inference_path = argv[++i]; }
     else if ( arg == "--convert")        { sparams.ffmpeg_converter = true; }
     else {
         fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@@ -216,7 +219,7 @@ void check_ffmpeg_availibility() {
 bool convert_to_wav(const std::string & temp_filename, std::string & error_resp) {
     std::ostringstream cmd_stream;
     std::string converted_filename_temp = temp_filename + "_temp.wav";
-    cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
+    cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -y -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
     std::string cmd = cmd_stream.str();
 
     int status = std::system(cmd.c_str());
@@ -644,10 +647,10 @@ int main(int argc, char ** argv) {
         return false;
     });
 
-    svr.Options(sparams.request_path + "/inference", [&](const Request &, Response &){
+    svr.Options(sparams.request_path + sparams.inference_path, [&](const Request &, Response &){
     });
 
-    svr.Post(sparams.request_path + "/inference", [&](const Request &req, Response &res){
+    svr.Post(sparams.request_path + sparams.inference_path, [&](const Request &req, Response &res){
         // acquire whisper model mutex lock
         std::lock_guard<std::mutex> lock(whisper_mutex);
 
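Illustrative only: the final route is the concatenation of the two options, so with the hypothetical values --request-path "/v1" and --inference-path "/transcribe", the POST handler above ends up registered at:

// sparams.request_path + sparams.inference_path == "/v1" + "/transcribe" == "/v1/transcribe"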
@@ -674,7 +677,8 @@ int main(int argc, char ** argv) {
         if (sparams.ffmpeg_converter) {
             // if file is not wav, convert to wav
             // write to temporary file
-            const std::string temp_filename = "whisper_server_temp_file.wav";
+            const std::string temp_filename_base = std::tmpnam(nullptr);
+            const std::string temp_filename = temp_filename_base + ".wav";
             std::ofstream temp_file{temp_filename, std::ios::binary};
             temp_file << audio_file.content;
             temp_file.close();
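A small sketch of the temp-name scheme introduced here (an assumption-labeled example, not server code: std::tmpnam from <cstdio> returns a name that does not clash with existing files, so concurrent requests no longer race on one shared filename):

#include <cstdio>
#include <string>

static std::string make_temp_wav_name() {
    char buf[L_tmpnam];
    std::tmpnam(buf);                  // implementation-defined unique name
    return std::string(buf) + ".wav";  // same ".wav" suffix trick as the server code above
}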
@@ -1,7 +1,13 @@
 if (WHISPER_SDL2)
     # talk-llama
     set(TARGET talk-llama)
-    add_executable(${TARGET} talk-llama.cpp llama.cpp unicode.cpp unicode-data.cpp)
+    add_executable(${TARGET} talk-llama.cpp
+        llama.cpp
+        llama-vocab.cpp
+        llama-grammar.cpp
+        llama-sampling.cpp
+        unicode.cpp
+        unicode-data.cpp)
     target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
 
     if (WHISPER_CLBLAST)
examples/talk-llama/llama-grammar.cpp: 1138 lines added (new file) — diff suppressed because it is too large.

examples/talk-llama/llama-grammar.h: 144 lines added (new file)
@@ -0,0 +1,144 @@
+#pragma once
+
+#include "llama-impl.h"
+
+#include <map>
+
+struct llama_vocab;
+
+// grammar element type
+enum llama_gretype {
+    // end of rule definition
+    LLAMA_GRETYPE_END            = 0,
+
+    // start of alternate definition for rule
+    LLAMA_GRETYPE_ALT            = 1,
+
+    // non-terminal element: reference to rule
+    LLAMA_GRETYPE_RULE_REF       = 2,
+
+    // terminal element: character (code point)
+    LLAMA_GRETYPE_CHAR           = 3,
+
+    // inverse char(s) ([^a], [^a-b] [^abc])
+    LLAMA_GRETYPE_CHAR_NOT       = 4,
+
+    // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+    // be an inclusive range ([a-z])
+    LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+
+    // modifies a preceding LLAMA_GRETYPE_CHAR or
+    // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+    LLAMA_GRETYPE_CHAR_ALT       = 6,
+
+    // any character (.)
+    LLAMA_GRETYPE_CHAR_ANY       = 7,
+};
+
+typedef struct llama_grammar_element {
+    enum llama_gretype type;
+    uint32_t           value; // Unicode code point or rule ID
+} llama_grammar_element;
+
+struct llama_partial_utf8 {
+    uint32_t value;    // bit value so far (unshifted)
+    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
+};
+
+struct llama_grammar_candidate {
+    size_t             index;
+    const uint32_t   * code_points;
+    llama_partial_utf8 partial_utf8;
+};
+
+using llama_grammar_rule  = std::vector<      llama_grammar_element>;
+using llama_grammar_stack = std::vector<const llama_grammar_element *>;
+
+using llama_grammar_rules      = std::vector<llama_grammar_rule>;
+using llama_grammar_stacks     = std::vector<llama_grammar_stack>;
+using llama_grammar_candidates = std::vector<llama_grammar_candidate>;
+
+const llama_grammar_rules  & llama_grammar_get_rules (const struct llama_grammar * grammar);
+      llama_grammar_stacks & llama_grammar_get_stacks(      struct llama_grammar * grammar);
+
+// takes a set of possible pushdown stacks on a grammar, which are required to
+// be positioned at a character range (see `llama_grammar_advance_stack`), and
+// produces the N possible stacks if the given char is accepted at those
+// positions
+void llama_grammar_accept(
+        const llama_grammar_rules  & rules,
+        const llama_grammar_stacks & stacks,
+                          uint32_t   chr,
+              llama_grammar_stacks & stacks_new);
+
+std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
+        const llama_grammar_rules      & rules,
+        const llama_grammar_stack      & stack,
+        const llama_grammar_candidates & candidates);
+
+struct llama_grammar_parser {
+    std::map<std::string, uint32_t> symbol_ids;
+
+    llama_grammar_rules rules;
+
+    llama_grammar_stack c_rules() const;
+
+    uint32_t get_symbol_id(const char * src, size_t len);
+    uint32_t generate_symbol_id(const std::string & base_name);
+
+    void add_rule(uint32_t rule_id, const llama_grammar_rule & rule);
+
+    const char * parse_alternates(
+            const char        * src,
+            const std::string & rule_name,
+            uint32_t            rule_id,
+            bool                is_nested);
+
+    const char * parse_sequence(
+            const char         * src,
+            const std::string  & rule_name,
+            llama_grammar_rule & rule,
+            bool                 is_nested);
+
+    const char * parse_rule(const char * src);
+
+    bool parse(const char * src);
+    void print(FILE * file);
+};
+
+struct llama_grammar {
+    // note: allow null vocab for testing (not great)
+    const llama_vocab * vocab;
+
+    const llama_grammar_rules  rules;  // TODO: shared ptr
+          llama_grammar_stacks stacks;
+
+    // buffer for partially generated UTF-8 sequence from accepted tokens
+    llama_partial_utf8 partial_utf8;
+};
+
+//
+// internal API
+//
+
+// note: needed for tests (not great)
+struct llama_grammar * llama_grammar_init_impl(
+        const struct llama_vocab * vocab,
+        const llama_grammar_element ** rules,
+        size_t n_rules,
+        size_t start_rule_index);
+
+struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root);
+
+void llama_grammar_free_impl(struct llama_grammar * grammar);
+
+struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar);
+
+// TODO: move the API below as member functions of llama_grammar
+void llama_grammar_apply_impl(
+        const struct llama_grammar & grammar,
+            llama_token_data_array * cur_p);
+
+void llama_grammar_accept_impl(
+              struct llama_grammar & grammar,
+                       llama_token   token);
examples/talk-llama/llama-impl.h
Normal file
181
examples/talk-llama/llama-impl.h
Normal file
@@ -0,0 +1,181 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
|
#ifdef __GNUC__
|
||||||
|
#ifdef __MINGW32__
|
||||||
|
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
|
||||||
|
#else
|
||||||
|
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
#define LLAMA_ATTRIBUTE_FORMAT(...)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//
|
||||||
|
// logging
|
||||||
|
//
|
||||||
|
|
||||||
|
LLAMA_ATTRIBUTE_FORMAT(2, 3)
|
||||||
|
void llama_log_internal (ggml_log_level level, const char * format, ...);
|
||||||
|
void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
|
||||||
|
|
||||||
|
#define LLAMA_LOG(...) llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
|
||||||
|
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
||||||
|
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
||||||
|
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
||||||
|
#define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
|
||||||
|
#define LLAMA_LOG_CONT(...) llama_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
|
||||||
|
|
||||||
|
//
|
||||||
|
// helpers
|
||||||
|
//
|
||||||
|
|
||||||
|
struct time_meas {
|
||||||
|
time_meas(int64_t & t_acc, bool disable = false) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
|
||||||
|
|
||||||
|
~time_meas() {
|
||||||
|
if (t_start_us >= 0) {
|
||||||
|
t_acc += ggml_time_us() - t_start_us;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const int64_t t_start_us;
|
||||||
|
|
||||||
|
int64_t & t_acc;
|
||||||
|
};
|
||||||
|
|
||||||
|
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||||
|
if (search.empty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
std::string builder;
|
||||||
|
builder.reserve(s.length());
|
||||||
|
size_t pos = 0;
|
||||||
|
size_t last_pos = 0;
|
||||||
|
while ((pos = s.find(search, last_pos)) != std::string::npos) {
|
||||||
|
builder.append(s, last_pos, pos - last_pos);
|
||||||
|
builder.append(replace);
|
||||||
|
last_pos = pos + search.length();
|
||||||
|
}
|
||||||
|
builder.append(s, last_pos, std::string::npos);
|
||||||
|
s = std::move(builder);
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
|
||||||
|
struct llama_context * ctx
|
||||||
|
);
|
||||||
|
|
||||||
|
// the ring buffer works similarly to std::deque, but with a fixed capacity
|
||||||
|
template<typename T>
|
||||||
|
struct ring_buffer {
|
||||||
|
ring_buffer(size_t cap) : capacity(cap), data(cap) {}
|
||||||
|
|
||||||
|
T & front() {
|
||||||
|
if (sz == 0) {
|
||||||
|
throw std::runtime_error("ring buffer is empty");
|
||||||
|
}
|
||||||
|
return data[first];
|
||||||
|
}
|
||||||
|
|
||||||
|
const T & front() const {
|
||||||
|
if (sz == 0) {
|
||||||
|
throw std::runtime_error("ring buffer is empty");
|
||||||
|
}
|
||||||
|
return data[first];
|
||||||
|
}
|
||||||
|
|
||||||
|
T & back() {
|
||||||
|
if (sz == 0) {
|
||||||
|
throw std::runtime_error("ring buffer is empty");
|
||||||
|
}
|
||||||
|
return data[pos];
|
||||||
|
}
|
||||||
|
|
||||||
|
const T & back() const {
|
||||||
|
if (sz == 0) {
|
||||||
|
throw std::runtime_error("ring buffer is empty");
|
||||||
|
}
|
||||||
|
return data[pos];
|
||||||
|
}
|
||||||
|
|
||||||
|
void push_back(const T & value) {
|
||||||
|
if (capacity == 0) {
|
||||||
|
throw std::runtime_error("ring buffer: capacity is zero");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (sz == capacity) {
|
||||||
|
// advance the start when buffer is full
|
||||||
|
first = (first + 1) % capacity;
|
||||||
|
} else {
|
||||||
|
sz++;
|
||||||
|
}
|
||||||
|
data[pos] = value;
|
||||||
|
pos = (pos + 1) % capacity;
|
||||||
|
}
|
||||||
|
|
||||||
|
T pop_front() {
|
||||||
|
if (sz == 0) {
|
||||||
|
throw std::runtime_error("ring buffer is empty");
|
||||||
|
}
|
||||||
|
T value = data[first];
|
||||||
|
first = (first + 1) % capacity;
|
||||||
|
sz--;
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
|
||||||
|
//T & operator[](size_t i) {
|
||||||
|
// if (i >= sz) {
|
||||||
|
// throw std::runtime_error("ring buffer: index out of bounds");
|
||||||
|
// }
|
||||||
|
// return data[(first + i) % capacity];
|
||||||
|
//}
|
||||||
|
|
||||||
|
//const T & at(size_t i) const {
|
||||||
|
// if (i >= sz) {
|
||||||
|
// throw std::runtime_error("ring buffer: index out of bounds");
|
||||||
|
// }
|
||||||
|
// return data[(first + i) % capacity];
|
||||||
|
//}
|
||||||
|
|
||||||
|
const T & rat(size_t i) const {
|
||||||
|
if (i >= sz) {
|
||||||
|
throw std::runtime_error("ring buffer: index out of bounds");
|
||||||
|
}
|
||||||
|
return data[(first + sz - i - 1) % capacity];
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<T> to_vector() const {
|
||||||
|
std::vector<T> result;
|
||||||
|
result.reserve(sz);
|
||||||
|
for (size_t i = 0; i < sz; i++) {
|
||||||
|
result.push_back(data[(first + i) % capacity]);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
void clear() {
|
||||||
|
// here only reset the status of the buffer
|
||||||
|
sz = 0;
|
||||||
|
first = 0;
|
||||||
|
pos = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool empty() const {
|
||||||
|
return sz == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t size() const {
|
||||||
|
return sz;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t capacity = 0;
|
||||||
|
size_t sz = 0;
|
||||||
|
size_t first = 0;
|
||||||
|
size_t pos = 0;
|
||||||
|
std::vector<T> data;
|
||||||
|
};
|
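A minimal usage sketch (not part of the diff) of the fixed-capacity ring_buffer above: once full, push_back evicts the oldest element, and rat(i) indexes from the newest backwards:

#include <cassert>
#include <vector>

static void ring_buffer_demo() {
    ring_buffer<int> rb(3);
    rb.push_back(1);
    rb.push_back(2);
    rb.push_back(3);
    rb.push_back(4);                 // full: overwrites 1, buffer now holds {2, 3, 4}
    assert(rb.front() == 2);         // oldest surviving element
    assert(rb.rat(0) == 4);          // reverse access: newest first
    assert(rb.to_vector() == (std::vector<int>{2, 3, 4}));
}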
examples/talk-llama/llama-sampling.cpp: 1707 lines added (new file) — diff suppressed because it is too large.

examples/talk-llama/llama-sampling.h: 29 lines added (new file)
@@ -0,0 +1,29 @@
+#pragma once
+
+// TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ?
+
+#include "llama-grammar.h"
+
+#include <unordered_map>
+
+struct llama_vocab;
+struct llama_grammar;
+
+// sampler chain
+
+struct llama_sampler_chain {
+    llama_sampler_chain_params params;
+
+    std::vector<struct llama_sampler *> samplers;
+
+    // timing
+
+    mutable int64_t t_sample_us;
+
+    mutable int32_t n_sample;
+};
+
+struct llama_sampler * llama_sampler_init_grammar_impl(
+        const struct llama_vocab & vocab,
+                      const char * grammar_str,
+                      const char * grammar_root);
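A hedged sketch of how the chain is meant to be driven from the public API; llama_sampler_chain_init, llama_sampler_chain_add and llama_sampler_init_greedy come from llama.h in the same sampling refactor, so treat their availability here as an assumption:

#include "llama.h"

static struct llama_sampler * make_greedy_chain() {
    struct llama_sampler_chain_params sparams = llama_sampler_chain_default_params();
    sparams.no_perf = false; // keep the t_sample_us / n_sample counters above
    struct llama_sampler * chain = llama_sampler_chain_init(sparams);
    llama_sampler_chain_add(chain, llama_sampler_init_greedy());
    return chain; // release with llama_sampler_free(chain)
}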
examples/talk-llama/llama-vocab.cpp: 1944 lines added (new file) — diff suppressed because it is too large.

examples/talk-llama/llama-vocab.h: 146 lines added (new file)
@@ -0,0 +1,146 @@
+#pragma once
+
+#include "llama-impl.h"
+
+#include <string>
+#include <vector>
+#include <unordered_map>
+#include <map>
+#include <set>
+
+struct llm_tokenizer;
+
+struct llama_vocab {
+    using id    = llama_token;
+    using token = std::string;
+    using tattr = llama_token_attr;
+
+    struct token_data {
+        token text;
+        float score;
+        tattr attr;
+    };
+
+    uint32_t n_vocab = 0; // TODO: not great because has to keep in sync with hparams.n_vocab
+
+    enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
+    enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
+
+    int max_token_len = 0; // used for optimizing longest token search
+
+    std::unordered_map<token, id> token_to_id;
+    std::vector<token_data>       id_to_token;
+
+    std::vector<id>    cache_special_tokens;
+    std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
+
+    std::map<std::pair<std::string, std::string>, int> bpe_ranks;
+
+    // default LLaMA special tokens
+    id special_bos_id  = 1;
+    id special_eos_id  = 2;
+    id special_unk_id  = 0;
+    id special_sep_id  = -1;
+    id special_pad_id  = -1;
+    id special_cls_id  = -1;
+    id special_mask_id = -1;
+
+    id linefeed_id = 13;
+    id special_prefix_id = -1;
+    id special_suffix_id = -1;
+    id special_middle_id = -1;
+    id special_eot_id    = -1; // TODO: move above after "eos_id", and here add "file separator" token
+    id special_eom_id    = -1;
+
+    // set of all tokens that cause "end of generation"
+    std::set<id> special_eog_ids;
+
+    // tokenizer flags
+    bool tokenizer_add_space_prefix           = false;
+    bool tokenizer_add_bos                    = false;
+    bool tokenizer_add_eos                    = false;
+    bool tokenizer_ignore_merges              = false;
+    bool tokenizer_clean_spaces               = false; // clean_up_tokenization_spaces
+    bool tokenizer_remove_extra_whitespaces   = false;
+    bool tokenizer_escape_whitespaces         = true;
+    bool tokenizer_treat_whitespace_as_suffix = false;
+
+    std::vector<char> precompiled_charsmap;
+
+    llm_tokenizer * tokenizer = nullptr;
+
+    llama_vocab() = default;
+    ~llama_vocab();
+
+    int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
+
+    void init_tokenizer();
+};
+
+//
+// internal API
+//
+
+// TODO: rename to llama_tokenize_impl
+// TODO: This should probably be in llama.h
+std::vector<llama_vocab::id> llama_tokenize_internal(
+        const llama_vocab & vocab,
+        std::string raw_text,
+        bool add_special,
+        bool parse_special = false);
+
+// TODO: move the API below as member functions of llama_vocab
+llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch);
+
+const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token);
+
+float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token);
+
+llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token);
+
+bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token);
+
+bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token);
+
+llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
+llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
+llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
+llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
+llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
+llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
+
+bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
+bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
+
+llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
+llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
+llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
+llama_token llama_token_eot_impl   (const struct llama_vocab & vocab);
+llama_token llama_token_eom_impl   (const struct llama_vocab & vocab);
+
+int32_t llama_tokenize_impl(
+        const struct llama_vocab & vocab,
+                      const char * text,
+                         int32_t   text_len,
+                     llama_token * tokens,
+                         int32_t   n_tokens_max,
+                            bool   add_special,
+                            bool   parse_special);
+
+// does not write null-terminator to buf
+int32_t llama_token_to_piece_impl(
+        const struct llama_vocab & vocab,
+                     llama_token   token,
+                            char * buf,
+                         int32_t   length,
+                         int32_t   lstrip,
+                            bool   special);
+
+int32_t llama_detokenize_impl(
+        const struct llama_vocab & vocab,
+               const llama_token * tokens,
+                         int32_t   n_tokens,
+                            char * text,
+                         int32_t   text_len_max,
+                            bool   remove_special,
+                            bool   unparse_special);
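An illustrative sketch (not from the diff) of the two lookup tables above, mapping a piece of text to its id and back:

#include "llama-vocab.h"

static llama_vocab::id lookup_token(const llama_vocab & vocab, const std::string & piece) {
    auto it = vocab.token_to_id.find(piece);
    return it == vocab.token_to_id.end() ? vocab.special_unk_id // fall back to <unk>
                                         : it->second;
}

static const std::string & token_text(const llama_vocab & vocab, llama_vocab::id id) {
    return vocab.id_to_token.at(id).text; // token_data { text, score, attr }
}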
File diff suppressed because it is too large.
@@ -33,17 +33,18 @@
 
 #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 
-#define LLAMA_MAX_RNG_STATE (64*1024)
+// TODO: use everywhere in the implementation
+#define LLAMA_TOKEN_NULL -1
 
 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 #define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
 
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 6
+#define LLAMA_SESSION_VERSION 9
 
 #define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
-#define LLAMA_STATE_SEQ_VERSION 1
+#define LLAMA_STATE_SEQ_VERSION 2
 
 #ifdef __cplusplus
 extern "C" {
@@ -55,8 +56,10 @@ extern "C" {
     // TODO: show sample usage
     //
 
+    // struct llama_vocab; // TODO: add in the future
     struct llama_model;
     struct llama_context;
+    struct llama_sampler;
 
     typedef int32_t llama_pos;
     typedef int32_t llama_token;
@@ -67,6 +70,8 @@ extern "C" {
         LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
         LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
         LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
+        LLAMA_VOCAB_TYPE_UGM  = 4, // T5 tokenizer based on Unigram
+        LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
     };
 
     // pre-tokenization types
@@ -87,15 +92,23 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_DBRX         = 13,
         LLAMA_VOCAB_PRE_TYPE_SMAUG        = 14,
         LLAMA_VOCAB_PRE_TYPE_PORO         = 15,
+        LLAMA_VOCAB_PRE_TYPE_CHATGLM3     = 16,
+        LLAMA_VOCAB_PRE_TYPE_CHATGLM4     = 17,
+        LLAMA_VOCAB_PRE_TYPE_VIKING       = 18,
+        LLAMA_VOCAB_PRE_TYPE_JAIS         = 19,
+        LLAMA_VOCAB_PRE_TYPE_TEKKEN       = 20,
+        LLAMA_VOCAB_PRE_TYPE_SMOLLM       = 21,
+        LLAMA_VOCAB_PRE_TYPE_CODESHELL    = 22,
+        LLAMA_VOCAB_PRE_TYPE_BLOOM        = 23,
+        LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
+        LLAMA_VOCAB_PRE_TYPE_EXAONE       = 25,
+        LLAMA_VOCAB_PRE_TYPE_CHAMELEON    = 26,
     };
 
-    // note: these values should be synchronized with ggml_rope
-    // TODO: maybe move this enum to ggml.h (ggml_rope_type)
     enum llama_rope_type {
         LLAMA_ROPE_TYPE_NONE = -1,
         LLAMA_ROPE_TYPE_NORM = 0,
-        LLAMA_ROPE_TYPE_NEOX = 2,
-        LLAMA_ROPE_TYPE_GLM  = 4,
+        LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
     };
 
     enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
@@ -128,7 +141,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_F16  = 1, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        // LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
         // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
         // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
         LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
@@ -157,6 +170,11 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_M  = 31, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_BF16   = 32, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_TQ1_0    = 36, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_TQ2_0    = 37, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -175,14 +193,22 @@ extern "C" {
         LLAMA_POOLING_TYPE_MEAN = 1,
         LLAMA_POOLING_TYPE_CLS  = 2,
         LLAMA_POOLING_TYPE_LAST = 3,
+        LLAMA_POOLING_TYPE_RANK = 4, // used by reranking models to attach the classification head to the graph
+    };
+
+    enum llama_attention_type {
+        LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1,
+        LLAMA_ATTENTION_TYPE_CAUSAL      = 0,
+        LLAMA_ATTENTION_TYPE_NON_CAUSAL  = 1,
     };
 
     enum llama_split_mode {
         LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
         LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
         LLAMA_SPLIT_MODE_ROW   = 2, // split rows across GPUs
     };
 
+    // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
     typedef struct llama_token_data {
         llama_token id;    // token id
         float logit;       // log-odds of the token
@@ -190,8 +216,10 @@ extern "C" {
     } llama_token_data;
 
     typedef struct llama_token_data_array {
+        // TODO: consider SoA
         llama_token_data * data;
         size_t size;
+        int64_t selected; // this is the index in the data array (i.e. not the token id)
         bool sorted;
     } llama_token_data_array;
 
@@ -252,9 +280,9 @@ extern "C" {
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
 
         // main_gpu interpretation depends on split_mode:
-        // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
-        // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
-        // LLAMA_SPLIT_LAYER: ignored
+        // LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model
+        // LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results
+        // LLAMA_SPLIT_MODE_LAYER: ignored
         int32_t main_gpu;
 
         // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
@@ -284,16 +312,16 @@ extern "C" {
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
     // https://github.com/ggerganov/llama.cpp/pull/7544
     struct llama_context_params {
-        uint32_t seed;              // RNG seed, -1 for random
         uint32_t n_ctx;             // text context, 0 = from model
         uint32_t n_batch;           // logical maximum batch size that can be submitted to llama_decode
         uint32_t n_ubatch;          // physical maximum batch size
         uint32_t n_seq_max;         // max number of sequences (i.e. distinct states for recurrent models)
-        uint32_t n_threads;         // number of threads to use for generation
-        uint32_t n_threads_batch;   // number of threads to use for batch processing
+        int32_t  n_threads;         // number of threads to use for generation
+        int32_t  n_threads_batch;   // number of threads to use for batch processing
 
         enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
         enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
+        enum llama_attention_type    attention_type;    // attention type to use for embeddings
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float rope_freq_base;   // RoPE base frequency, 0 = from model
@@ -311,11 +339,13 @@ extern "C" {
         enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
         enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
 
-        // Keep the booleans together to avoid misalignment during copy-by-value.
+        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
+        // TODO: move at the end of the struct
         bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
+        bool no_perf;     // whether to measure performance timings
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
@@ -329,7 +359,7 @@ extern "C" {
         int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
         enum llama_ftype ftype;              // quantize to this llama_ftype
         enum ggml_type output_tensor_type;   // output tensor type
-        enum ggml_type token_embedding_type; // itoken embeddings tensor type
+        enum ggml_type token_embedding_type; // token embeddings tensor type
         bool allow_requantize;               // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor;         // quantize output.weight
         bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
@@ -339,56 +369,14 @@ extern "C" {
         void * kv_overrides; // pointer to vector containing overrides
     } llama_model_quantize_params;
 
-    // grammar types
-    struct llama_grammar;
-
-    // grammar element type
-    enum llama_gretype {
-        // end of rule definition
-        LLAMA_GRETYPE_END            = 0,
-
-        // start of alternate definition for rule
-        LLAMA_GRETYPE_ALT            = 1,
-
-        // non-terminal element: reference to rule
-        LLAMA_GRETYPE_RULE_REF       = 2,
-
-        // terminal element: character (code point)
-        LLAMA_GRETYPE_CHAR           = 3,
-
-        // inverse char(s) ([^a], [^a-b] [^abc])
-        LLAMA_GRETYPE_CHAR_NOT       = 4,
-
-        // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
-        // be an inclusive range ([a-z])
-        LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
-
-        // modifies a preceding LLAMA_GRETYPE_CHAR or
-        // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
-        LLAMA_GRETYPE_CHAR_ALT       = 6,
-
-        // any character (.)
-        LLAMA_GRETYPE_CHAR_ANY       = 7,
-    };
-
-    typedef struct llama_grammar_element {
-        enum llama_gretype type;
-        uint32_t           value; // Unicode code point or rule ID
-    } llama_grammar_element;
-
-    // performance timing information
-    struct llama_timings {
-        double t_start_ms;
-        double t_end_ms;
-        double t_load_ms;
-        double t_sample_ms;
-        double t_p_eval_ms;
-        double t_eval_ms;
-
-        int32_t n_sample;
-        int32_t n_p_eval;
-        int32_t n_eval;
-    };
+    typedef struct llama_logit_bias {
+        llama_token token;
+        float bias;
+    } llama_logit_bias;
+
+    typedef struct llama_sampler_chain_params {
+        bool no_perf; // whether to measure performance timings
+    } llama_sampler_chain_params;
 
     // used in chat template
     typedef struct llama_chat_message {
@@ -396,9 +384,14 @@ extern "C" {
         const char * content;
     } llama_chat_message;
 
+    // lora adapter
+    struct llama_lora_adapter;
+
     // Helpers for getting default parameters
-    LLAMA_API struct llama_model_params llama_model_default_params(void);
-    LLAMA_API struct llama_context_params llama_context_default_params(void);
+    // TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
+    LLAMA_API struct llama_model_params          llama_model_default_params(void);
+    LLAMA_API struct llama_context_params        llama_context_default_params(void);
+    LLAMA_API struct llama_sampler_chain_params  llama_sampler_chain_default_params(void);
     LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
 
     // Initialize the llama + ggml backend
@@ -409,15 +402,23 @@ extern "C" {
     //optional:
     LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
 
+    // Optional: an auto threadpool gets created in ggml if not passed explicitly
+    LLAMA_API void llama_attach_threadpool(
+            struct llama_context * ctx,
+               ggml_threadpool_t   threadpool,
+               ggml_threadpool_t   threadpool_batch);
+    LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
+
     // Call once at the end of the program - currently only used for MPI
     LLAMA_API void llama_backend_free(void);
 
     LLAMA_API struct llama_model * llama_load_model_from_file(
                              const char * path_model,
               struct llama_model_params   params);
 
     LLAMA_API void llama_free_model(struct llama_model * model);
 
+    // TODO: rename to llama_init_from_model
     LLAMA_API struct llama_context * llama_new_context_with_model(
                      struct llama_model * model,
             struct llama_context_params   params);
@@ -433,22 +434,22 @@ extern "C" {
     LLAMA_API bool llama_supports_mlock      (void);
     LLAMA_API bool llama_supports_gpu_offload(void);
 
-    LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
-
     LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);
 
-    LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
-
-    LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
-    LLAMA_API enum llama_rope_type  llama_rope_type  (const struct llama_model * model);
-
     LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
     LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
     LLAMA_API int32_t llama_n_layer    (const struct llama_model * model);
+    LLAMA_API int32_t llama_n_head     (const struct llama_model * model);
+
+    LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
+
+    LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
+    LLAMA_API enum llama_vocab_type   llama_vocab_type  (const struct llama_model * model);
+    LLAMA_API enum llama_rope_type    llama_rope_type   (const struct llama_model * model);
 
     // Get the model's RoPE frequency scaling factor
     LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
@@ -482,24 +483,51 @@ extern "C" {
     // Get a llama model tensor
     LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
 
+    // Returns true if the model contains an encoder that requires llama_encode() call
+    LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
+
+    // Returns true if the model contains a decoder that requires llama_decode() call
+    LLAMA_API bool llama_model_has_decoder(const struct llama_model * model);
+
+    // For encoder-decoder models, this function returns the id of the token that must be provided
+    // to the decoder to start generating the output sequence. For other models, it returns -1.
+    LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model);
+
+    // Returns true if the model is recurrent (like Mamba, RWKV, etc.)
+    LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);
+
     // Returns 0 on success
     LLAMA_API uint32_t llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
             const llama_model_quantize_params * params);
 
-    // Apply a LoRA adapter to a loaded model
-    // path_base_model is the path to a higher quality model to use as a base for
-    // the layers modified by the adapter. Can be NULL to use the current loaded model.
-    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
-    // will be applied on top of the previous one
-    // Returns 0 on success
-    LLAMA_API int32_t llama_model_apply_lora_from_file(
-            const struct llama_model * model,
-            const char * path_lora,
-            float scale,
-            const char * path_base_model,
-            int32_t n_threads);
+    // Load a LoRA adapter from file
+    // The loaded adapter will be associated to the given model, and will be freed when the model is deleted
+    LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
+            struct llama_model * model,
+            const char * path_lora);
+
+    // Add a loaded LoRA adapter to given context
+    // This will not modify the model's weights
+    LLAMA_API int32_t llama_lora_adapter_set(
+            struct llama_context * ctx,
+            struct llama_lora_adapter * adapter,
+            float scale);
+
+    // Remove a specific LoRA adapter from given context
+    // Return -1 if the adapter is not present in the context
+    LLAMA_API int32_t llama_lora_adapter_remove(
+            struct llama_context * ctx,
+            struct llama_lora_adapter * adapter);
+
+    // Remove all LoRA adapters from given context
+    LLAMA_API void llama_lora_adapter_clear(
+            struct llama_context * ctx);
+
+    // Manually free a LoRA adapter
+    // Note: loaded adapters will be freed when the associated model is deleted
+    LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
 
     // Apply a loaded control vector to a llama_context, or if data is NULL, clear
     // the currently loaded vector.
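
A minimal sketch of how the new adapter calls compose (the adapter path and scale are illustrative; error handling elided):

    // load a LoRA adapter once; it is owned by the model and freed with it
    struct llama_lora_adapter * adapter = llama_lora_adapter_init(model, "adapter.gguf"); // hypothetical file
    if (adapter != NULL) {
        llama_lora_adapter_set(ctx, adapter, 0.75f);  // attach to a context with a scale
        // ... run llama_decode() as usual ...
        llama_lora_adapter_remove(ctx, adapter);      // detach; returns -1 if it was not attached
    }
    // no llama_lora_adapter_free() needed here: loaded adapters are freed with the model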
@@ -649,10 +677,11 @@ extern "C" {
     // State / sessions
     //
 
-    // Returns the maximum size in bytes of the state (rng, logits, embedding
-    // and kv_cache) - will often be smaller after compacting tokens
-    LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx);
-    LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx),
+    // Returns the *actual* size in bytes of the state
+    // (logits, embedding and kv_cache)
+    // Only use when saving the state, not when restoring it, otherwise the size may be too small.
+    LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
+    LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
         "use llama_state_get_size instead");
 
     // Copies the state to the specified destination address.
@@ -660,7 +689,8 @@ extern "C" {
     // Returns the number of bytes copied
     LLAMA_API size_t llama_state_get_data(
             struct llama_context * ctx,
-            uint8_t * dst);
+            uint8_t * dst,
+            size_t size);
     LLAMA_API DEPRECATED(size_t llama_copy_state_data(
             struct llama_context * ctx,
             uint8_t * dst),
@@ -670,7 +700,8 @@ extern "C" {
     // Returns the number of bytes read
     LLAMA_API size_t llama_state_set_data(
             struct llama_context * ctx,
-            const uint8_t * src);
+            const uint8_t * src,
+            size_t size);
     LLAMA_API DEPRECATED(size_t llama_set_state_data(
             struct llama_context * ctx,
             const uint8_t * src),
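
With the added size argument the copy is bounds-checked; a sketch of the intended save/restore round trip (the buffer management here is an assumption of the example):

    // snapshot the context state and restore it later
    const size_t n_state = llama_state_get_size(ctx);          // query while saving, not restoring
    std::vector<uint8_t> state_buf(n_state);
    const size_t n_written = llama_state_get_data(ctx, state_buf.data(), state_buf.size());
    // ... later, on a context with the same model and settings ...
    const size_t n_read = llama_state_set_data(ctx, state_buf.data(), n_written);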
@@ -712,6 +743,7 @@ extern "C" {
     LLAMA_API size_t llama_state_seq_get_data(
             struct llama_context * ctx,
             uint8_t * dst,
+            size_t size,
             llama_seq_id seq_id);
 
     // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
@@ -721,6 +753,7 @@ extern "C" {
     LLAMA_API size_t llama_state_seq_set_data(
             struct llama_context * ctx,
             const uint8_t * src,
+            size_t size,
             llama_seq_id dest_seq_id);
 
     LLAMA_API size_t llama_state_seq_save_file(
@@ -767,6 +800,14 @@ extern "C" {
     // Frees a batch of tokens allocated with llama_batch_init()
     LLAMA_API void llama_batch_free(struct llama_batch batch);
 
+    // Processes a batch of tokens with the encoder part of the encoder-decoder model.
+    // Stores the encoder output internally for later use by the decoder cross-attention layers.
+    //   0 - success
+    // < 0 - error
+    LLAMA_API int32_t llama_encode(
+            struct llama_context * ctx,
+            struct llama_batch batch);
+
     // Positive return values do not mean a fatal error, but rather a warning.
     //   0 - success
     //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
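
For encoder-decoder models, llama_encode() runs once on the input before the usual decode loop; a sketch assuming `batch` already holds the encoder input tokens:

    if (llama_model_has_encoder(model)) {
        if (llama_encode(ctx, batch) < 0) {
            // handle error
        }
        // the decoder is then seeded with the model's decoder start token
        llama_token dec_start = llama_model_decoder_start_token(model); // -1 for non-encoder-decoder models
        // build a new batch containing dec_start and call llama_decode() as usual
    }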
@@ -778,13 +819,13 @@ extern "C" {
     // Set the number of threads used for decoding
     // n_threads is the number of threads used for generation (single token)
     // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
-    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
+    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch);
 
     // Get the number of threads used for generation of a single token.
-    LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
+    LLAMA_API int32_t llama_n_threads(struct llama_context * ctx);
 
     // Get the number of threads used for prompt and batch processing (multiple tokens).
-    LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
+    LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
 
     // Set whether the model is in embeddings mode or not
     // If true, embeddings will be returned but logits will not
@@ -832,7 +873,8 @@ extern "C" {
 
     // Get the embeddings for a sequence id
     // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
-    // shape: [n_embd] (1-dimensional)
+    // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence
+    // otherwise: float[n_embd] (1-dimensional)
     LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
 
     //
@@ -857,12 +899,10 @@ extern "C" {
     LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
     LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
     LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
+    LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
 
-    // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
-
-    // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
+    LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
+    LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
 
     // Codellama infill tokens
     LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
@@ -873,11 +913,14 @@ extern "C" {
     //
     // Tokenization
     //
+    // The API is thread-safe.
+    //
 
     /// @details Convert the provided text into tokens.
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
     /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
+    /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
     /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
     ///                      as plaintext. Does not insert a leading space.
     LLAMA_API int32_t llama_tokenize(
@@ -892,15 +935,35 @@ extern "C" {
     // Token Id -> Piece.
     // Uses the vocabulary in the provided context.
     // Does not write null terminator to the buffer.
-    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    // User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
     // @param special If true, special tokens are rendered in the output.
     LLAMA_API int32_t llama_token_to_piece(
             const struct llama_model * model,
             llama_token token,
             char * buf,
             int32_t length,
+            int32_t lstrip,
             bool special);
 
+    /// @details Convert the provided tokens into text (inverse of llama_tokenize()).
+    /// @param text The char pointer must be large enough to hold the resulting text.
+    /// @return Returns the number of chars/bytes on success, no more than text_len_max.
+    /// @return Returns a negative number on failure - the number of chars/bytes that would have been returned.
+    /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
+    /// @param unparse_special If true, special tokens are rendered in the output.
+    LLAMA_API int32_t llama_detokenize(
+            const struct llama_model * model,
+            const llama_token * tokens,
+            int32_t n_tokens,
+            char * text,
+            int32_t text_len_max,
+            bool remove_special,
+            bool unparse_special);
+
+    //
+    // Chat templates
+    //
+
     /// Apply chat template. Inspired by hf apply_chat_template() on python.
     /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
     /// NOTE: This function does not use a jinja parser. It only supports a pre-defined list of templates. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
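
A sketch of the tokenize/detokenize round trip using the shared negative-return convention (the `text`/`text_len` inputs and buffer sizes are illustrative):

    std::vector<llama_token> toks(64);
    int32_t n = llama_tokenize(model, text, text_len, toks.data(), (int32_t) toks.size(),
                               /*add_special*/ true, /*parse_special*/ false);
    if (n < 0) {
        toks.resize(-n); // the negative return is the required token count
        n = llama_tokenize(model, text, text_len, toks.data(), (int32_t) toks.size(), true, false);
    }

    std::vector<char> out(256);
    int32_t m = llama_detokenize(model, toks.data(), n, out.data(), (int32_t) out.size(),
                                 /*remove_special*/ false, /*unparse_special*/ false);
    if (m < 0) {
        out.resize(-m); // same convention for the char buffer
        m = llama_detokenize(model, toks.data(), n, out.data(), (int32_t) out.size(), false, false);
    }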
@@ -921,104 +984,114 @@ extern "C" {
             int32_t length);
 
     //
-    // Grammar
+    // Sampling API
+    //
+    // Sample usage:
+    //
+    //    // prepare the sampling chain at the start
+    //    auto sparams = llama_sampler_chain_default_params();
+    //
+    //    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+    //
+    //    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(50));
+    //    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1));
+    //    llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.8));
+    //
+    //    // typically, the chain should end with a sampler such as "greedy", "dist" or "mirostat"
+    //    // this sampler will be responsible to select the actual token
+    //    llama_sampler_chain_add(smpl, llama_sampler_init_dist(seed));
+    //
+    //    ...
+    //
+    //    // decoding loop:
+    //    while (...) {
+    //        ...
+    //
+    //        llama_decode(ctx, batch);
+    //
+    //        // sample from the logits of the last token in the batch
+    //        const llama_token id = llama_sampler_sample(smpl, ctx, -1);
+    //
+    //        // accepting the token updates the internal state of certain samplers (e.g. grammar, repetition, etc.)
+    //        llama_sampler_accept(smpl, id);
+    //        ...
+    //    }
+    //
+    //    llama_sampler_free(smpl);
+    //
+    // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
+    // TODO: in the future, the entire sampling API that uses llama_model should start using llama_vocab
     //
 
-    LLAMA_API struct llama_grammar * llama_grammar_init(
-            const llama_grammar_element ** rules,
-            size_t n_rules,
-            size_t start_rule_index);
+    typedef void * llama_sampler_context_t;
 
-    LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
+    // user code can implement the interface below in order to create custom llama_sampler
+    struct llama_sampler_i {
+        const char *           (*name)  (const struct llama_sampler * smpl);                                 // can be NULL
+        void                   (*accept)(      struct llama_sampler * smpl, llama_token token);              // can be NULL
+        void                   (*apply) (      struct llama_sampler * smpl, llama_token_data_array * cur_p); // required
+        void                   (*reset) (      struct llama_sampler * smpl);                                 // can be NULL
+        struct llama_sampler * (*clone) (const struct llama_sampler * smpl);                                 // can be NULL if ctx is NULL
+        void                   (*free)  (      struct llama_sampler * smpl);                                 // can be NULL if ctx is NULL
 
-    LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
+        // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph
+        //void (*apply_ggml) (struct llama_sampler * smpl, ...);
+    };
 
-    //
-    // Sampling functions
-    //
+    struct llama_sampler {
+        struct llama_sampler_i  * iface;
+        llama_sampler_context_t   ctx;
+    };
 
-    // Sets the current rng seed.
-    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
+    // mirror of llama_sampler_i:
+    LLAMA_API const char *           llama_sampler_name  (const struct llama_sampler * smpl);
+    LLAMA_API void                   llama_sampler_accept(      struct llama_sampler * smpl, llama_token token);
+    LLAMA_API void                   llama_sampler_apply (      struct llama_sampler * smpl, llama_token_data_array * cur_p);
+    LLAMA_API void                   llama_sampler_reset (      struct llama_sampler * smpl);
+    LLAMA_API struct llama_sampler * llama_sampler_clone (const struct llama_sampler * smpl);
+    // important: do not free if the sampler has been added to a llama_sampler_chain (via llama_sampler_chain_add)
+    LLAMA_API void                   llama_sampler_free  (      struct llama_sampler * smpl);
 
-    /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
-    LLAMA_API void llama_sample_repetition_penalties(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            const llama_token * last_tokens,
-            size_t penalty_last_n,
-            float penalty_repeat,
-            float penalty_freq,
-            float penalty_present);
+    // llama_sampler_chain
+    // a type of llama_sampler that can chain multiple samplers one after another
 
-    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
-    /// @param logits Logits extracted from the original generation context.
-    /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
-    /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-    LLAMA_API void llama_sample_apply_guidance(
-            struct llama_context * ctx,
-            float * logits,
-            float * logits_guidance,
-            float scale);
+    LLAMA_API struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params);
+
+    // important: takes ownership of the sampler object and will free it when llama_sampler_free is called
+    LLAMA_API void                   llama_sampler_chain_add(      struct llama_sampler * chain, struct llama_sampler * smpl);
+    LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
+    LLAMA_API int                    llama_sampler_chain_n  (const struct llama_sampler * chain);
+
+    // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
+    LLAMA_API struct llama_sampler * llama_sampler_chain_remove(   struct llama_sampler * chain, int32_t i);
+
+    // available samplers:
+
+    LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
+    LLAMA_API struct llama_sampler * llama_sampler_init_dist   (uint32_t seed);
 
     /// @details Sorts candidate tokens by their logits in descending order and calculates probabilities based on logits.
-    LLAMA_API void llama_sample_softmax(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates);
+    /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
+    LLAMA_API struct llama_sampler * llama_sampler_init_softmax   (void);
 
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_k(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            int32_t k,
-            size_t min_keep);
+    LLAMA_API struct llama_sampler * llama_sampler_init_top_k     (int32_t k);
 
     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API void llama_sample_top_p(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            float p,
-            size_t min_keep);
+    LLAMA_API struct llama_sampler * llama_sampler_init_top_p     (float p, size_t min_keep);
 
     /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
-    LLAMA_API void llama_sample_min_p(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            float p,
-            size_t min_keep);
+    LLAMA_API struct llama_sampler * llama_sampler_init_min_p     (float p, size_t min_keep);
 
     /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API void llama_sample_tail_free(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            float z,
-            size_t min_keep);
+    LLAMA_API struct llama_sampler * llama_sampler_init_tail_free (float z, size_t min_keep);
 
     /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-    LLAMA_API void llama_sample_typical(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            float p,
-            size_t min_keep);
+    LLAMA_API struct llama_sampler * llama_sampler_init_typical   (float p, size_t min_keep);
+    LLAMA_API struct llama_sampler * llama_sampler_init_temp      (float t);
 
-    /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
-    LLAMA_API void llama_sample_entropy(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates_p,
-            float min_temp,
-            float max_temp,
-            float exponent_val);
-
-    LLAMA_API void llama_sample_temp(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            float temp);
-
-    /// @details Apply constraints from grammar
-    LLAMA_API void llama_sample_grammar(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            const struct llama_grammar * grammar);
+    /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
+    LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext  (float t, float delta, float exponent);
 
     /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
@@ -1026,42 +1099,62 @@ extern "C" {
     /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
     /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
     /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API llama_token llama_sample_token_mirostat(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
+    LLAMA_API struct llama_sampler * llama_sampler_init_mirostat(
+            int32_t n_vocab,
+            uint32_t seed,
             float tau,
             float eta,
-            int32_t m,
-            float * mu);
+            int32_t m);
 
     /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
     /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
     /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
     /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API llama_token llama_sample_token_mirostat_v2(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates,
-            float tau,
-            float eta,
-            float * mu);
+    LLAMA_API struct llama_sampler * llama_sampler_init_mirostat_v2(
+            uint32_t seed,
+            float tau,
+            float eta);
 
-    /// @details Selects the token with the highest probability.
-    ///          Does not compute the token probabilities. Use llama_sample_softmax() instead.
-    LLAMA_API llama_token llama_sample_token_greedy(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates);
+    LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
+            const struct llama_model * model,
+            const char * grammar_str,
+            const char * grammar_root);
 
-    /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
-    LLAMA_API llama_token llama_sample_token(
-            struct llama_context * ctx,
-            llama_token_data_array * candidates);
+    LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
+            int32_t n_vocab,            // llama_n_vocab()
+            llama_token special_eos_id, // llama_token_eos()
+            llama_token linefeed_id,    // llama_token_nl()
+            int32_t penalty_last_n,     // last n tokens to penalize (0 = disable penalty, -1 = context size)
+            float penalty_repeat,       // 1.0 = disabled
+            float penalty_freq,         // 0.0 = disabled
+            float penalty_present,      // 0.0 = disabled
+            bool penalize_nl,           // consider newlines as a repeatable token
+            bool ignore_eos);           // ignore the end-of-sequence token
 
-    /// @details Accepts the sampled token into the grammar
-    LLAMA_API void llama_grammar_accept_token(
-            struct llama_context * ctx,
-            struct llama_grammar * grammar,
-            llama_token token);
+    LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
+            int32_t n_vocab,
+            int32_t n_logit_bias,
+            const llama_logit_bias * logit_bias);
+
+    // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
+    LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
+
+    /// @details Sample and accept a token from the idx-th output of the last evaluation
+    //
+    // Shorthand for:
+    //    const auto * logits = llama_get_logits_ith(ctx, idx);
+    //    llama_token_data_array cur_p = { ... init from logits ... };
+    //    llama_sampler_apply(smpl, &cur_p);
+    //    auto token = cur_p.data[cur_p.selected].id;
+    //    llama_sampler_accept(smpl, token);
+    //    return token;
+    // Returns the sampled token
+    LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);
+
+    // TODO: extend in the future
+    //LLAMA_API void llama_decode_with_sampler(struct llama_context * ctx, struct llama_sampler * smpl, struct llama_batch batch, ...);
 
     //
     // Model split
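
The llama_sampler_i interface above also admits user-defined samplers; a minimal sketch of a custom sampler that masks a single forbidden token (the sampler name and token id are hypothetical):

    #include <math.h> // INFINITY

    static void ban_token_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
        (void) smpl; // stateless: no ctx needed
        for (size_t i = 0; i < cur_p->size; ++i) {
            if (cur_p->data[i].id == 42) {        // hypothetical forbidden token id
                cur_p->data[i].logit = -INFINITY; // never selected after softmax
            }
        }
    }

    static const char * ban_token_name(const struct llama_sampler * smpl) {
        (void) smpl;
        return "ban-token";
    }

    static struct llama_sampler_i ban_token_iface = {
        /*.name   =*/ ban_token_name,
        /*.accept =*/ NULL, // nothing to update on accept
        /*.apply  =*/ ban_token_apply,
        /*.reset  =*/ NULL,
        /*.clone  =*/ NULL, // allowed to be NULL since ctx is NULL
        /*.free   =*/ NULL,
    };

Note that per the comment above, a sampler passed to llama_sampler_chain_add is owned and freed by the chain, so an instance added to a chain should be heap-allocated rather than static.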
@@ -1077,12 +1170,6 @@ extern "C" {
     // Returns the split_prefix length.
     LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
 
-    // Performance information
-    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
-
-    LLAMA_API void llama_print_timings(struct llama_context * ctx);
-    LLAMA_API void llama_reset_timings(struct llama_context * ctx);
-
     // Print system information
     LLAMA_API const char * llama_print_system_info(void);
 
@@ -1090,58 +1177,41 @@ extern "C" {
     // If this is not called, or NULL is supplied, everything is output on stderr.
     LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
 
-    LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
+    //
+    // Performance utils
+    //
+    // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
+    //
+
+    struct llama_perf_context_data {
+        double t_start_ms;
+        double t_load_ms;
+        double t_p_eval_ms;
+        double t_eval_ms;
+
+        int32_t n_p_eval;
+        int32_t n_eval;
+    };
+
+    struct llama_perf_sampler_data {
+        double t_sample_ms;
+
+        int32_t n_sample;
+    };
+
+    LLAMA_API struct llama_perf_context_data llama_perf_context      (const struct llama_context * ctx);
+    LLAMA_API void                           llama_perf_context_print(const struct llama_context * ctx);
+    LLAMA_API void                           llama_perf_context_reset(      struct llama_context * ctx);
+
+    // NOTE: the following work only with samplers constructed via llama_sampler_chain_init
+    LLAMA_API struct llama_perf_sampler_data llama_perf_sampler      (const struct llama_sampler * chain);
+    LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
+    LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);
+
+    LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
 
 #ifdef __cplusplus
 }
 #endif
 
-// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
-#ifdef LLAMA_API_INTERNAL
-
-#include <random>
-#include <string>
-#include <vector>
-
-struct ggml_tensor;
-
-struct llama_partial_utf8 {
-    uint32_t value; // bit value so far (unshifted)
-    int n_remain;   // num bytes remaining; -1 indicates invalid sequence
-};
-
-struct llama_grammar {
-    const std::vector<std::vector<llama_grammar_element>> rules;
-    std::vector<std::vector<const llama_grammar_element *>> stacks;
-
-    // buffer for partially generated UTF-8 sequence from accepted tokens
-    llama_partial_utf8 partial_utf8;
-};
-
-struct llama_grammar_candidate {
-    size_t index;
-    const uint32_t * code_points;
-    llama_partial_utf8 partial_utf8;
-};
-
-const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
-    struct llama_context * ctx
-);
-
-void llama_grammar_accept(
-    const std::vector<std::vector<llama_grammar_element>> & rules,
-    const std::vector<std::vector<const llama_grammar_element *>> & stacks,
-    const uint32_t chr,
-    std::vector<std::vector<const llama_grammar_element *>> & new_stacks);
-
-std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
-    const std::string & src,
-    llama_partial_utf8 partial_start);
-
-// Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
-// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
-llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng);
-
-#endif // LLAMA_API_INTERNAL
-
 #endif // LLAMA_H
@@ -35,10 +35,10 @@ static std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const
 
 static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false);
     if (n_tokens < 0) {
         result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false);
         GGML_ASSERT(check == -n_tokens);
     } else {
         result.resize(n_tokens);
@@ -314,7 +314,6 @@ int main(int argc, char ** argv) {
 
     // tune these to your liking
     lcparams.n_ctx      = 2048;
-    lcparams.seed       = 1;
    lcparams.n_threads  = params.n_threads;
     lcparams.flash_attn = params.flash_attn;
 
@@ -402,6 +401,26 @@ int main(int argc, char ** argv) {
 
     llama_batch batch = llama_batch_init(llama_n_ctx(ctx_llama), 0, 1);
 
+    // init sampler
+    const float top_k = 5;
+    const float top_p = 0.80f;
+    const float temp  = 0.30f;
+
+    const int seed = 0;
+
+    auto sparams = llama_sampler_chain_default_params();
+
+    llama_sampler * smpl = llama_sampler_chain_init(sparams);
+
+    if (temp > 0.0f) {
+        llama_sampler_chain_add(smpl, llama_sampler_init_top_k(top_k));
+        llama_sampler_chain_add(smpl, llama_sampler_init_top_p(top_p, 1));
+        llama_sampler_chain_add(smpl, llama_sampler_init_temp (temp));
+        llama_sampler_chain_add(smpl, llama_sampler_init_dist (seed));
+    } else {
+        llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
+    }
+
     // init session
     std::string path_session = params.path_session;
     std::vector<llama_token> session_tokens;
@@ -417,7 +436,7 @@ int main(int argc, char ** argv) {
 
         session_tokens.resize(llama_n_ctx(ctx_llama));
         size_t n_token_count_out = 0;
-        if (!llama_load_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
+        if (!llama_state_load_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
            fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
             return 1;
         }
@@ -700,54 +719,13 @@ int main(int argc, char ** argv) {
 
             {
                 // out of user input, sample next token
-                const float top_k = 5;
-                const float top_p = 0.80f;
-                const float temp  = 0.30f;
-                const float repeat_penalty = 1.1764f;
-
-                const int repeat_last_n = 256;
-
                 if (!path_session.empty() && need_to_save_session) {
                     need_to_save_session = false;
-                    llama_save_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size());
+                    llama_state_save_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size());
                 }
 
-                llama_token id = 0;
+                const llama_token id = llama_sampler_sample(smpl, ctx_llama, -1);
 
-                {
-                    auto logits  = llama_get_logits(ctx_llama);
-                    auto n_vocab = llama_n_vocab(model_llama);
-
-                    logits[llama_token_eos(model_llama)] = 0;
-
-                    std::vector<llama_token_data> candidates;
-                    candidates.reserve(n_vocab);
-                    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
-                        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
-                    }
-
-                    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
-
-                    // apply repeat penalty
-                    const float nl_logit = logits[llama_token_nl(model_llama)];
-
-                    llama_sample_repetition_penalties(ctx_llama, &candidates_p,
-                            embd_inp.data() + std::max(0, n_past - repeat_last_n),
-                            repeat_last_n, repeat_penalty, 0.0, 0.0f);
-
-                    logits[llama_token_nl(model_llama)] = nl_logit;
-
-                    if (temp <= 0) {
-                        // Greedy sampling
-                        id = llama_sample_token_greedy(ctx_llama, &candidates_p);
-                    } else {
-                        // Temperature sampling
-                        llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
-                        llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
-                        llama_sample_temp (ctx_llama, &candidates_p, temp);
-                        id = llama_sample_token(ctx_llama, &candidates_p);
-                    }
-                }
-
                 if (id != llama_token_eos(model_llama)) {
                     // add it to the context
@@ -797,8 +775,14 @@ int main(int argc, char ** argv) {
     whisper_print_timings(ctx_wsp);
     whisper_free(ctx_wsp);
 
-    llama_print_timings(ctx_llama);
+    llama_perf_sampler_print(smpl);
+    llama_perf_context_print(ctx_llama);
+
+    llama_sampler_free(smpl);
+    llama_batch_free(batch);
     llama_free(ctx_llama);
 
+    llama_backend_free();
+
     return 0;
 }
@@ -7,7 +7,7 @@
 #include <unordered_map>
 #include <unordered_set>
 
-const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1
+const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1
     {0x000000, 0x0080},
     {0x000020, 0x0008},
     {0x000021, 0x0020},
@@ -2311,7 +2311,8 @@ const std::unordered_set<uint32_t> unicode_set_whitespace = {
     0x003000,
 };
 
-const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {
+// list is always in ascending order, to enable binary search
+const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase = {
     {0x000041, 0x000061},
     {0x000042, 0x000062},
     {0x000043, 0x000063},
@@ -3747,7 +3748,8 @@ const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {
     {0x01E921, 0x01E943},
 };
 
-const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {
+// list is always in ascending order, to enable binary search
+const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase = {
     {0x000061, 0x000041},
     {0x000062, 0x000042},
     {0x000063, 0x000043},
@@ -5200,7 +5202,7 @@ const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {
     {0x01E943, 0x01E921},
 };
 
-const std::vector<range_nfd> unicode_ranges_nfd = { // start, last, nfd
+const std::initializer_list<range_nfd> unicode_ranges_nfd = { // start, last, nfd
     {0x000000, 0x000000, 0x000000},
     {0x0000C0, 0x0000C5, 0x000041},
     {0x0000C7, 0x0000C7, 0x000043},
@@ -7030,4 +7032,3 @@ const std::vector<range_nfd> unicode_ranges_nfd = { // start, last, nfd
     {0x02FA1C, 0x02FA1C, 0x009F3B},
     {0x02FA1D, 0x02FA1D, 0x02A600},
 };
-
@@ -13,8 +13,8 @@ struct range_nfd {
 
 static const uint32_t MAX_CODEPOINTS = 0x110000;
 
-extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
+extern const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
 extern const std::unordered_set<uint32_t> unicode_set_whitespace;
-extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
-extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
-extern const std::vector<range_nfd> unicode_ranges_nfd;
+extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase;
+extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase;
+extern const std::initializer_list<range_nfd> unicode_ranges_nfd;
@@ -1,6 +1,11 @@
+#if defined(_MSC_VER)
+#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
+#endif
+
 #include "unicode.h"
 #include "unicode-data.h"
 
+#include <algorithm>
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
@@ -15,6 +20,12 @@
 #include <locale>
 #include <codecvt>
 
+size_t unicode_len_utf8(char src) {
+    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
+    return lookup[highbits];
+}
+
 static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
     std::string result;
     for (size_t i = 0; i < cps.size(); ++i) {
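
The lookup table added above is indexed by the high nibble of the lead byte: values 0x0-0xB (ASCII and UTF-8 continuation bytes) map to 1, 0xC-0xD to 2, 0xE to 3 and 0xF to 4. For example:

    // lead byte 0xE2 (as in U+20AC "€" = E2 82 AC): high nibble 0xE -> 3-byte sequence
    size_t n = unicode_len_utf8((char) 0xE2); // n == 3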
@@ -23,7 +34,7 @@ static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
     return result;
 }
 
-static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
+uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
     assert(offset < utf8.size());
     if (!(utf8[offset + 0] & 0x80)) {
         auto result = utf8[offset + 0];
@@ -112,11 +123,11 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
 static std::vector<codepoint_flags> unicode_cpt_flags_array() {
     std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
 
-    assert (unicode_ranges_flags.front().first == 0);
-    assert (unicode_ranges_flags.back().first == MAX_CODEPOINTS);
+    assert (unicode_ranges_flags.begin()[0].first == 0);
+    assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS);
     for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) {
-        const auto range_ini = unicode_ranges_flags[i-1]; // codepoint_ini, flags
-        const auto range_end = unicode_ranges_flags[i];   // codepoint_end, flags
+        const auto range_ini = unicode_ranges_flags.begin()[i-1]; // codepoint_ini, flags
+        const auto range_end = unicode_ranges_flags.begin()[i];   // codepoint_end, flags
         for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) {
             cpt_flags[cpt] = range_ini.second;
         }
@@ -232,8 +243,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
     };
 
     auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
-        static const codepoint_flags undef(codepoint_flags::UNDEFINED);
-        return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
+        return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
     };
 
     size_t _prev_end = offset_ini;
@@ -295,9 +305,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
             continue;
         }
         // regex: <space>?[^\s\p{L}\p{N}]+
-        if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
+        if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
             pos += (cpt == ' ');
-            while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
+            while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
                 flags2 = _get_flags(++pos);
             }
             _add_token(pos);
@@ -351,8 +361,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
     };
 
     auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
-        static const codepoint_flags undef(codepoint_flags::UNDEFINED);
-        return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
+        return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
     };
 
     size_t _prev_end = offset_ini;
@@ -394,8 +403,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
             }
         }
 
-        // regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
-        if (!(cpt == '\r' || cpt == '\n' || /*flags.is_letter |*/ flags.is_number)) {
+        // regex: [^\r\n\p{L}\p{N}]?\p{L}+
+        if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) {
             if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters
                 pos++;
                 while (_get_flags(pos).is_letter) {
@@ -421,9 +430,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
 
         // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
         auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
-        if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
+        if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags.as_uint()) {
             pos += (cpt == ' ');
-            while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
+            while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
                 flags2 = _get_flags(++pos);
             }
             uint32_t cpt2 = _get_cpt(pos);
@@ -588,7 +597,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
     std::vector<uint32_t> result(cpts.size());
     for (size_t i = 0; i < cpts.size(); ++i) {
         const uint32_t cpt = cpts[i];
-        auto it = std::upper_bound(unicode_ranges_nfd.cbegin(), unicode_ranges_nfd.cend(), cpt, comp) - 1;
+        auto it = std::upper_bound(unicode_ranges_nfd.begin(), unicode_ranges_nfd.end(), cpt, comp) - 1;
         result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt;
     }
     return result;
@@ -630,8 +639,15 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
uint32_t unicode_tolower(uint32_t cp) {
|
uint32_t unicode_tolower(uint32_t cp) {
|
||||||
auto it = unicode_map_lowercase.find(cp);
|
// binary search
|
||||||
return it == unicode_map_lowercase.end() ? cp : it->second;
|
auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cp,
|
||||||
|
[](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
|
||||||
|
return pair.first < value;
|
||||||
|
});
|
||||||
|
if (it != unicode_map_lowercase.end() && it->first == cp) {
|
||||||
|
return it->second;
|
||||||
|
}
|
||||||
|
return cp; // Return the original code point if no lowercase mapping is found
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
|
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
|
||||||
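`unicode_tolower` now probes a sorted `std::vector` of (codepoint, lowercase) pairs with `std::lower_bound` instead of a hash-map `find`. A runnable sketch of the same pattern; the mapping data here is a tiny made-up sample, not the real table:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// must stay sorted by .first for std::lower_bound to be valid
static const std::vector<std::pair<uint32_t, uint32_t>> map_lowercase = {
    {0x0041, 0x0061},  // 'A' -> 'a'
    {0x0042, 0x0062},  // 'B' -> 'b'
    {0x0391, 0x03B1},  // GREEK CAPITAL ALPHA -> small alpha
};

static uint32_t tolower_cpt(uint32_t cp) {
    auto it = std::lower_bound(map_lowercase.begin(), map_lowercase.end(), cp,
        [](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
            return pair.first < value;
        });
    return (it != map_lowercase.end() && it->first == cp) ? it->second : cp;
}

int main() {
    std::printf("%04X -> %04X\n", 0x0391u, (unsigned) tolower_cpt(0x0391)); // 0391 -> 03B1
    std::printf("%04X -> %04X\n", 0x0061u, (unsigned) tolower_cpt(0x0061)); // no mapping: unchanged
    return 0;
}

The lookup goes from expected O(1) to O(log n), but the flat sorted array avoids the hash map's per-node allocations and hashing, which tends to win for a read-only table of this size.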
@@ -4,6 +4,8 @@
 #include <string>
 #include <vector>

+// TODO: prefix all symbols with "llama_"
+
 struct codepoint_flags {
     enum {
         UNDEFINED = 0x0001,
@@ -46,8 +48,10 @@ struct codepoint_flags {
     }
 };

+size_t unicode_len_utf8(char src);
+
 std::string unicode_cpt_to_utf8(uint32_t cp);
+uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
 std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);

 std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
@@ -21,7 +21,7 @@ help()
 echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]"
 echo "options:"
 echo "-s Step in seconds (default is $step)."
-echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large-v3' (default is '$model')."
+echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large-v3' 'large-v3-turbo' (default is '$model')."
 echo "-t Number of threads to use."
 echo "-h Print this help page."
 echo
@@ -7,8 +7,9 @@ set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../../)

 set(SOURCE_FILES
     ${WHISPER_LIB_DIR}/ggml/src/ggml.c
+    ${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
     ${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
-    ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.c
+    ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp
     ${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
     ${WHISPER_LIB_DIR}/src/whisper.cpp
     ${CMAKE_SOURCE_DIR}/jni.c
@@ -19,8 +19,9 @@ if (NOT GGML_HOME)
     SOURCE_FILES
         ${SOURCE_FILES}
         ${WHISPER_LIB_DIR}/ggml/src/ggml.c
+        ${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
         ${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
-        ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.c
+        ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp
         ${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
 )
 endif()
@@ -7,6 +7,7 @@
 objects = {

 /* Begin PBXBuildFile section */
+18133C802C64E342005CEAAC /* ggml-aarch64.c in Sources */ = {isa = PBXBuildFile; fileRef = 18133C7F2C64E342005CEAAC /* ggml-aarch64.c */; };
 1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 184447182AB211A2007D6BFE /* ggml-alloc.c */; };
 1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 1844471B2AB21655007D6BFE /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-framework Foundation -framework Metal -framework MetalKit -fno-objc-arc"; }; };
 18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C7A29052BDF00BD2A04 /* AppDelegate.m */; };
@@ -21,7 +22,7 @@
 18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; };
 18A276062C2A98A5001C8D37 /* ggml-metal.metal in Copy Files */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
 18A2760B2C2A9B43001C8D37 /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
-18ABE15A2AF556340044A204 /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.c */; };
+18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.cpp */; };
 18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; };
 7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; };
 7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */; };
@@ -44,6 +45,8 @@
 /* End PBXCopyFilesBuildPhase section */

 /* Begin PBXFileReference section */
+18133C7E2C64E342005CEAAC /* ggml-aarch64.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-aarch64.h"; path = "../../../ggml/src/ggml-aarch64.h"; sourceTree = "<group>"; };
+18133C7F2C64E342005CEAAC /* ggml-aarch64.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-aarch64.c"; path = "../../../ggml/src/ggml-aarch64.c"; sourceTree = "<group>"; };
 184447182AB211A2007D6BFE /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../../ggml/src/ggml-alloc.c"; sourceTree = "<group>"; };
 184447192AB211A2007D6BFE /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../../ggml/include/ggml-alloc.h"; sourceTree = "<group>"; };
 1844471B2AB21655007D6BFE /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../../ggml/src/ggml-metal.m"; sourceTree = "<group>"; };
@@ -70,7 +73,7 @@
 18ABE1542AF556340044A204 /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../../ggml/src/ggml-quants.h"; sourceTree = "<group>"; };
 18ABE1552AF556340044A204 /* ggml-backend.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../../ggml/include/ggml-backend.h"; sourceTree = "<group>"; };
 18ABE1562AF556340044A204 /* ggml-backend-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend-impl.h"; path = "../../../ggml/src/ggml-backend-impl.h"; sourceTree = "<group>"; };
-18ABE1572AF556340044A204 /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../../ggml/src/ggml-backend.c"; sourceTree = "<group>"; };
+18ABE1572AF556340044A204 /* ggml-backend.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.cpp; fileEncoding = 4; name = "ggml-backend.cpp"; path = "../../../ggml/src/ggml-backend.cpp"; sourceTree = "<group>"; };
 18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
 18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-encoder-impl.m"; sourceTree = "<group>"; };
@@ -112,10 +115,12 @@
 18627C7829052BDF00BD2A04 /* whisper.objc */ = {
 isa = PBXGroup;
 children = (
+18133C7F2C64E342005CEAAC /* ggml-aarch64.c */,
+18133C7E2C64E342005CEAAC /* ggml-aarch64.h */,
 18A275FF2C2A9563001C8D37 /* ggml-common.h */,
 18A275FE2C2A94DE001C8D37 /* ggml-metal.h */,
 18ABE1562AF556340044A204 /* ggml-backend-impl.h */,
-18ABE1572AF556340044A204 /* ggml-backend.c */,
+18ABE1572AF556340044A204 /* ggml-backend.cpp */,
 18ABE1552AF556340044A204 /* ggml-backend.h */,
 18ABE1582AF556340044A204 /* ggml-impl.h */,
 18ABE1592AF556340044A204 /* ggml-quants.c */,
@@ -236,13 +241,14 @@
 files = (
 18627C8129052BDF00BD2A04 /* ViewController.m in Sources */,
 18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */,
+18133C802C64E342005CEAAC /* ggml-aarch64.c in Sources */,
 7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */,
 18627C9429052C4900BD2A04 /* whisper.cpp in Sources */,
 18627C9629052C5800BD2A04 /* ggml.c in Sources */,
 18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */,
 7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */,
 1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */,
-18ABE15A2AF556340044A204 /* ggml-backend.c in Sources */,
+18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */,
 18627C8C29052BE000BD2A04 /* main.m in Sources */,
 18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */,
 1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */,
@@ -50,9 +50,24 @@ else()
     set(GGML_BLAS_VENDOR_DEFAULT "Generic")
 endif()

+if (CMAKE_CROSSCOMPILING)
+    set(GGML_NATIVE_DEFAULT OFF)
+else()
+    set(GGML_NATIVE_DEFAULT ON)
+endif()
+
+# defaults
+if (NOT GGML_LLAMAFILE_DEFAULT)
+    set(GGML_LLAMAFILE_DEFAULT OFF)
+endif()
+
+if (NOT GGML_CUDA_GRAPHS_DEFAULT)
+    set(GGML_CUDA_GRAPHS_DEFAULT OFF)
+endif()
+
 # general
 option(GGML_STATIC "ggml: static link libraries" OFF)
-option(GGML_NATIVE "ggml: enable -march=native flag" ON)
+option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT})
 option(GGML_LTO "ggml: enable link time optimization" OFF)
 option(GGML_CCACHE "ggml: use ccache if available" ON)

@@ -70,7 +85,7 @@ option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF)
 option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)

 # instruction set specific
-if (GGML_NATIVE)
+if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT)
     set(INS_ENB OFF)
 else()
     set(INS_ENB ON)
@@ -104,11 +119,13 @@ option(GGML_ACCELERATE "ggml: enable Accelerate framework"
 option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
 set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
     "ggml: BLAS library vendor")
-option(GGML_LLAMAFILE "ggml: use ggml SGEMM" OFF)
+option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT})

 option(GGML_CUDA "ggml: use CUDA" OFF)
+option(GGML_MUSA "ggml: use MUSA" OFF)
 option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
 option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
+option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
 set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
 set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
 option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
@@ -119,14 +136,16 @@ set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
 option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
 option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
 option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
+option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})

-option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF)
 option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
 option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)
 option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
 option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
 option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
+option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
+option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" OFF)
 option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
 option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
 option(GGML_KOMPUTE "ggml: use Kompute" OFF)
@@ -192,13 +211,20 @@ endif ()
 include(GNUInstallDirs)
 include(CMakePackageConfigHelpers)

+# all public headers
 set(GGML_PUBLIC_HEADERS
     include/ggml.h
     include/ggml-alloc.h
     include/ggml-backend.h
-    "${GGML_HEADERS_CUDA}"
-    "${GGML_HEADERS_METAL}"
-    "${GGML_HEADERS_EXTRA}")
+    include/ggml-blas.h
+    include/ggml-cann.h
+    include/ggml-cuda.h
+    include/ggml.h
+    include/ggml-kompute.h
+    include/ggml-metal.h
+    include/ggml-rpc.h
+    include/ggml-sycl.h
+    include/ggml-vulkan.h)

 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
 #if (GGML_METAL)
@@ -7,8 +7,8 @@ extern "C" {
 #endif

 typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
 typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
 typedef struct ggml_backend * ggml_backend_t;

 // Tensor allocator
 struct ggml_tallocr {
@@ -24,7 +24,7 @@ GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, st
 // Graph allocator
 /*
   Example usage:
-    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());
+    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

     // optional: create a worst-case graph and reserve the buffers to avoid reallocations
     ggml_gallocr_reserve(galloc, build_graph(max_batch));
@@ -12,41 +12,52 @@ extern "C" {
 typedef struct ggml_backend_event * ggml_backend_event_t;
 typedef struct ggml_backend * ggml_backend_t;
 typedef void * ggml_backend_graph_plan_t;
+typedef struct ggml_backend_reg * ggml_backend_reg_t;
+typedef struct ggml_backend_device * ggml_backend_dev_t;
+
+//
+// Backend buffer type
+//
+
+GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
+GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
+GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
+GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
+GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);

 //
 // Backend buffer
 //

-// buffer type
-GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
-GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
-GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
-GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
-GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
-GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
-
-// buffer
 enum ggml_backend_buffer_usage {
     GGML_BACKEND_BUFFER_USAGE_ANY = 0,
     GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
+    GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
 };

 GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
 GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
 GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
 GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
-GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
 GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
 GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
 GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
 GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
 GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
 GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer);
 GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
 GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
+
+// tensor copy between different backends
+GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);

 //
-// Backend
+// Backend (stream)
 //

 GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
@@ -61,8 +72,10 @@ extern "C" {
 GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
 GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);

-GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+// "offset" refers to the offset of the tensor data for setting/getting data
+GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);

 GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
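With the buffer-type ("buft") entry points consolidated above and the GGML_CALL qualifier gone, a minimal allocation round-trip chains together as below. A sketch against these declarations only (the CPU buffer type is declared further down this header), not a tested program:

// Allocate 1 MiB from the CPU buffer type, zero it, and release it.
#include <cstdio>
#include "ggml-backend.h"

int main() {
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 1024 * 1024);
    if (buf == nullptr) {
        std::fprintf(stderr, "allocation failed\n");
        return 1;
    }
    std::printf("buffer '%s': %zu bytes, host: %d\n",
                ggml_backend_buffer_name(buf),
                ggml_backend_buffer_get_size(buf),
                (int) ggml_backend_buffer_is_host(buf));
    ggml_backend_buffer_clear(buf, 0); // fill the whole buffer with zeros
    ggml_backend_buffer_free(buf);
    return 0;
}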
@@ -72,64 +85,118 @@ extern "C" {
 GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
 GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
 GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);

+// NOTE: will be removed, use device version instead
 GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
 GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
 GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);

-// tensor copy between different backends
-GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
-
 // asynchronous copy
 // the copy is performed after all the currently queued operations in backend_src
 // backend_dst will wait for the copy to complete before performing other operations
 // automatic fallback to sync copy if async is not supported
 GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);

-// events
-GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
-GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
-GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
-GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
-GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event);
+GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);

 //
-// CPU backend
+// Events
 //

-GGML_API ggml_backend_t ggml_backend_cpu_init(void);
-
-GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
-GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
-GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
-
-// Create a backend buffer from an existing pointer
-GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
-
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
-
-#ifdef GGML_USE_CPU_HBM
-GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
-#endif
+GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device);
+GGML_API void ggml_backend_event_free(ggml_backend_event_t event);
+GGML_API void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend);
+GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
+GGML_API void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event);
+
+//
+// Backend device
+//
+
+enum ggml_backend_dev_type {
+    GGML_BACKEND_DEVICE_TYPE_CPU,
+    GGML_BACKEND_DEVICE_TYPE_GPU,
+    // devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
+    GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
+    GGML_BACKEND_DEVICE_TYPE_GPU_FULL
+};
+
+// functionality supported by the device
+struct ggml_backend_dev_caps {
+    // asynchronous operations
+    bool async;
+    // pinned host buffer
+    bool host_buffer;
+    // event synchronization
+    bool events;
+};
+
+// all the device properties
+struct ggml_backend_dev_props {
+    const char * name;
+    const char * description;
+    size_t memory_free;
+    size_t memory_total;
+    enum ggml_backend_dev_type type;
+    struct ggml_backend_dev_caps caps;
+};
+
+GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
+GGML_API const char * ggml_backend_dev_description(ggml_backend_dev_t device);
+GGML_API void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total);
+GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device);
+GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
+GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
+GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
+GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
+GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
+GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
+
+GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
+GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft);
+GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
+
+//
+// Backend (reg)
+//
+
+GGML_API const char * ggml_backend_reg_name(ggml_backend_reg_t reg);
+GGML_API size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
+GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
+GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
+
+// Functions that may be obtained using ggml_backend_reg_get_proc_address
+typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);

 //
 // Backend registry
 //

-// The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
+// Backend (reg) enumeration
+GGML_API size_t ggml_backend_reg_count(void);
+GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
+GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name);

-GGML_API size_t ggml_backend_reg_get_count(void);
-GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
-GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
-GGML_API const char * ggml_backend_reg_get_name(size_t i);
-GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
-GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
-GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size);
+// Device enumeration
+GGML_API size_t ggml_backend_dev_count(void);
+GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
+GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
+GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);
+
+// Direct backend (stream) initialization
+// = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
+GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
+// = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
+GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
+// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
+GGML_API ggml_backend_t ggml_backend_init_best(void);

 //
 // Backend scheduler
 //

-// The backend scheduler allows for multiple backends to be used together
+// The backend scheduler allows for multiple backend devices to be used together
 // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
 // The backends are selected based on:
 // - the backend that supports the operation
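The index-based registry calls above are replaced by device objects. A sketch of how enumeration and initialization might be driven through the new declarations; only functions from this hunk are used, plus ggml_backend_free, which is declared elsewhere in ggml-backend.h. Not a tested program:

// List every device, then let ggml pick the best backend.
#include <cstdio>
#include "ggml-backend.h"

int main() {
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        size_t mem_free = 0, mem_total = 0;
        ggml_backend_dev_memory(dev, &mem_free, &mem_total);
        std::printf("device %zu: %s (%s), %zu/%zu MiB free\n",
                    i, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev),
                    mem_free >> 20, mem_total >> 20);
    }

    // = dev_by_type(GPU_FULL) if present, otherwise dev_by_type(CPU_FULL)
    ggml_backend_t backend = ggml_backend_init_best();
    if (backend == nullptr) {
        std::fprintf(stderr, "no usable backend\n");
        return 1;
    }
    // ... ggml_backend_graph_compute(backend, graph) ...
    ggml_backend_free(backend);
    return 0;
}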
@@ -164,9 +231,9 @@ extern "C" {
 }
 */

-struct ggml_backend_sched;
 typedef struct ggml_backend_sched * ggml_backend_sched_t;

+// Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback)
 // when ask == true, the scheduler wants to know if the user wants to observe this node
 // this allows the scheduler to batch nodes together in order to evaluate them in a single call
 //
@@ -180,7 +247,7 @@ extern "C" {
 GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);

 // Initialize backend buffers from a measure graph
-GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success

 GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
 GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
@@ -195,7 +262,7 @@ extern "C" {
 GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);

 // Allocate and compute graph on the backend scheduler
-GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
 GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
 GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
 GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
@@ -221,7 +288,7 @@ extern "C" {
 GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
 GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);

-typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

 // Compare the output of two backends
 GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
@@ -230,6 +297,26 @@ extern "C" {
 GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
 GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);

+//
+// CPU backend
+//
+
+GGML_API ggml_backend_t ggml_backend_cpu_init(void);
+
+GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
+GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
+GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
+GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
+
+// Create a backend buffer from an existing pointer
+GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+
+GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
+
+#ifdef GGML_USE_CPU_HBM
+GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+#endif
+
 #ifdef __cplusplus
 }
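The relocated CPU backend section gains a threadpool setter and keeps the abort callback. A sketch wiring up a cooperative abort, assuming the usual `bool (*ggml_abort_callback)(void * data)` signature from ggml.h; ggml_backend_free is declared elsewhere in ggml-backend.h:

// CPU backend with 4 threads and a cooperative abort hook.
#include <atomic>
#include "ggml-backend.h"

static std::atomic<bool> g_abort{false};

static bool should_abort(void * /*data*/) {
    return g_abort.load(); // returning true aborts the current graph compute
}

int main() {
    ggml_backend_t cpu = ggml_backend_cpu_init();
    ggml_backend_cpu_set_n_threads(cpu, 4);
    ggml_backend_cpu_set_abort_callback(cpu, should_abort, nullptr);
    // ... ggml_backend_graph_compute(cpu, graph) would return an aborted
    //     status if the callback fires mid-compute ...
    ggml_backend_free(cpu);
    return 0;
}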
@@ -9,13 +9,13 @@ extern "C" {
 #endif

 // backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void);
-GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend);
+GGML_API ggml_backend_t ggml_backend_blas_init(void);
+GGML_API bool ggml_backend_is_blas(ggml_backend_t backend);

 // number of threads used for conversion to float
 // for openblas and blis, this will also set the number of threads used for blas operations
-GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
+GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);

 #ifdef __cplusplus
ggml/include/ggml-cann.h (new file, 121 lines)
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2023-2024 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "ggml-backend.h"
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief Maximum number of CANN devices supported.
+ */
+#define GGML_CANN_MAX_DEVICES 16
+
+/**
+ * @brief Initializes the CANN backend for a specified device.
+ *
+ * This function initializes the CANN backend for the given device.
+ * It verifies the device index, allocates a context, and creates a backend
+ * instance.
+ *
+ * @param device The index of the device to initialize.
+ * @return A pointer to the initialized backend instance, or nullptr on failure.
+ */
+GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
+
+/**
+ * @brief Checks if a given backend is a CANN backend.
+ *
+ * This function verifies if the provided backend is a CANN backend by comparing
+ * its GUID with the CANN backend's GUID.
+ *
+ * @param backend The backend instance to check.
+ * @return True if the backend is a CANN backend, false otherwise.
+ */
+GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
+
+/**
+ * @brief Retrieves the CANN buffer type for a specified device.
+ *
+ * This function initializes and returns the buffer type interface associated
+ * with the given device. It ensures thread-safe access using a mutex.
+ *
+ * @param device The device index for which to retrieve the buffer type.
+ * @return A pointer to the buffer type interface for the specified device, or
+ * nullptr if the device index is out of range.
+ */
+GGML_API ggml_backend_buffer_type_t
+ggml_backend_cann_buffer_type(int32_t device);
+
+/**
+ * @brief Retrieves the number of CANN devices available.
+ *
+ * This function returns the number of CANN devices available based on
+ * information obtained from `ggml_cann_info()`.
+ *
+ * @return The number of CANN devices available.
+ */
+GGML_API int32_t ggml_backend_cann_get_device_count(void);
+
+/**
+ * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
+ *
+ * @return A pointer to the host buffer type interface.
+ */
+GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
+
+/**
+ * @brief Retrieves the description of a specific CANN device.
+ *
+ * This function sets the specified device, retrieves the SoC name,
+ * and writes it into the provided description buffer.
+ *
+ * @param device The device index to retrieve the description for.
+ * @param description Pointer to a buffer where the description will be written.
+ * @param description_size Size of the description buffer.
+ */
+GGML_API void ggml_backend_cann_get_device_description(
+    int32_t device, char* description, size_t description_size);
+
+/**
+ * @brief Retrieves the memory information of a specific CANN device.
+ *
+ * This function sets the specified device, retrieves the free and total
+ * memory information of the specified type (ACL_HBM_MEM), and stores them
+ * in the provided pointers.
+ *
+ * @param device The device index to retrieve memory information for.
+ * @param free Pointer to a variable where the free memory size will be stored.
+ * @param total Pointer to a variable where the total memory size will be
+ * stored.
+ */
+GGML_API void ggml_backend_cann_get_device_memory(int32_t device,
+                                                  size_t* free,
+                                                  size_t* total);
+
+#ifdef __cplusplus
+}
+#endif
@@ -3,42 +3,45 @@
 #include "ggml.h"
 #include "ggml-backend.h"

-#ifdef GGML_USE_HIPBLAS
-#define GGML_CUDA_NAME "ROCm"
-#define GGML_CUBLAS_NAME "hipBLAS"
-#else
-#define GGML_CUDA_NAME "CUDA"
-#define GGML_CUBLAS_NAME "cuBLAS"
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif

+#ifdef GGML_USE_HIPBLAS
+#define GGML_CUDA_NAME "ROCm"
+#define GGML_CUBLAS_NAME "hipBLAS"
+#elif defined(GGML_USE_MUSA)
+#define GGML_CUDA_NAME "MUSA"
+#define GGML_CUBLAS_NAME "muBLAS"
+#else
+#define GGML_CUDA_NAME "CUDA"
+#define GGML_CUBLAS_NAME "cuBLAS"
+#endif
 #define GGML_CUDA_MAX_DEVICES 16

 // backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
+GGML_API ggml_backend_t ggml_backend_cuda_init(int device);

-GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
+GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);

 // device buffer
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);

 // split tensor buffer that splits matrices by rows across multiple devices
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);

 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

-GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
-GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
-GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
+GGML_API int ggml_backend_cuda_get_device_count(void);
+GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
+GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);

-GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
-GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
+GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
+GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);

+GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void);

-GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
 #ifdef __cplusplus
 }
 #endif
ggml-metal.h (Metal backend header; file inferred from contents):

@@ -1,3 +1,5 @@
+// Note: this description is outdated
+//
 // An interface allowing to compute ggml_cgraph with Metal
 //
 // This is a fully functional interface that extends ggml with GPU support for Apple devices.
@@ -25,9 +27,6 @@
 #include <stddef.h>
 #include <stdbool.h>
 
-// max memory buffers that can be mapped to the device
-#define GGML_METAL_MAX_BUFFERS 64
-
 struct ggml_tensor;
 struct ggml_cgraph;
 
@@ -40,17 +39,15 @@ extern "C" {
 // user-code should use only these functions
 //
 
-GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
-
 GGML_API ggml_backend_t ggml_backend_metal_init(void);
 
 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
 
-GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
+GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
 
-GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
+GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
 
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
 
 // helper to check if the device supports a specific family
 // ideally, the user code should be doing these checks
@@ -63,4 +60,3 @@ GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
 #ifdef __cplusplus
 }
 #endif
-
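ggml_backend_metal_set_n_cb() is gone, and the Metal backend now accepts the generic ggml_abort_callback instead. A hedged sketch of wiring up cooperative cancellation; the atomic flag and the setup function are illustrative, not part of ggml:

    #include "ggml-metal.h"
    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_bool g_cancel; // set from another thread to stop compute

    static bool should_abort(void * data) {
        (void) data;
        return atomic_load(&g_cancel); // returning true aborts the running graph
    }

    void setup(ggml_backend_t backend) {
        if (ggml_backend_is_metal(backend)) {
            ggml_backend_metal_set_abort_callback(backend, should_abort, NULL);
        }
    }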
ggml-rpc.h (RPC backend header; file inferred from contents):

@@ -10,14 +10,14 @@ extern "C" {
 #define GGML_RPC_MAX_SERVERS 16
 
 // backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
+GGML_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
-GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);
+GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend);
 
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
+GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
 
-GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
+GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
 
-GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
+GGML_API void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
 
 #ifdef __cplusplus
 }
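Only the GGML_CALL annotations change here; the RPC surface itself is stable. For context, a minimal client-side sketch; the endpoint string is a placeholder:

    #include "ggml-rpc.h"
    #include <stdio.h>

    void probe(void) {
        const char * endpoint = "192.168.0.2:50052"; // hypothetical server address
        size_t free_mem = 0, total_mem = 0;
        ggml_backend_rpc_get_device_memory(endpoint, &free_mem, &total_mem);
        printf("remote device: %zu/%zu bytes free\n", free_mem, total_mem);

        ggml_backend_t backend = ggml_backend_rpc_init(endpoint); // connect
        // ... schedule graphs on `backend` as with any other ggml backend ...
        ggml_backend_free(backend);
    }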
ggml-sycl.h (SYCL backend header; file inferred from contents):

@@ -23,20 +23,20 @@ GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
 
 // split tensor buffer that splits matrices by rows across multiple devices
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
+GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
 
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
 
 GGML_API void ggml_backend_sycl_print_sycl_devices(void);
-GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
+GGML_API void ggml_sycl_get_gpu_list(int *id_list, int max_len);
-GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
+GGML_API void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
-GGML_API GGML_CALL int  ggml_backend_sycl_get_device_count();
+GGML_API int  ggml_backend_sycl_get_device_count();
-GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
+GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
 
 // SYCL doesn't support registering host memory, keep here for reference
-// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
+// GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
-// GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
+// GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
 #ifdef __cplusplus
 }
 #endif
ggml-vulkan.h (Vulkan backend header; file inferred from contents):

@@ -13,16 +13,16 @@ extern "C" {
 GGML_API void ggml_vk_instance_init(void);
 
 // backend API
-GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
+GGML_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
 
-GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
+GGML_API bool ggml_backend_is_vk(ggml_backend_t backend);
-GGML_API GGML_CALL int  ggml_backend_vk_get_device_count(void);
+GGML_API int  ggml_backend_vk_get_device_count(void);
-GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
+GGML_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
-GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
+GGML_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
 
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
+GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
+GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
 
 #ifdef __cplusplus
 }
ggml.h (core header; file inferred from contents):

@@ -187,16 +187,6 @@
 #    define GGML_API
 #endif
 
-#ifdef GGML_MULTIPLATFORM
-#    if defined(_WIN32)
-#        define GGML_CALL
-#    else
-#        define GGML_CALL __attribute__((__ms_abi__))
-#    endif
-#else
-#    define GGML_CALL
-#endif
-
 // TODO: support for clang
 #ifdef __GNUC__
 #    define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
@@ -220,21 +210,24 @@
 #include <stdio.h>
 
 #define GGML_FILE_MAGIC   0x67676d6c // "ggml"
-#define GGML_FILE_VERSION 1
+#define GGML_FILE_VERSION 2
 
 #define GGML_QNT_VERSION        2    // bump this on quantization format changes
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
 
 #define GGML_MAX_DIMS           4
 #define GGML_MAX_PARAMS         2048
-#define GGML_MAX_CONTEXTS       64
 #define GGML_MAX_SRC            10
-#ifndef GGML_MAX_NAME
-#define GGML_MAX_NAME           64
-#endif
+#define GGML_MAX_N_THREADS      512
 #define GGML_MAX_OP_PARAMS      64
 
+#ifndef GGML_MAX_NAME
+#    define GGML_MAX_NAME 64
+#endif
+
 #define GGML_DEFAULT_N_THREADS  4
 #define GGML_DEFAULT_GRAPH_SIZE 2048
 
 #if UINTPTR_MAX == 0xFFFFFFFF
     #define GGML_MEM_ALIGN 4
 #else
@@ -244,6 +237,8 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1
 
+#define GGML_ROPE_TYPE_NEOX 2
+
 #define GGUF_MAGIC "GGUF"
 
 #define GGUF_VERSION 3
@@ -254,26 +249,27 @@
 
 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
 
-#define GGML_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fflush(stdout); \
-            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            ggml_print_backtrace(); \
-            abort(); \
-        } \
-    } while (0)
-
 #ifndef NDEBUG
-#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
+#    define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
 #elif defined(__GNUC__)
-#define GGML_UNREACHABLE() __builtin_unreachable()
+#    define GGML_UNREACHABLE() __builtin_unreachable()
 #elif defined(_MSC_VER)
-#define GGML_UNREACHABLE() __assume(0)
+#    define GGML_UNREACHABLE() __assume(0)
 #else
-#define GGML_UNREACHABLE() ((void) 0)
+#    define GGML_UNREACHABLE() ((void) 0)
 #endif
 
+#ifdef __cplusplus
+#    define GGML_NORETURN [[noreturn]]
+#elif defined(_MSC_VER)
+#    define GGML_NORETURN __declspec(noreturn)
+#else
+#    define GGML_NORETURN _Noreturn
+#endif
+
+#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
+#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
+
 // used to copy the number of elements and stride in bytes of tensors into local variables.
 // main purpose is to reduce code duplication and improve readability.
 //
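GGML_ASSERT is now a thin wrapper over GGML_ABORT, which funnels into the printf-style ggml_abort() declared a few hunks below. That makes failure messages parameterizable. A short usage sketch; the dimension check itself is made up for illustration:

    #include "ggml.h"

    void check_dims(const struct ggml_tensor * t, int64_t expected_ne0) {
        GGML_ASSERT(t != NULL); // expands to GGML_ABORT("GGML_ASSERT(%s) failed", "t != NULL")
        if (t->ne[0] != expected_ne0) {
            // unlike the old fixed-message assert, GGML_ABORT takes format arguments
            GGML_ABORT("bad ne[0]: got %lld, expected %lld",
                       (long long) t->ne[0], (long long) expected_ne0);
        }
    }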
@@ -322,6 +318,9 @@
 extern "C" {
 #endif
 
+    GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
+    GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
+
     enum ggml_status {
         GGML_STATUS_ALLOC_FAILED = -2,
         GGML_STATUS_FAILED       = -1,
@@ -330,7 +329,7 @@ extern "C" {
     };
 
     // get ggml_status name string
-    GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
+    GGML_API const char * ggml_status_to_string(enum ggml_status status);
 
     // ieee 754-2008 half-precision float16
     // todo: make this not an integral type
@@ -345,10 +344,12 @@ extern "C" {
     GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
     GGML_API float       ggml_bf16_to_fp32(ggml_bf16_t);  // consider just doing << 16
     GGML_API void        ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
+    GGML_API void        ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t);
     GGML_API void        ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
 
     struct ggml_object;
     struct ggml_context;
+    struct ggml_cgraph;
 
     // NOTE: always add types at the end of the enum to keep backward compatibility
     enum ggml_type {
@@ -383,6 +384,11 @@ extern "C" {
        GGML_TYPE_F64      = 28,
        GGML_TYPE_IQ1_M    = 29,
        GGML_TYPE_BF16     = 30,
+       GGML_TYPE_Q4_0_4_4 = 31,
+       GGML_TYPE_Q4_0_4_8 = 32,
+       GGML_TYPE_Q4_0_8_8 = 33,
+       GGML_TYPE_TQ1_0    = 34,
+       GGML_TYPE_TQ2_0    = 35,
        GGML_TYPE_COUNT,
     };
@@ -424,6 +430,9 @@ extern "C" {
        GGML_FTYPE_MOSTLY_IQ4_XS   = 22, // except 1d tensors
        GGML_FTYPE_MOSTLY_IQ1_M    = 23, // except 1d tensors
        GGML_FTYPE_MOSTLY_BF16     = 24, // except 1d tensors
+       GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
+       GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
+       GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
     };
 
     // available tensor operations:
@@ -440,10 +449,13 @@ extern "C" {
        GGML_OP_SQR,
        GGML_OP_SQRT,
        GGML_OP_LOG,
+       GGML_OP_SIN,
+       GGML_OP_COS,
        GGML_OP_SUM,
        GGML_OP_SUM_ROWS,
        GGML_OP_MEAN,
        GGML_OP_ARGMAX,
+       GGML_OP_COUNT_EQUAL,
        GGML_OP_REPEAT,
        GGML_OP_REPEAT_BACK,
        GGML_OP_CONCAT,
@@ -477,9 +489,11 @@ extern "C" {
        GGML_OP_CLAMP,
        GGML_OP_CONV_TRANSPOSE_1D,
        GGML_OP_IM2COL,
+       GGML_OP_IM2COL_BACK,
        GGML_OP_CONV_TRANSPOSE_2D,
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,
+       GGML_OP_POOL_2D_BACK,
        GGML_OP_UPSCALE, // nearest interpolate
        GGML_OP_PAD,
        GGML_OP_ARANGE,
@@ -495,6 +509,7 @@ extern "C" {
        GGML_OP_WIN_UNPART,
        GGML_OP_GET_REL_POS,
        GGML_OP_ADD_REL_POS,
+       GGML_OP_RWKV_WKV,
 
        GGML_OP_UNARY,
 
@@ -511,6 +526,7 @@ extern "C" {
 
        GGML_OP_CROSS_ENTROPY_LOSS,
        GGML_OP_CROSS_ENTROPY_LOSS_BACK,
+       GGML_OP_OPT_STEP_ADAMW,
 
        GGML_OP_COUNT,
     };
@@ -529,6 +545,7 @@ extern "C" {
        GGML_UNARY_OP_SILU,
        GGML_UNARY_OP_HARDSWISH,
        GGML_UNARY_OP_HARDSIGMOID,
+       GGML_UNARY_OP_EXP,
 
        GGML_UNARY_OP_COUNT,
     };
@@ -540,35 +557,25 @@ extern "C" {
     };
 
     enum ggml_log_level {
-        GGML_LOG_LEVEL_ERROR = 2,
-        GGML_LOG_LEVEL_WARN  = 3,
-        GGML_LOG_LEVEL_INFO  = 4,
-        GGML_LOG_LEVEL_DEBUG = 5
+        GGML_LOG_LEVEL_NONE  = 0,
+        GGML_LOG_LEVEL_INFO  = 1,
+        GGML_LOG_LEVEL_WARN  = 2,
+        GGML_LOG_LEVEL_ERROR = 3,
+        GGML_LOG_LEVEL_DEBUG = 4,
+        GGML_LOG_LEVEL_CONT  = 5, // continue previous log
     };
 
+    // this tensor...
     enum ggml_tensor_flag {
-        GGML_TENSOR_FLAG_INPUT  = 1,
-        GGML_TENSOR_FLAG_OUTPUT = 2,
-        GGML_TENSOR_FLAG_PARAM  = 4,
+        GGML_TENSOR_FLAG_INPUT  = 1, // ...is an input for the GGML compute graph
+        GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
+        GGML_TENSOR_FLAG_PARAM  = 4, // ...contains trainable parameters
+        GGML_TENSOR_FLAG_LOSS   = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
     };
 
-    // ggml object
-    struct ggml_object {
-        size_t offs;
-        size_t size;
-
-        struct ggml_object * next;
-
-        enum ggml_object_type type;
-
-        char padding[4];
-    };
-
-    static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
-
     // n-dimensional tensor
     struct ggml_tensor {
         enum ggml_type type;
 
         GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
@@ -611,6 +618,29 @@ extern "C" {
     // If it returns true, the computation is aborted
     typedef bool (*ggml_abort_callback)(void * data);
 
+    // Scheduling priorities
+    enum ggml_sched_priority {
+        GGML_SCHED_PRIO_NORMAL,
+        GGML_SCHED_PRIO_MEDIUM,
+        GGML_SCHED_PRIO_HIGH,
+        GGML_SCHED_PRIO_REALTIME
+    };
+
+    // Threadpool params
+    // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
+    struct ggml_threadpool_params {
+        bool                     cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
+        int                      n_threads;                   // number of threads
+        enum ggml_sched_priority prio;                        // thread priority
+        uint32_t                 poll;                        // polling level (0 - no polling, 100 - aggressive polling)
+        bool                     strict_cpu;                  // strict cpu placement
+        bool                     paused;                      // start in paused state
+    };
+
+    struct ggml_threadpool; // forward declaration, see ggml.c
+
+    typedef struct ggml_threadpool * ggml_threadpool_t;
+
     // the compute plan that needs to be prepared for ggml_graph_compute()
     // since https://github.com/ggerganov/ggml/issues/287
     struct ggml_cplan {
@@ -618,39 +648,15 @@ extern "C" {
         uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
 
         int n_threads;
+        struct ggml_threadpool * threadpool;
 
         // abort ggml_graph_compute when true
         ggml_abort_callback abort_callback;
         void *              abort_callback_data;
     };
 
-    enum ggml_cgraph_eval_order {
-        GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
-        GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
-        GGML_CGRAPH_EVAL_ORDER_COUNT
-    };
-
-    struct ggml_hash_set {
-        size_t size;
-        struct ggml_tensor ** keys;
-    };
-
-    // computation graph
-    struct ggml_cgraph {
-        int size;
-        int n_nodes;
-        int n_leafs;
-
-        struct ggml_tensor ** nodes;
-        struct ggml_tensor ** grads;
-        struct ggml_tensor ** leafs;
-
-        struct ggml_hash_set visited_hash_table;
-
-        enum ggml_cgraph_eval_order order;
-    };
-
     // scratch buffer
+    // TODO: deprecate and remove
     struct ggml_scratch {
         size_t offs;
         size_t size;
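Two things happen in the hunks above: struct ggml_cgraph becomes opaque (its definition moves out of the public header, with accessors added later in this diff), and the cplan gains an optional threadpool. Thread count, affinity, and priority now live in ggml_threadpool_params. A hedged sketch of filling the params by hand; ggml_threadpool_params_init(), declared in a later hunk, is the supported way to get defaults:

    #include "ggml.h"
    #include <string.h>

    struct ggml_threadpool_params make_params(int n_threads) {
        struct ggml_threadpool_params p;
        memset(&p, 0, sizeof(p));      // all-zero cpumask = default affinity
        p.n_threads  = n_threads;
        p.prio       = GGML_SCHED_PRIO_NORMAL;
        p.poll       = 50;             // moderate polling; 0 = none, 100 = aggressive
        p.strict_cpu = false;
        p.paused     = false;          // start threads immediately
        return p;
    }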
@@ -692,8 +698,6 @@ extern "C" {
     GGML_API int64_t ggml_cycles(void);
     GGML_API int64_t ggml_cycles_per_ms(void);
 
-    GGML_API void    ggml_print_backtrace(void);
-
     // accepts a UTF-8 path, even on Windows
     GGML_API FILE *  ggml_fopen(const char * fname, const char * mode);
 
@@ -703,50 +707,52 @@ extern "C" {
     GGML_API void    ggml_print_object (const struct ggml_object * obj);
     GGML_API void    ggml_print_objects(const struct ggml_context * ctx);
 
-    GGML_API GGML_CALL int64_t ggml_nelements(const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
-    GGML_API GGML_CALL int64_t ggml_nrows    (const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nrows     (const struct ggml_tensor * tensor);
-    GGML_API GGML_CALL size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes    (const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
+    GGML_API size_t  ggml_nbytes_pad(const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
 
-    GGML_API GGML_CALL int    ggml_blck_size(enum ggml_type type);
+    GGML_API int64_t ggml_blck_size(enum ggml_type type);
-    GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
+    GGML_API size_t  ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
-    GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+    GGML_API size_t  ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
 
     GGML_DEPRECATED(
     GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
     "use ggml_row_size() instead");
 
-    GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
+    GGML_API const char * ggml_type_name(enum ggml_type type);
-    GGML_API GGML_CALL const char * ggml_op_name  (enum ggml_op   op);
+    GGML_API const char * ggml_op_name  (enum ggml_op   op);
     GGML_API const char * ggml_op_symbol(enum ggml_op   op);
 
     GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
-    GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
+    GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
 
-    GGML_API GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor);
+    GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
 
-    GGML_API GGML_CALL bool ggml_is_quantized(enum ggml_type type);
+    GGML_API bool ggml_is_quantized(enum ggml_type type);
 
     // TODO: temporary until model loading of ggml examples is refactored
     GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
 
-    GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
-    GGML_API GGML_CALL bool ggml_is_permuted  (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
-    GGML_API GGML_CALL bool ggml_is_empty     (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_empty     (const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_scalar    (const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_vector    (const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_matrix    (const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
     GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
 
-    GGML_API GGML_CALL bool ggml_is_contiguous  (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_contiguous  (const struct ggml_tensor * tensor);
-    GGML_API GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
+    GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
-    GGML_API GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
+    GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
-    GGML_API GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
+    GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
 
     GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
     GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 
+    GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+
     // use this to compute the memory overhead of a tensor
     GGML_API size_t ggml_tensor_overhead(void);
 
@@ -754,8 +760,9 @@ extern "C" {
 
     // main
 
-    GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
+    GGML_API struct ggml_context * ggml_init (struct ggml_init_params params);
-    GGML_API void                  ggml_free(struct ggml_context * ctx);
+    GGML_API void                  ggml_reset(struct ggml_context * ctx);
+    GGML_API void                  ggml_free (struct ggml_context * ctx);
 
     GGML_API size_t  ggml_used_mem(const struct ggml_context * ctx);
 
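ggml_reset() is new next to ggml_init()/ggml_free(): it clears a context's allocations so the same arena can be reused without tearing it down. A hedged sketch of the intended loop:

    #include "ggml.h"

    void run_many(size_t mem_size) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ mem_size,
            /*.mem_buffer =*/ NULL,   // let ggml allocate the arena
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);
        for (int i = 0; i < 100; i++) {
            // ... build and compute a graph in ctx ...
            ggml_reset(ctx); // drop all tensors, keep the arena for the next iteration
        }
        ggml_free(ctx);
    }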
@@ -832,7 +839,7 @@ extern "C" {
     GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
     GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
 
-    GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
+    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
 
     GGML_API const char *         ggml_get_name(const struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_set_name(      struct ggml_tensor * tensor, const char * name);
@@ -953,6 +960,22 @@ extern "C" {
             struct ggml_context * ctx,
             struct ggml_tensor  * a);
 
+    GGML_API struct ggml_tensor * ggml_sin(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_sin_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_cos(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_cos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
     // return scalar
     GGML_API struct ggml_tensor * ggml_sum(
             struct ggml_context * ctx,
|
|||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a);
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
|
// count number of equal elements in a and b
|
||||||
|
GGML_API struct ggml_tensor * ggml_count_equal(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * b);
|
||||||
|
|
||||||
// if a is the same shape as b, and a is not parameter, return a
|
// if a is the same shape as b, and a is not parameter, return a
|
||||||
// otherwise, return a new tensor: repeat(a) to fit in b
|
// otherwise, return a new tensor: repeat(a) to fit in b
|
||||||
GGML_API struct ggml_tensor * ggml_repeat(
|
GGML_API struct ggml_tensor * ggml_repeat(
|
||||||
@@ -1103,6 +1132,14 @@ extern "C" {
|
|||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a);
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_exp(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_exp_inplace(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a);
|
||||||
|
|
||||||
// normalize along rows
|
// normalize along rows
|
||||||
GGML_API struct ggml_tensor * ggml_norm(
|
GGML_API struct ggml_tensor * ggml_norm(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
@@ -1126,16 +1163,17 @@ extern "C" {
|
|||||||
|
|
||||||
// group normalize along ne0*ne1*n_groups
|
// group normalize along ne0*ne1*n_groups
|
||||||
// used in stable-diffusion
|
// used in stable-diffusion
|
||||||
// TODO: eps is hardcoded to 1e-6 for now
|
|
||||||
GGML_API struct ggml_tensor * ggml_group_norm(
|
GGML_API struct ggml_tensor * ggml_group_norm(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
int n_groups);
|
int n_groups,
|
||||||
|
float eps);
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_group_norm_inplace(
|
GGML_API struct ggml_tensor * ggml_group_norm_inplace(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
int n_groups);
|
int n_groups,
|
||||||
|
float eps);
|
||||||
|
|
||||||
// a - x
|
// a - x
|
||||||
// b - dy
|
// b - dy
|
||||||
@@ -1197,7 +1235,7 @@ extern "C" {
|
|||||||
size_t nb1,
|
size_t nb1,
|
||||||
size_t nb2,
|
size_t nb2,
|
||||||
size_t nb3,
|
size_t nb3,
|
||||||
size_t offset);
|
size_t offset); // in bytes
|
||||||
|
|
||||||
// b -> view(a,offset,nb1,nb2,3), return view(a)
|
// b -> view(a,offset,nb1,nb2,3), return view(a)
|
||||||
GGML_API struct ggml_tensor * ggml_set_inplace(
|
GGML_API struct ggml_tensor * ggml_set_inplace(
|
||||||
@@ -1207,19 +1245,19 @@ extern "C" {
|
|||||||
size_t nb1,
|
size_t nb1,
|
||||||
size_t nb2,
|
size_t nb2,
|
||||||
size_t nb3,
|
size_t nb3,
|
||||||
size_t offset);
|
size_t offset); // in bytes
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_set_1d(
|
GGML_API struct ggml_tensor * ggml_set_1d(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b,
|
||||||
size_t offset);
|
size_t offset); // in bytes
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_set_1d_inplace(
|
GGML_API struct ggml_tensor * ggml_set_1d_inplace(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b,
|
||||||
size_t offset);
|
size_t offset); // in bytes
|
||||||
|
|
||||||
// b -> view(a,offset,nb1,nb2,3), return modified a
|
// b -> view(a,offset,nb1,nb2,3), return modified a
|
||||||
GGML_API struct ggml_tensor * ggml_set_2d(
|
GGML_API struct ggml_tensor * ggml_set_2d(
|
||||||
@@ -1227,7 +1265,7 @@ extern "C" {
|
|||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b,
|
||||||
size_t nb1,
|
size_t nb1,
|
||||||
size_t offset);
|
size_t offset); // in bytes
|
||||||
|
|
||||||
// b -> view(a,offset,nb1,nb2,3), return view(a)
|
// b -> view(a,offset,nb1,nb2,3), return view(a)
|
||||||
GGML_API struct ggml_tensor * ggml_set_2d_inplace(
|
GGML_API struct ggml_tensor * ggml_set_2d_inplace(
|
||||||
@@ -1235,7 +1273,7 @@ extern "C" {
|
|||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b,
|
||||||
size_t nb1,
|
size_t nb1,
|
||||||
size_t offset);
|
size_t offset); // in bytes
|
||||||
|
|
||||||
// a -> b, return view(b)
|
// a -> b, return view(b)
|
||||||
GGML_API struct ggml_tensor * ggml_cpy(
|
GGML_API struct ggml_tensor * ggml_cpy(
|
||||||
@@ -1370,14 +1408,14 @@ extern "C" {
|
|||||||
// supports 3D: a->ne[2] == b->ne[1]
|
// supports 3D: a->ne[2] == b->ne[1]
|
||||||
GGML_API struct ggml_tensor * ggml_get_rows(
|
GGML_API struct ggml_tensor * ggml_get_rows(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a, // data
|
||||||
struct ggml_tensor * b);
|
struct ggml_tensor * b); // row indices
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_get_rows_back(
|
GGML_API struct ggml_tensor * ggml_get_rows_back(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a, // gradients of ggml_get_rows result
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b, // row indices
|
||||||
struct ggml_tensor * c);
|
struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_diag(
|
GGML_API struct ggml_tensor * ggml_diag(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
@@ -1438,11 +1476,10 @@ extern "C" {
|
|||||||
struct ggml_tensor * b);
|
struct ggml_tensor * b);
|
||||||
|
|
||||||
// rotary position embedding
|
// rotary position embedding
|
||||||
// if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
|
// if (mode & 1) - skip n_past elements (NOT SUPPORTED)
|
||||||
// if mode & 2 == 1, GPT-NeoX style
|
// if (mode & GGML_ROPE_TYPE_NEOX) - GPT-NeoX style
|
||||||
//
|
//
|
||||||
// b is an int32 vector with size a->ne[2], it contains the positions
|
// b is an int32 vector with size a->ne[2], it contains the positions
|
||||||
// c is freq factors (e.g. phi3-128k), (optional)
|
|
||||||
GGML_API struct ggml_tensor * ggml_rope(
|
GGML_API struct ggml_tensor * ggml_rope(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
@@ -1459,6 +1496,7 @@ extern "C" {
|
|||||||
int mode);
|
int mode);
|
||||||
|
|
||||||
// custom RoPE
|
// custom RoPE
|
||||||
|
// c is freq factors (e.g. phi3-128k), (optional)
|
||||||
GGML_API struct ggml_tensor * ggml_rope_ext(
|
GGML_API struct ggml_tensor * ggml_rope_ext(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
@@ -1521,16 +1559,16 @@ extern "C" {
|
|||||||
"use ggml_rope_ext_inplace instead");
|
"use ggml_rope_ext_inplace instead");
|
||||||
|
|
||||||
// compute correction dims for YaRN RoPE scaling
|
// compute correction dims for YaRN RoPE scaling
|
||||||
GGML_CALL void ggml_rope_yarn_corr_dims(
|
void ggml_rope_yarn_corr_dims(
|
||||||
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
||||||
|
|
||||||
// rotary position embedding backward, i.e compute dx from dy
|
// rotary position embedding backward, i.e compute dx from dy
|
||||||
// a - dy
|
// a - dy
|
||||||
GGML_API struct ggml_tensor * ggml_rope_back(
|
GGML_API struct ggml_tensor * ggml_rope_back(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a, // gradients of ggml_rope result
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b, // positions
|
||||||
struct ggml_tensor * c,
|
struct ggml_tensor * c, // freq factors
|
||||||
int n_dims,
|
int n_dims,
|
||||||
int mode,
|
int mode,
|
||||||
int n_ctx_orig,
|
int n_ctx_orig,
|
||||||
@@ -1549,34 +1587,49 @@ extern "C" {
|
|||||||
float min,
|
float min,
|
||||||
float max);
|
float max);
|
||||||
|
|
||||||
|
// im2col
|
||||||
|
// converts data into a format that effectively results in a convolution when combined with matrix multiplication
|
||||||
GGML_API struct ggml_tensor * ggml_im2col(
|
GGML_API struct ggml_tensor * ggml_im2col(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a, // convolution kernel
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b, // data
|
||||||
int s0,
|
int s0, // stride dimension 0
|
||||||
int s1,
|
int s1, // stride dimension 1
|
||||||
int p0,
|
int p0, // padding dimension 0
|
||||||
int p1,
|
int p1, // padding dimension 1
|
||||||
int d0,
|
int d0, // dilation dimension 0
|
||||||
int d1,
|
int d1, // dilation dimension 1
|
||||||
bool is_2D,
|
bool is_2D,
|
||||||
enum ggml_type dst_type);
|
enum ggml_type dst_type);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_im2col_back(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a, // convolution kernel
|
||||||
|
struct ggml_tensor * b, // gradient of im2col output
|
||||||
|
int64_t * ne, // shape of im2col input
|
||||||
|
int s0, // stride dimension 0
|
||||||
|
int s1, // stride dimension 1
|
||||||
|
int p0, // padding dimension 0
|
||||||
|
int p1, // padding dimension 1
|
||||||
|
int d0, // dilation dimension 0
|
||||||
|
int d1, // dilation dimension 1
|
||||||
|
bool is_2D);
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
|
GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a, // convolution kernel
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b, // data
|
||||||
int s0,
|
int s0, // stride dimension 0
|
||||||
int s1,
|
int s1, // stride dimension 1
|
||||||
int p0,
|
int p0, // padding dimension 0
|
||||||
int p1,
|
int p1, // padding dimension 1
|
||||||
int d0,
|
int d0, // dilation dimension 0
|
||||||
int d1);
|
int d1); // dilation dimension 1
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_conv_1d(
|
GGML_API struct ggml_tensor * ggml_conv_1d(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a, // convolution kernel
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b, // data
|
||||||
int s0, // stride
|
int s0, // stride
|
||||||
int p0, // padding
|
int p0, // padding
|
||||||
int d0); // dilation
|
int d0); // dilation
|
||||||
@@ -1585,29 +1638,29 @@ extern "C" {
|
|||||||
// alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
|
// alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
|
||||||
GGML_API struct ggml_tensor* ggml_conv_1d_ph(
|
GGML_API struct ggml_tensor* ggml_conv_1d_ph(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a, // convolution kernel
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b, // data
|
||||||
int s,
|
int s, // stride
|
||||||
int d);
|
int d); // dilation
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
|
GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a, // convolution kernel
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b, // data
|
||||||
int s0,
|
int s0, // stride
|
||||||
int p0,
|
int p0, // padding
|
||||||
int d0);
|
int d0); // dilation
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_conv_2d(
|
GGML_API struct ggml_tensor * ggml_conv_2d(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a, // convolution kernel
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b, // data
|
||||||
int s0,
|
int s0, // stride dimension 0
|
||||||
int s1,
|
int s1, // stride dimension 1
|
||||||
int p0,
|
int p0, // padding dimension 0
|
||||||
int p1,
|
int p1, // padding dimension 1
|
||||||
int d0,
|
int d0, // dilation dimension 0
|
||||||
int d1);
|
int d1); // dilation dimension 1
|
||||||
|
|
||||||
|
|
||||||
// kernel size is a->ne[0] x a->ne[1]
|
// kernel size is a->ne[0] x a->ne[1]
|
||||||
@@ -1669,6 +1722,18 @@ extern "C" {
|
|||||||
float p0,
|
float p0,
|
||||||
float p1);
|
float p1);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_pool_2d_back(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * af, // "a"/input used in forward pass
|
||||||
|
enum ggml_op_pool op,
|
||||||
|
int k0,
|
||||||
|
int k1,
|
||||||
|
int s0,
|
||||||
|
int s1,
|
||||||
|
float p0,
|
||||||
|
float p1);
|
||||||
|
|
||||||
// nearest interpolate
|
// nearest interpolate
|
||||||
// multiplies ne0 and ne1 by scale factor
|
// multiplies ne0 and ne1 by scale factor
|
||||||
// used in stable-diffusion
|
// used in stable-diffusion
|
||||||
@@ -1743,7 +1808,8 @@ extern "C" {
|
|||||||
struct ggml_tensor * v,
|
struct ggml_tensor * v,
|
||||||
struct ggml_tensor * mask,
|
struct ggml_tensor * mask,
|
||||||
float scale,
|
float scale,
|
||||||
float max_bias);
|
float max_bias,
|
||||||
|
float logit_softcap);
|
||||||
|
|
||||||
GGML_API void ggml_flash_attn_ext_set_prec(
|
GGML_API void ggml_flash_attn_ext_set_prec(
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a,
|
||||||
@@ -1760,10 +1826,8 @@ extern "C" {
|
|||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_ssm_conv(
|
GGML_API struct ggml_tensor * ggml_ssm_conv(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * s,
|
struct ggml_tensor * sx,
|
||||||
struct ggml_tensor * x,
|
struct ggml_tensor * c);
|
||||||
struct ggml_tensor * c,
|
|
||||||
struct ggml_tensor * sq);
|
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_ssm_scan(
|
GGML_API struct ggml_tensor * ggml_ssm_scan(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
@@ -1772,8 +1836,7 @@ extern "C" {
|
|||||||
struct ggml_tensor * dt,
|
struct ggml_tensor * dt,
|
||||||
struct ggml_tensor * A,
|
struct ggml_tensor * A,
|
||||||
struct ggml_tensor * B,
|
struct ggml_tensor * B,
|
||||||
struct ggml_tensor * C,
|
struct ggml_tensor * C);
|
||||||
struct ggml_tensor * sq);
|
|
||||||
|
|
||||||
// partition into non-overlapping windows with padding if needed
|
// partition into non-overlapping windows with padding if needed
|
||||||
// example:
|
// example:
|
||||||
@@ -1825,6 +1888,15 @@ extern "C" {
|
|||||||
struct ggml_tensor * pw,
|
struct ggml_tensor * pw,
|
||||||
struct ggml_tensor * ph);
|
struct ggml_tensor * ph);
|
||||||
|
|
||||||
|
GGML_API struct ggml_tensor * ggml_rwkv_wkv(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * k,
|
||||||
|
struct ggml_tensor * v,
|
||||||
|
struct ggml_tensor * r,
|
||||||
|
struct ggml_tensor * tf,
|
||||||
|
struct ggml_tensor * td,
|
||||||
|
struct ggml_tensor * state);
|
||||||
|
|
||||||
// custom operators
|
// custom operators
|
||||||
|
|
||||||
typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
|
typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
|
||||||
@@ -1908,7 +1980,8 @@ extern "C" {
|
|||||||
typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
|
typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
|
||||||
typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
|
typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
|
||||||
|
|
||||||
#define GGML_N_TASKS_MAX -1
|
#define GGML_N_TASKS_MAX (-1)
|
||||||
|
// n_tasks == GGML_N_TASKS_MAX means to use max number of tasks
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_map_custom1(
|
GGML_API struct ggml_tensor * ggml_map_custom1(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
@@ -1961,44 +2034,84 @@ extern "C" {
|
|||||||
// loss function
|
// loss function
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
|
GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a, // logits
|
||||||
struct ggml_tensor * b);
|
struct ggml_tensor * b); // labels
|
||||||
|
|
||||||
GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
|
GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_tensor * a,
|
struct ggml_tensor * a, // logits
|
||||||
struct ggml_tensor * b,
|
struct ggml_tensor * b, // labels
|
||||||
struct ggml_tensor * c);
|
struct ggml_tensor * c); // gradients of cross_entropy_loss result
|
||||||
|
|
||||||
|
// AdamW optimizer step
|
||||||
|
// Paper: https://arxiv.org/pdf/1711.05101v3.pdf
|
||||||
|
// PyTorch: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html
|
||||||
|
GGML_API struct ggml_tensor * ggml_opt_step_adamw(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_tensor * a,
|
||||||
|
struct ggml_tensor * grad,
|
||||||
|
float alpha,
|
||||||
|
float beta1,
|
||||||
|
float beta2,
|
||||||
|
float eps,
|
||||||
|
float wd); // weight decay
|
||||||
|
|
||||||
//
|
//
|
||||||
// automatic differentiation
|
// automatic differentiation
|
||||||
//
|
//
|
||||||
|
|
||||||
GGML_API void ggml_set_param(
|
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
|
||||||
struct ggml_context * ctx,
|
GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
|
||||||
struct ggml_tensor * tensor);
|
|
||||||
|
|
||||||
|
|
||||||
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||||
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
|
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate);
|
||||||
|
|
||||||
|
GGML_API void ggml_build_opt_adamw(
|
||||||
|
struct ggml_context * ctx,
|
||||||
|
struct ggml_cgraph * gf,
|
||||||
|
struct ggml_cgraph * gb,
|
||||||
|
float alpha,
|
||||||
|
float beta1,
|
||||||
|
float beta2,
|
||||||
|
float eps,
|
||||||
|
float wd); // weight decay
|
||||||
|
|
||||||
// graph allocation in a context
|
// graph allocation in a context
|
||||||
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
|
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
|
||||||
GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
|
GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
|
||||||
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
|
||||||
GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1);
|
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
|
||||||
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
|
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
|
||||||
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
|
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
|
||||||
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
|
|
||||||
|
GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph);
|
||||||
|
GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
|
||||||
|
GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph);
|
||||||
|
GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
|
||||||
|
|
||||||
|
GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||||
|
|
||||||
GGML_API size_t ggml_graph_overhead(void);
|
GGML_API size_t ggml_graph_overhead(void);
|
||||||
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
|
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
|
||||||
|
|
||||||
|
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||||
|
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
|
||||||
|
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
|
||||||
|
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||||
|
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||||
|
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
||||||
|
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||||
|
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||||
|
|
||||||
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||||
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||||
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
|
GGML_API struct ggml_cplan ggml_graph_plan(
|
||||||
GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
const struct ggml_cgraph * cgraph,
|
||||||
|
int n_threads, /* = GGML_DEFAULT_N_THREADS */
|
||||||
|
struct ggml_threadpool * threadpool /* = NULL */ );
|
||||||
|
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
||||||
|
|
||||||
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
||||||
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
||||||
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
||||||
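ggml_graph_plan() now takes the optional threadpool, tying together the threadpool API added above. A hedged sketch of computing a prepared graph on an explicit pool; allocating work_data with plain malloc is illustrative, any caller-owned buffer works:

    #include "ggml.h"
    #include <stdlib.h>

    enum ggml_status compute_on_pool(struct ggml_cgraph * gf, int n_threads) {
        struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
        struct ggml_threadpool * tp = ggml_threadpool_new(&tpp);

        struct ggml_cplan plan = ggml_graph_plan(gf, n_threads, tp);
        if (plan.work_size > 0) {
            plan.work_data = malloc(plan.work_size); // caller-owned work buffer
        }

        enum ggml_status st = ggml_graph_compute(gf, &plan);

        free(plan.work_data);
        ggml_threadpool_free(tp);
        return st;
    }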
@@ -2062,6 +2175,10 @@ extern "C" {
     typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
     typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
 
+    // Set callback for all future logging events.
+    // If this is not called, or NULL is supplied, everything is output on stderr.
+    GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
+
     // optimization parameters
     //
     //   see ggml.c (ggml_opt_default_params) for default values
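ggml_log_set() replaces the per-backend log callbacks removed earlier in this diff (both CUDA and Metal lose theirs). A hedged sketch of a filtering callback; note that the log levels were renumbered above, so filter by name rather than by magnitude:

    #include "ggml.h"
    #include <stdio.h>

    static void my_log(enum ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        // levels are now NONE=0 ... CONT=5; a ">=" threshold would also pass DEBUG
        if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR) {
            fputs(text, stderr);
        }
    }

    void init_logging(void) {
        ggml_log_set(my_log, NULL); // one callback now covers core + all backends
    }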
@@ -2387,10 +2504,16 @@ extern "C" {
     GGML_API int ggml_cpu_has_gpublas    (void);
     GGML_API int ggml_cpu_has_sse3       (void);
     GGML_API int ggml_cpu_has_ssse3      (void);
+    GGML_API int ggml_cpu_has_riscv_v    (void);
     GGML_API int ggml_cpu_has_sycl       (void);
     GGML_API int ggml_cpu_has_rpc        (void);
     GGML_API int ggml_cpu_has_vsx        (void);
     GGML_API int ggml_cpu_has_matmul_int8(void);
+    GGML_API int ggml_cpu_has_cann       (void);
+    GGML_API int ggml_cpu_has_llamafile  (void);
+
+    // get the sve vector length in bytes
+    GGML_API int ggml_cpu_get_sve_cnt(void);
 
     //
     // Internal types and functions exposed for tests and benchmarks
@@ -2404,20 +2527,31 @@ extern "C" {
 #endif
     typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
     typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
-    typedef void (*ggml_vec_dot_t)   (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
-                                      const void * GGML_RESTRICT y, size_t by, int nrc);
+    typedef void (*ggml_from_float_to_mat_t)
+                                     (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
+    typedef void (*ggml_vec_dot_t)   (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
+                                      const void * GGML_RESTRICT y, size_t by, int nrc);
+    typedef void (*ggml_gemv_t)      (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
+                                      const void * GGML_RESTRICT y, int nr, int nc);
+    typedef void (*ggml_gemm_t)      (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
+                                      const void * GGML_RESTRICT y, int nr, int nc);

     typedef struct {
         const char             * type_name;
-        int                      blck_size;
+        int64_t                  blck_size;
+        int64_t                  blck_size_interleave; // interleave elements in blocks
         size_t                   type_size;
         bool                     is_quantized;
         ggml_to_float_t          to_float;
         ggml_from_float_t        from_float;
-        ggml_from_float_t        from_float_reference;
+        ggml_from_float_t        from_float_ref;
+        ggml_from_float_to_mat_t from_float_to_mat;
         ggml_vec_dot_t           vec_dot;
         enum ggml_type           vec_dot_type;
-        int64_t                  nrows; // number of rows to process simultaneously;
+        int64_t                  nrows; // number of rows to process simultaneously
+        int64_t                  ncols; // number of columns to process simultaneously
+        ggml_gemv_t              gemv;
+        ggml_gemm_t              gemm;
     } ggml_type_traits_t;

     GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
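The widened traits table can be inspected through the accessor declared above; a hedged sketch for tests or benchmarks (fields per the new struct; the remark that gemv/gemm are NULL for types without an interleaved fast path is an assumption):

#include <inttypes.h>
#include <stdio.h>
#include "ggml.h"

int main(void) {
    ggml_type_traits_t tt = ggml_internal_get_type_traits(GGML_TYPE_Q4_0);
    printf("%s: blck_size=%" PRId64 " type_size=%zu nrows=%" PRId64 " ncols=%" PRId64 "\n",
           tt.type_name, tt.blck_size, tt.type_size, tt.nrows, tt.ncols);
    printf("has gemm fast path: %s\n", tt.gemm ? "yes" : "no");
    return 0;
}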
@@ -26,6 +26,9 @@ if (NOT MSVC)
     endif()
 endif()

+unset(GGML_EXTRA_LIBS_PRIVATE)
+unset(GGML_EXTRA_LIBS_PUBLIC)
+
 if (APPLE AND GGML_ACCELERATE)
     find_library(ACCELERATE_FRAMEWORK Accelerate)
     if (ACCELERATE_FRAMEWORK)
@@ -35,7 +38,7 @@ if (APPLE AND GGML_ACCELERATE)
         add_compile_definitions(ACCELERATE_NEW_LAPACK)
         add_compile_definitions(ACCELERATE_LAPACK_ILP64)

-        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
+        list(APPEND GGML_EXTRA_LIBS_PRIVATE ${ACCELERATE_FRAMEWORK})
     else()
         message(WARNING "Accelerate framework not found")
     endif()
@@ -87,7 +90,7 @@ if (GGML_METAL)
             COMMENT "Generate assembly for embedded Metal library"
         )

-        set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM})
+        list(APPEND GGML_SOURCES_METAL ${METALLIB_EMBED_ASM})
     else()
         if (GGML_METAL_SHADER_DEBUG)
             # custom command to do the following:
@@ -132,13 +135,24 @@ if (GGML_METAL)
         )
     endif() # GGML_METAL_EMBED_LIBRARY

-    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS}
+    list(APPEND GGML_EXTRA_LIBS_PRIVATE
         ${FOUNDATION_LIBRARY}
         ${METAL_FRAMEWORK}
         ${METALKIT_FRAMEWORK}
         )
 endif()

+if (GGML_MUSA)
+    set(CMAKE_C_COMPILER clang)
+    set(CMAKE_C_EXTENSIONS OFF)
+    set(CMAKE_CXX_COMPILER clang++)
+    set(CMAKE_CXX_EXTENSIONS OFF)
+
+    set(GGML_CUDA ON)
+
+    list(APPEND GGML_CDEF_PUBLIC GGML_USE_MUSA)
+endif()
+
 if (GGML_OPENMP)
     find_package(OpenMP)
     if (OpenMP_FOUND)
@@ -146,7 +160,12 @@ if (GGML_OPENMP)

         add_compile_definitions(GGML_USE_OPENMP)

-        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+        list(APPEND GGML_EXTRA_LIBS_PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+
+        if (GGML_MUSA)
+            list(APPEND GGML_EXTRA_INCLUDES     "/usr/lib/llvm-10/include/openmp")
+            list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-10/lib/libomp.so")
+        endif()
     else()
         message(WARNING "OpenMP not found")
     endif()
@@ -228,8 +247,8 @@ if (GGML_BLAS)
         set(GGML_HEADERS_BLAS ../include/ggml-blas.h)
         set(GGML_SOURCES_BLAS ggml-blas.cpp)

-        set(GGML_EXTRA_LIBS     ${GGML_EXTRA_LIBS}     ${BLAS_LIBRARIES})
-        set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
+        list(APPEND GGML_EXTRA_LIBS_PRIVATE ${BLAS_LIBRARIES})
+        list(APPEND GGML_EXTRA_INCLUDES     ${BLAS_INCLUDE_DIRS})
     else()
         message(WARNING "BLAS not found, please refer to "
                         "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
@@ -238,18 +257,24 @@ if (GGML_BLAS)
 endif()

 if (GGML_LLAMAFILE)
-    message(STATUS "Using ggml SGEMM")
+    message(STATUS "Using llamafile")

     add_compile_definitions(GGML_USE_LLAMAFILE)

-    set(GGML_HEADERS_LLAMAFILE sgemm.h)
-    set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
+    set(GGML_HEADERS_LLAMAFILE llamafile/sgemm.h)
+    set(GGML_SOURCES_LLAMAFILE llamafile/sgemm.cpp)
 endif()

 if (GGML_CUDA)
     cmake_minimum_required(VERSION 3.18)  # for CMAKE_CUDA_ARCHITECTURES

-    find_package(CUDAToolkit)
+    if (GGML_MUSA)
+        list(APPEND CMAKE_MODULE_PATH "/usr/local/musa/cmake/")
+        find_package(MUSAToolkit)
+        set(CUDAToolkit_FOUND ${MUSAToolkit_FOUND})
+    else()
+        find_package(CUDAToolkit)
+    endif()

     if (CUDAToolkit_FOUND)
         message(STATUS "CUDA found")
@@ -268,7 +293,11 @@ if (GGML_CUDA)
         endif()
         message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")

-        enable_language(CUDA)
+        if (GGML_MUSA)
+            set(CMAKE_CUDA_COMPILER ${MUSAToolkit_MCC_EXECUTABLE})
+        else()
+            enable_language(CUDA)
+        endif()

         file(GLOB   GGML_HEADERS_CUDA "ggml-cuda/*.cuh")
         list(APPEND GGML_HEADERS_CUDA "../include/ggml-cuda.h")
@@ -295,21 +324,15 @@ if (GGML_CUDA)

         list(APPEND GGML_CDEF_PUBLIC GGML_USE_CUDA)

-        # TODO: for now CUDA graphs should be used only with llama.cpp
-        # https://github.com/ggerganov/whisper.cpp/issues/2258
-        message(STATUS "CMAKE_PROJECT_NAME: ${CMAKE_PROJECT_NAME}")
-        if (CMAKE_PROJECT_NAME STREQUAL "llama.cpp")
-            add_compile_definitions(GGML_CUDA_USE_GRAPHS)
-            message(STATUS "GGML_CUDA_USE_GRAPHS enabled")
-        else()
-            message(STATUS "GGML_CUDA_USE_GRAPHS disabled")
-        endif()
-
         add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
         add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
         add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
         add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})

+        if (GGML_CUDA_GRAPHS)
+            add_compile_definitions(GGML_CUDA_USE_GRAPHS)
+        endif()
+
         if (GGML_CUDA_FORCE_DMMV)
             add_compile_definitions(GGML_CUDA_FORCE_DMMV)
         endif()
@@ -338,21 +361,40 @@ if (GGML_CUDA)
             add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
         endif()

+        if (GGML_MUSA)
+            set_source_files_properties(${GGML_SOURCES_CUDA} PROPERTIES LANGUAGE CXX)
+            foreach(SOURCE ${GGML_SOURCES_CUDA})
+                set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22")
+            endforeach()
+        endif()
+
         if (GGML_STATIC)
             if (WIN32)
                 # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
-                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
+                list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
             else ()
-                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+                if (GGML_MUSA)
+                    list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musart_static MUSA::mublas_static)
+                else()
+                    list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+                endif()
             endif()
         else()
-            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
+            if (GGML_MUSA)
+                list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musart MUSA::mublas)
+            else()
+                list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt)
+            endif()
         endif()

         if (GGML_CUDA_NO_VMM)
             # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
         else()
-            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
+            if (GGML_MUSA)
+                list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ...
+            else()
+                list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
+            endif()
         endif()
     else()
         message(WARNING "CUDA not found")
@@ -446,13 +488,17 @@ if (GGML_HIPBLAS)
        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
    endif()

+    if (GGML_CUDA_FORCE_CUBLAS)
+        add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
+    endif()
+
    if (GGML_CUDA_NO_PEER_COPY)
        add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
    endif()

    if (CXX_IS_HIPCC)
        set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
-       set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} hip::device)
+       list(APPEND GGML_EXTRA_LIBS_PRIVATE hip::device)
    else()
        set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
    endif()
@@ -461,27 +507,34 @@ if (GGML_HIPBLAS)
        message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
    endif()

-   set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas)
+   list(APPEND GGML_EXTRA_LIBS_PUBLIC hip::host roc::rocblas roc::hipblas)
 endif()

 if (GGML_SYCL)
-    if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$")
-        message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA")
+    if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$")
+        message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD")
     endif()

-    if ( NOT DEFINED ENV{ONEAPI_ROOT})
-        message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
+    check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
+
+    if (DEFINED ENV{ONEAPI_ROOT})
+        message(STATUS "Using oneAPI Release SYCL compiler (icpx).")
+    elseif(SUPPORTS_SYCL)
+        message(WARNING "Using open-source SYCL compiler (clang++). Didn't detect ENV {ONEAPI_ROOT}.
+        If you expected the oneAPI Release compiler, please install oneAPI & source it, like:
+        source /opt/intel/oneapi/setvars.sh")
+    else()
+        message(FATAL_ERROR, "C++ compiler lacks SYCL support.")
     endif()
-    #todo: AOT
-    find_package(IntelSYCL REQUIRED)
-    find_package(MKL       REQUIRED)

     message(STATUS "SYCL found")
+    #todo: AOT

     list(APPEND GGML_CDEF_PUBLIC GGML_USE_SYCL)

     if (GGML_SYCL_F16)
+        if (GGML_SYCL_TARGET STREQUAL "AMD")
+            message(WARNING "AMD target does not entirely support FP16 in the SYCL backend.")
+        endif()
         add_compile_definitions(GGML_SYCL_F16)
     endif()

@@ -489,12 +542,18 @@ if (GGML_SYCL)
         add_compile_definitions(GGML_SYCL_FORCE_MMQ)
     endif()

-    add_compile_options(-I./) #include DPCT
-
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")

     if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
+        add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
+    elseif (GGML_SYCL_TARGET STREQUAL "AMD")
+        # INFO: Allowed Sub_group_sizes are not consistent through all
+        # hip targets. For example, 64 is used for certain models, but the backend
+        # does not support it.
+        # Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32)
+        add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
+    else()
+        add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
     endif()

     file(GLOB   GGML_HEADERS_SYCL "ggml-sycl/*.hpp")
@@ -503,16 +562,35 @@ if (GGML_SYCL)
     file(GLOB   GGML_SOURCES_SYCL "ggml-sycl/*.cpp")
     list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")

-    if (WIN32)
-        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
-    else()
-        add_compile_options(-I/${SYCL_INCLUDE_DIR})
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
+    find_package(DNNL)
+    message("-- DNNL found:" ${DNNL_FOUND})
+
+    if (GGML_SYCL_TARGET STREQUAL "INTEL")
+        add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND})
+    else()
+        add_compile_definitions(GGML_SYCL_DNNL=0)
+    endif()
+
+    if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
+        list(APPEND GGML_EXTRA_LIBS_PRIVATE DNNL::dnnl)
+    endif()
+
+    if (WIN32)
+        find_package(IntelSYCL REQUIRED)
+        find_package(MKL       REQUIRED)
+        list(APPEND GGML_EXTRA_LIBS_PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
+    else()
         if (GGML_SYCL_TARGET STREQUAL "INTEL")
-            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
+            list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
         elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
-            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
+            list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl)
+        elseif (GGML_SYCL_TARGET STREQUAL "AMD")
+            if (GGML_SYCL_HIP_TARGET STREQUAL "")
+                message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_HIP_TARGET has not been set.")
+            endif()
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${GGML_SYCL_HIP_TARGET}")
+            list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl)
         endif()
     endif()
 endif()
@@ -523,7 +601,7 @@ if (GGML_RPC)
     list(APPEND GGML_CDEF_PUBLIC GGML_USE_RPC)

     if (WIN32)
-        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ws2_32)
+        list(APPEND GGML_EXTRA_LIBS_PRIVATE ws2_32)
     endif()

     set(GGML_HEADERS_RPC ../include/ggml-rpc.h)
@@ -531,14 +609,11 @@ if (GGML_RPC)
 endif()

 if (GGML_VULKAN)
-    find_package(Vulkan)
+    find_package(Vulkan COMPONENTS glslc REQUIRED)

     if (Vulkan_FOUND)
         message(STATUS "Vulkan found")

-        set(GGML_HEADERS_VULKAN ../include/ggml-vulkan.h)
-        set(GGML_SOURCES_VULKAN ggml-vulkan.cpp)
-
         list(APPEND GGML_CDEF_PUBLIC GGML_USE_VULKAN)

         # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
@@ -559,6 +634,14 @@ if (GGML_VULKAN)
             add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
         endif()

+        if (GGML_VULKAN_SHADER_DEBUG_INFO)
+            add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO)
+        endif()
+
+        if (GGML_VULKAN_PERF)
+            add_compile_definitions(GGML_VULKAN_PERF)
+        endif()
+
         if (GGML_VULKAN_VALIDATE)
             add_compile_definitions(GGML_VULKAN_VALIDATE)
         endif()
@@ -567,7 +650,37 @@ if (GGML_VULKAN)
             add_compile_definitions(GGML_VULKAN_RUN_TESTS)
         endif()

-        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} Vulkan::Vulkan)
+        add_subdirectory(vulkan-shaders)
+
+        set (_ggml_vk_genshaders_cmd vulkan-shaders-gen)
+        set (_ggml_vk_header     ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp)
+        set (_ggml_vk_source     ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.cpp)
+        set (_ggml_vk_input_dir  ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders)
+        set (_ggml_vk_output_dir ${CMAKE_CURRENT_BINARY_DIR}/vulkan-shaders.spv)
+
+        file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp")
+
+        add_custom_command(
+            OUTPUT ${_ggml_vk_header}
+                   ${_ggml_vk_source}
+
+            COMMAND ${_ggml_vk_genshaders_cmd}
+                --glslc      ${Vulkan_GLSLC_EXECUTABLE}
+                --input-dir  ${_ggml_vk_input_dir}
+                --output-dir ${_ggml_vk_output_dir}
+                --target-hpp ${_ggml_vk_header}
+                --target-cpp ${_ggml_vk_source}
+                --no-clean
+
+            DEPENDS ${_ggml_vk_shader_deps}
+            COMMENT "Generate vulkan shaders"
+        )
+
+        set(GGML_HEADERS_VULKAN ${CMAKE_CURRENT_SOURCE_DIR}/../include/ggml-vulkan.h ${_ggml_vk_header})
+        set(GGML_SOURCES_VULKAN ggml-vulkan.cpp ${_ggml_vk_source})
+
+        list(APPEND GGML_EXTRA_LIBS_PRIVATE Vulkan::Vulkan)
+        list(APPEND GGML_EXTRA_INCLUDES     ${CMAKE_CURRENT_BINARY_DIR})
     else()
         message(WARNING "Vulkan not found")
     endif()
@@ -726,8 +839,8 @@ if (GGML_KOMPUTE)

         list(APPEND GGML_CDEF_PUBLIC GGML_USE_KOMPUTE)

-        set(GGML_EXTRA_LIBS     ${GGML_EXTRA_LIBS}     kompute)
-        set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CMAKE_CURRENT_BINARY_DIR})
+        list(APPEND GGML_EXTRA_LIBS_PRIVATE kompute)
+        list(APPEND GGML_EXTRA_INCLUDES     ${CMAKE_CURRENT_BINARY_DIR})
     else()
         message(WARNING "Kompute not found")
     endif()
@@ -743,6 +856,71 @@ if (GGML_CPU_HBM)
     target_link_libraries(ggml PUBLIC memkind)
 endif()

+if (GGML_CANN)
+    if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME})
+        set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME})
+        message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
+    endif()
+
+    if (CANN_INSTALL_DIR)
+        # Only Support Linux.
+        if (GGML_CANN)
+            if (NOT UNIX)
+                set(GGML_CANN OFF)
+                message(WARNING "CANN: CANN toolkit supports unix but not ${CMAKE_SYSTEM_NAME}. Turning off GGML_CANN")
+            endif()
+        endif()
+
+        # Supported platforms: x86-64, arm64
+        if (GGML_CANN)
+            if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
+            elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64")
+            else()
+                set(GGML_CANN OFF)
+                message(WARNING "CANN: CANN toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}. Turning off GGML_CANN")
+            endif()
+        endif()
+
+        # Set header and libs
+        if (GGML_CANN)
+            set(CANN_INCLUDE_DIRS
+                ${CANN_INSTALL_DIR}/include
+                ${CANN_INSTALL_DIR}/include/aclnn
+                ${CANN_INSTALL_DIR}/acllib/include
+            )
+
+            add_subdirectory(ggml-cann/kernels)
+            list(APPEND CANN_LIBRARIES
+                ascendcl
+                nnopbase
+                opapi
+                acl_op_compiler
+                ascendc_kernels
+            )
+
+            set(GGML_HEADERS_CANN "../include/ggml-cann.h")
+            file(GLOB GGML_SOURCES_CANN "ggml-cann/*.cpp")
+            list(APPEND GGML_SOURCES_CANN "ggml-cann.cpp")
+
+            message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
+            message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
+
+            list(APPEND GGML_EXTRA_LIBS_PRIVATE ${CANN_LIBRARIES})
+            list(APPEND GGML_EXTRA_INCLUDES     ${CANN_INCLUDE_DIRS})
+            list(APPEND GGML_EXTRA_LIBDIRS      ${CANN_INSTALL_DIR}/lib64)
+
+            list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN)
+        endif()
+    else()
+        set(GGML_CANN OFF)
+        message(WARNING "CANN: Can't find CANN_INSTALL_DIR, do you forget to source set_var.sh. Turning off GGML_CANN")
+    endif()
+
+    if (NOT GGML_CANN)
+        message(WARNING "CANN: GGML_CANN is turned OFF, see above for details.")
+    endif()
+endif()
+
 function(get_flags CCID CCVER)
     set(C_FLAGS "")
     set(CXX_FLAGS "")
@@ -761,8 +939,10 @@ function(get_flags CCID CCVER)
        set(C_FLAGS   -Wdouble-promotion)
        set(CXX_FLAGS -Wno-array-bounds)

-       if (CCVER VERSION_GREATER_EQUAL 7.1.0)
-           list(APPEND CXX_FLAGS -Wno-format-truncation)
+       if (NOT GGML_MUSA)
+           if (CCVER VERSION_GREATER_EQUAL 7.1.0)
+               list(APPEND CXX_FLAGS -Wno-format-truncation)
+           endif()
        endif()
        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
            list(APPEND CXX_FLAGS -Wextra-semi)
@@ -1021,6 +1201,7 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
        endif()
        if (GGML_AVX512)
            list(APPEND ARCH_FLAGS -mavx512f)
+           list(APPEND ARCH_FLAGS -mavx512dq)
            list(APPEND ARCH_FLAGS -mavx512bw)
        endif()
        if (GGML_AVX512_VBMI)
@@ -1094,7 +1275,7 @@ endif()

 # Data types, macros and functions related to controlling CPU affinity and
 # some memory allocation are available on Linux through GNU extensions in libc
-if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android")
     add_compile_definitions(_GNU_SOURCE)
 endif()

@@ -1144,7 +1325,7 @@ add_library(ggml
             ../include/ggml-backend.h
             ggml.c
             ggml-alloc.c
-            ggml-backend.c
+            ggml-backend.cpp
             ggml-quants.c
             ggml-quants.h
             ${GGML_SOURCES_CUDA}      ${GGML_HEADERS_CUDA}
@@ -1157,24 +1338,34 @@ add_library(ggml
             ${GGML_SOURCES_ROCM}      ${GGML_HEADERS_ROCM}
             ${GGML_SOURCES_BLAS}      ${GGML_HEADERS_BLAS}
             ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
+            ${GGML_SOURCES_CANN}      ${GGML_HEADERS_CANN}
+            ggml-aarch64.c ggml-aarch64.h
             )

 if (EMSCRIPTEN)
     set_target_properties(ggml PROPERTIES COMPILE_FLAGS "-msimd128")
 endif()

 target_compile_definitions(ggml PUBLIC  ${GGML_CDEF_PUBLIC})
 target_include_directories(ggml PUBLIC  ../include)
 target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES})
+target_link_directories   (ggml PRIVATE ${GGML_EXTRA_LIBDIRS})
 target_compile_features   (ggml PRIVATE c_std_11) # don't bump

-target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS})
+list(APPEND GGML_EXTRA_LIBS_PRIVATE Threads::Threads)

 find_library(MATH_LIBRARY m)
 if (MATH_LIBRARY)
-    target_link_libraries(ggml PRIVATE ${MATH_LIBRARY})
+    if (NOT WIN32 OR NOT GGML_SYCL)
+        list(APPEND GGML_EXTRA_LIBS_PRIVATE m)
+    endif()
 endif()

+list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PRIVATE)
+list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PUBLIC)
+target_link_libraries(ggml PRIVATE ${GGML_EXTRA_LIBS_PRIVATE} PUBLIC ${GGML_EXTRA_LIBS_PUBLIC})
+
 if (BUILD_SHARED_LIBS)
     set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
+    target_compile_definitions(ggml PRIVATE GGML_SHARED GGML_BUILD)
 endif()
3209  ggml/src/ggml-aarch64.c  (new file; diff suppressed because it is too large)
  39  ggml/src/ggml-aarch64.h  (new file)
@@ -0,0 +1,39 @@
+// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
+#pragma once
+
+#define GGML_COMMON_DECL_C
+#include "ggml-common.h"
+
+#include "ggml.h"
+
+// GGML internal header
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Quantization
+void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
+
+void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
+
+// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
+size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
+
+// GEMV
+void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+
+// GEMM
+void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
+
+#ifdef __cplusplus
+}
+#endif
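To show how these entry points are meant to be combined, a hedged sketch (the wrapper name, buffer sizing, and the NULL-imatrix fallback are assumptions, not code from the commit):

#include "ggml-aarch64.h"

// Hypothetical repack-quantization of a weight matrix into the interleaved
// q4_0 4x4 layout; the return value is the number of bytes written to dst.
static size_t repack_q4_0_4x4(const float * src, void * dst,
                              int64_t nrows, int64_t n_per_row) {
    // passing NULL for the importance matrix falls back to plain quantization
    return quantize_q4_0_4x4(src, dst, nrows, n_per_row, /*imatrix=*/NULL);
}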
@@ -91,8 +91,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
     if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
         fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
                 __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
-        GGML_ASSERT(!"not enough space in the buffer");
-        return;
+        GGML_ABORT("not enough space in the buffer");
     }

     void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
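This and the following alloc hunks swap the GGML_ASSERT(!"message") idiom for GGML_ABORT, which takes a printf-style message and never returns, so the dead return; and GGML_UNREACHABLE() lines after it can be dropped. An illustrative stand-in for such a macro (the real definition lives in ggml.h; this sketch is not it):

#include <stdio.h>
#include <stdlib.h>

// stand-in for an always-on abort-with-message macro
#define ABORT_WITH_MSG(...)           \
    do {                              \
        fprintf(stderr, __VA_ARGS__); \
        fputc('\n', stderr);          \
        abort();                      \
    } while (0)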
@@ -133,7 +132,7 @@ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset,
             return;
         }
     }
-    GGML_ASSERT(!"out of allocated_tensors");
+    GGML_ABORT("out of allocated_tensors");
 }
 static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
     for (int i = 0; i < 1024; i++) {
@@ -142,8 +141,7 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
             return;
         }
     }
-    fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
-    GGML_ASSERT(!"tensor not found");
+    GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
 }
 #endif

@@ -176,8 +174,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
             // this should never happen
             fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
                     __func__, size, max_avail);
-            GGML_ASSERT(!"not enough space in the buffer");
-            GGML_UNREACHABLE();
+            GGML_ABORT("not enough space in the buffer");
         }
     }

@@ -297,6 +294,12 @@ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
     alloc->free_blocks[0].offset = 0;
     alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
     alloc->max_size = 0;
+
+#ifdef GGML_ALLOCATOR_DEBUG
+    for (int i = 0; i < 1024; i++) {
+        alloc->allocated_tensors[i].tensor = NULL;
+    }
+#endif
 }

 static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
@@ -443,7 +446,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
         }
     }

-    free(galloc->hash_set.keys);
+    ggml_hash_set_free(&galloc->hash_set);
     free(galloc->hash_values);
     free(galloc->bufts);
     free(galloc->buffers);
@@ -456,7 +459,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
 typedef struct ggml_gallocr * ggml_gallocr_t;

 static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-    size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
+    size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
     return &galloc->hash_values[i];
 }

@@ -565,8 +568,8 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {

 static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
     // clear hash tables
-    memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
-    memset(galloc->hash_values,   0, galloc->hash_set.size * sizeof(struct hash_node));
+    ggml_hash_set_reset(&galloc->hash_set);
+    memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);

     // allocate leafs
     // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
@@ -671,21 +674,19 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
 }

 bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
-    size_t hash_size = graph->visited_hash_table.size;
+    size_t min_hash_size = graph->n_nodes + graph->n_leafs;
+    // add 25% margin to avoid hash collisions
+    min_hash_size += min_hash_size / 4;

     // initialize hash table
-    if (galloc->hash_set.size < hash_size) {
-        free(galloc->hash_set.keys);
-        free(galloc->hash_values);
-        galloc->hash_set.size = hash_size;
-        galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
-        galloc->hash_values   = calloc(hash_size, sizeof(struct hash_node));
+    if (galloc->hash_set.size < min_hash_size) {
+        ggml_hash_set_free(&galloc->hash_set);
+        galloc->hash_set = ggml_hash_set_new(min_hash_size);
         GGML_ASSERT(galloc->hash_set.keys != NULL);
+
+        free(galloc->hash_values);
+        galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
         GGML_ASSERT(galloc->hash_values != NULL);
-    } else {
-        // reset hash table
-        memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
-        memset(galloc->hash_values,   0, sizeof(struct hash_node)     * galloc->hash_set.size);
     }

     // reset allocators
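The new sizing rule decouples the allocator's hash table from the graph's visited-hash and adds 25% headroom against collisions; a tiny worked example with illustrative numbers:

#include <stddef.h>
#include <stdio.h>

int main(void) {
    // e.g. a graph with 2000 nodes and 400 leafs
    size_t min_hash_size = 2000 + 400;  // minimum entries needed
    min_hash_size += min_hash_size / 4; // +600 (25%) margin -> 3000 slots
    printf("hash slots reserved: %zu\n", min_hash_size);
    return 0;
}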
@@ -776,6 +777,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
                 fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
                 return false;
             }
+            ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
         }
     }

@@ -816,8 +818,7 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
 }

 static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
-    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
+    ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
+    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
     return talloc->size_max >= node_size;
 }
Some files were not shown because too many files have changed in this diff.