ggml : disable CUDA graphs for non-llama.cpp projects

2025-07-01 23:10:47 +02:00 · 2024-06-26 20:14:22 +03:00
259 changed files with 167023 additions and 52420 deletions
--- a/.devops/cublas.Dockerfile
+++ b/.devops/cublas.Dockerfile
@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
 ARG CUDA_DOCKER_ARCH=all
 RUN apt-get update && \
-    apt-get install -y build-essential git cmake libsdl2-dev
+    apt-get install -y build-essential git cmake
 WORKDIR /app
--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@ -17,7 +17,7 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 ENV GGML_CUDA=1
 RUN apt-get update && \
-    apt-get install -y build-essential libsdl2-dev \
+    apt-get install -y build-essential \
    && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
 # Ref: https://stackoverflow.com/a/53464012
--- a/.devops/main.Dockerfile
+++ b/.devops/main.Dockerfile
@ -12,7 +12,7 @@ FROM ubuntu:22.04 AS runtime
 WORKDIR /app
 RUN apt-get update && \
-  apt-get install -y curl ffmpeg libsdl2-dev \
+  apt-get install -y curl ffmpeg \
  && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
 COPY --from=build /app /app
--- a/.github/workflows/bindings-go.yml
+++ b/.github/workflows/bindings-go.yml
@ -13,10 +13,10 @@ jobs:
  ubuntu-latest:
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/setup-go@v5
+      - uses: actions/setup-go@v3
        with:
-          go-version: '^1.23'
+          go-version: '^1.19'
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v1
      - run: |
          cd bindings/go
          make test
--- a/.github/workflows/bindings-ruby.yml.disabled
+++ b/.github/workflows/bindings-ruby.yml.disabled
@ -1,4 +1,3 @@
 # TODO: fix this workflow file, disabled for now
 name: Bindings Tests (Ruby)
 on:
  push:
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -59,7 +59,7 @@ jobs:
        uses: cross-platform-actions/action@v0.24.0
        with:
          operating_system: freebsd
-          version: '13.3'
+          version: '13.2'
          run: |
            sudo pkg update
            sudo pkg install -y gmake sdl2
@ -586,75 +586,73 @@ jobs:
          cd whisper/examples/whisper.android
          ./gradlew assembleRelease --no-daemon -PGGML_HOME=$PATH_TO_GGML
-# TODO: disable because of following fail: https://github.com/ggerganov/whisper.cpp/actions/runs/11019444420/job/30627193602
+  android_java:
-#  android_java:
+    runs-on: ubuntu-latest
 #    runs-on: ubuntu-latest
 #
 #    steps:
 #      - name: Clone
 #        uses: actions/checkout@v4
 #
 #      - name: set up JDK 11
 #        uses: actions/setup-java@v4
 #        with:
 #          java-version: '11'
 #          distribution: 'temurin'
 #          cache: gradle
 #
 #      - name: Setup Android SDK
 #        uses: android-actions/setup-android@v3
 #        with:
 #          cmdline-tools-version: 9.0
 #
 #      - name: Build
 #        run: |
 #          cd examples/whisper.android.java
 #          chmod +x ./gradlew
 #          ./gradlew assembleRelease
-# TODO: disabled because of following fail: https://github.com/ggerganov/whisper.cpp/actions/runs/9686220096/job/26735899598
+    steps:
-#  java:
+      - name: Clone
-#    needs: [ 'windows' ]
+        uses: actions/checkout@v4
-#    runs-on: windows-latest
+
-#    steps:
+      - name: set up JDK 11
-#      - uses: actions/checkout@v4
+        uses: actions/setup-java@v4
-#
+        with:
-#      - name: Install Java
+          java-version: '11'
-#        uses: actions/setup-java@v4
+          distribution: 'temurin'
-#        with:
+          cache: gradle
-#          distribution: zulu
+
-#          java-version: 20
+      - name: Setup Android SDK
-#
+        uses: android-actions/setup-android@v3
-#      - name: Download Windows lib
+        with:
-#        uses: actions/download-artifact@v4
+          cmdline-tools-version: 9.0
-#        with:
+
-#          name: win32-x86-64_whisper.dll
+      - name: Build
-#          path: bindings/java/build/generated/resources/main/win32-x86-64
+        run: |
-#
+          cd examples/whisper.android.java
-#      - name: Build
+          chmod +x ./gradlew
-#        run: |
+          ./gradlew assembleRelease
-#          models\download-ggml-model.cmd tiny.en
+
-#          cd bindings/java
+  java:
-#          chmod +x ./gradlew
+    needs: [ 'windows' ]
-#          ./gradlew build
+    runs-on: windows-latest
-#
+    steps:
-#      - name: Upload jar
+      - uses: actions/checkout@v4
-#        uses: actions/upload-artifact@v4
+
-#        with:
+      - name: Install Java
-#          name: whispercpp.jar
+        uses: actions/setup-java@v4
-#          path: bindings/java/build/libs/whispercpp-*.jar
+        with:
-#
+          distribution: zulu
-#      - name: Publish package
+          java-version: 20
-#        if: ${{ github.ref == 'refs/heads/master' }}
+
-#        uses: gradle/gradle-build-action@v2.4.2
+      - name: Download Windows lib
-#        with:
+        uses: actions/download-artifact@v4
-#          arguments: publish
+        with:
-#          build-root-directory: bindings/java
+          name: win32-x86-64_whisper.dll
-#        env:
+          path: bindings/java/build/generated/resources/main/win32-x86-64
-#          MAVEN_USERNAME: ${{ secrets.JIRA_USER }}
+
-#          MAVEN_PASSWORD: ${{ secrets.JIRA_PASS }}
+      - name: Build
-#          PGP_SECRET: ${{ secrets.GPG_PRIVATE_KEY }}
+        run: |
-#          PGP_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
+          models\download-ggml-model.cmd tiny.en
          cd bindings/java
          chmod +x ./gradlew
          ./gradlew build
      - name: Upload jar
        uses: actions/upload-artifact@v4
        with:
          name: whispercpp.jar
          path: bindings/java/build/libs/whispercpp-*.jar
      - name: Publish package
        if: ${{ github.ref == 'refs/heads/master' }}
        uses: gradle/gradle-build-action@v2.4.2
        with:
          arguments: publish
          build-root-directory: bindings/java
        env:
          MAVEN_USERNAME: ${{ secrets.JIRA_USER }}
          MAVEN_PASSWORD: ${{ secrets.JIRA_PASS }}
          PGP_SECRET: ${{ secrets.GPG_PRIVATE_KEY }}
          PGP_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
  quantize:
    runs-on: ubuntu-latest
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@ -18,9 +18,7 @@ jobs:
      matrix:
        config:
          - { tag: "main", dockerfile: ".devops/main.Dockerfile", platform: "linux/amd64,linux/arm64" }
-          #TODO: the cuda image keeps failing - disable for now
+          - { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }
          #      https://github.com/ggerganov/whisper.cpp/actions/runs/11019444428/job/30602020339
          #- { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }
    steps:
      - name: Check out the repo
--- a/.gitignore
+++ b/.gitignore
@ -3,13 +3,11 @@
 .cache/
 .coreml/
 .test/
 .venv/
 .vs/
 .vscode/
 .DS_Store
 .vimspector.json
 /CMakeSettings.json
 /talk-llama.dSYM/
 build/
 build-*/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories.
 project("whisper.cpp" C CXX)
-project("whisper.cpp" VERSION 1.7.1)
+project("whisper.cpp" VERSION 1.6.2)
 include(CheckIncludeFileCXX)
 set(SOVERSION 1)
@ -120,10 +120,7 @@ whisper_option_depr(WARNING     WHISPER_SYCL_F16            GGML_SYCL_F16)
 # build the library
 #
-if (NOT TARGET ggml)
+add_subdirectory(ggml)
    add_subdirectory(ggml)
    # ... otherwise assume ggml is added by a parent CMakeLists.txt
 endif()
 add_subdirectory(src)
 #
@ -164,6 +161,18 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake
              ${CMAKE_CURRENT_BINARY_DIR}/whisper-version.cmake
        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/whisper)
 install(
    FILES convert-hf-to-gguf.py
    PERMISSIONS
        OWNER_READ
        OWNER_WRITE
        OWNER_EXECUTE
        GROUP_READ
        GROUP_EXECUTE
        WORLD_READ
        WORLD_EXECUTE
    DESTINATION ${CMAKE_INSTALL_BINDIR})
 configure_file(cmake/whisper.pc.in
        "${CMAKE_CURRENT_BINARY_DIR}/whisper.pc"
        @ONLY)
--- a/66
+++ b/66
@ -3,11 +3,12 @@ BUILD_TARGETS = \
 	main \
 	bench \
 	quantize \
-	server
+	server \
 	tests/test-c.o
 # Binaries only useful for tests
 TEST_TARGETS = \
-	tests/test-c.o
+	tests/test-backend-ops
 # Deprecation aliases
 ifdef WHISPER_CUBLAS
@ -140,8 +141,8 @@ else
 		command \
 		stream \
 		lsp \
 		talk \
 		talk-llama
 	# talk (TODO: disabled)
 endif
 default: $(BUILD_TARGETS)
@ -250,10 +251,7 @@ ifdef WHISPER_DEBUG
 		MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
 	endif
 else
-	MK_CPPFLAGS   += -DNDEBUG
+	MK_CPPFLAGS += -DNDEBUG
 	MK_CFLAGS     += -O3
 	MK_CXXFLAGS   += -O3
 	MK_NVCCFLAGS  += -O3
 endif
 ifdef WHISPER_SANITIZE_THREAD
@ -503,15 +501,16 @@ ifdef GGML_CUDA
 		CUDA_PATH ?= /usr/local/cuda
 	endif
-	#MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
+	MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
-	#MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcufft -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
+	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcufft -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
 	MK_CPPFLAGS  += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
 	MK_NVCCFLAGS += -use_fast_math
 	OBJ_GGML += ggml/src/ggml-cuda.o
 	OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
 	OBJ_GGML += $(OBJ_CUDA_TMPL)
 	OBJ_WHISPER += src/whisper-mel-cuda.o
 ifdef WHISPER_FATAL_WARNINGS
 	MK_NVCCFLAGS += -Werror all-warnings
 endif # WHISPER_FATAL_WARNINGS
@ -620,6 +619,10 @@ ggml/src/ggml-cuda.o: \
 	ggml/src/ggml-common.h \
 	$(wildcard ggml/src/ggml-cuda/*.cuh)
 	$(NVCC_COMPILE)
 src/whisper-mel-cuda.o: src/whisper-mel-cuda.cu src/whisper-mel-cuda.hpp
 	$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 endif # GGML_CUDA
 ifdef GGML_VULKAN
@ -777,8 +780,7 @@ OBJ_GGML += \
 	ggml/src/ggml.o \
 	ggml/src/ggml-alloc.o \
 	ggml/src/ggml-backend.o \
-	ggml/src/ggml-quants.o \
+	ggml/src/ggml-quants.o
 	ggml/src/ggml-aarch64.o
 OBJ_WHISPER += \
 	src/whisper.o
@ -897,10 +899,10 @@ ggml/src/ggml-alloc.o: \
 	$(CC)  $(CFLAGS)   -c $< -o $@
 ggml/src/ggml-backend.o: \
-	ggml/src/ggml-backend.cpp \
+	ggml/src/ggml-backend.c \
 	ggml/include/ggml.h \
 	ggml/include/ggml-backend.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+	$(CC)  $(CFLAGS)   -c $< -o $@
 ggml/src/ggml-quants.o: \
 	ggml/src/ggml-quants.c \
@ -909,13 +911,6 @@ ggml/src/ggml-quants.o: \
 	ggml/src/ggml-common.h
 	$(CC) $(CFLAGS)    -c $< -o $@
 ggml/src/ggml-aarch64.o: \
 	ggml/src/ggml-aarch64.c \
 	ggml/include/ggml.h \
 	ggml/src/ggml-aarch64.h \
 	ggml/src/ggml-common.h
 	$(CC) $(CFLAGS)    -c $< -o $@
 ggml/src/ggml-blas.o: \
 	ggml/src/ggml-blas.cpp \
 	ggml/include/ggml-blas.h
@ -948,6 +943,7 @@ $(LIB_GGML_S): \
 src/whisper.o: \
 	src/whisper.cpp \
 	src/whisper-mel.hpp \
 	include/whisper.h \
 	ggml/include/ggml.h \
 	ggml/include/ggml-alloc.h \
@ -962,8 +958,7 @@ $(LIB_WHISPER): \
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 $(LIB_WHISPER_S): \
-	$(OBJ_WHISPER) \
+	$(OBJ_WHISPER)
 	$(OBJ_GGML)
 	ar rcs $(LIB_WHISPER_S) $^
 # common
@ -1040,6 +1035,9 @@ main: examples/main/main.cpp \
 	$(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 	@echo
 	@echo '====  Run ./llama-cli -h for help.  ===='
 	@echo
 bench: examples/bench/bench.cpp \
 	$(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON)
@ -1071,14 +1069,12 @@ lsp: examples/lsp/lsp.cpp \
 	$(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LDFLAGS_SDL)
-# TODO: disabled until update
+talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp \
-#       https://github.com/ggerganov/whisper.cpp/issues/1818
+	$(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON) $(OBJ_SDL)
-#talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp \
+	$(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $(call GET_OBJ_FILE, $<)
-#	$(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON) $(OBJ_SDL)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LDFLAGS_SDL)
 #	$(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $(call GET_OBJ_FILE, $<)
 #	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LDFLAGS_SDL)
-talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/llama-vocab.cpp examples/talk-llama/llama-grammar.cpp examples/talk-llama/llama-sampling.cpp examples/talk-llama/unicode.cpp examples/talk-llama/unicode-data.cpp \
+talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/unicode.cpp examples/talk-llama/unicode-data.cpp \
 	$(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON) $(OBJ_SDL)
 	$(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LDFLAGS_SDL)
@ -1092,6 +1088,11 @@ tests: $(TEST_TARGETS)
 tests/test-c.o: tests/test-c.c include/whisper.h
 	$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
 tests/test-backend-ops: tests/test-backend-ops.cpp \
 	$(OBJ_GGML)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 #
 # Audio samples
 #
@ -1137,9 +1138,8 @@ samples:
 .PHONY: large-v1
 .PHONY: large-v2
 .PHONY: large-v3
 .PHONY: large-v3-turbo
-tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3 large-v3-turbo: main
+tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3: main
 	bash ./models/download-ggml-model.sh $@
 	@echo ""
 	@echo "==============================================="
--- a/Package.swift
+++ b/Package.swift
@ -32,9 +32,8 @@ let package = Package(
            sources: [
                "ggml/src/ggml.c",
                "src/whisper.cpp",
                "ggml/src/ggml-aarch64.c",
                "ggml/src/ggml-alloc.c",
-                "ggml/src/ggml-backend.cpp",
+                "ggml/src/ggml-backend.c",
                "ggml/src/ggml-quants.c",
                "ggml/src/ggml-metal.m"
            ],
--- a/README.md
+++ b/README.md
@ -7,7 +7,7 @@
 [![Conan Center](https://shields.io/conan/v/whisper-cpp)](https://conan.io/center/whisper-cpp)
 [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)
-Stable: [v1.7.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.7.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+Stable: [v1.6.2](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.6.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
@ -21,8 +21,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
 - Support for CPU-only inference
 - [Efficient GPU support for NVIDIA](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
 - [OpenVINO Support](https://github.com/ggerganov/whisper.cpp#openvino-support)
- [Ascend NPU Support](https://github.com/ggerganov/whisper.cpp#ascend-npu-support)
+- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
 - [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/include/whisper.h)
 Supported platforms:
@ -34,9 +33,9 @@ Supported platforms:
 - [x] [WebAssembly](examples/whisper.wasm)
 - [x] Windows ([MSVC](https://github.com/ggerganov/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggerganov/whisper.cpp/issues/168))
 - [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/discussions/166)
- [x] [Docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)
+- [x] [docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)
-The entire high-level implementation of the model is contained in [whisper.h](include/whisper.h) and [whisper.cpp](src/whisper.cpp).
+The entire high-level implementation of the model is contained in [whisper.h](whisper.h) and [whisper.cpp](whisper.cpp).
 The rest of the code is part of the [`ggml`](https://github.com/ggerganov/ggml) machine learning library.
 Having such a lightweight implementation of the model allows to easily integrate it in different platforms and applications.
@ -56,8 +55,8 @@ Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)
 ## Implementation details
- The core tensor operations are implemented in C ([ggml.h](ggml/include/ggml.h) / [ggml.c](ggml/src/ggml.c))
+- The core tensor operations are implemented in C ([ggml.h](ggml.h) / [ggml.c](ggml.c))
- The transformer model and the high-level C-style API are implemented in C++ ([whisper.h](include/whisper.h) / [whisper.cpp](src/whisper.cpp))
+- The transformer model and the high-level C-style API are implemented in C++ ([whisper.h](whisper.h) / [whisper.cpp](whisper.cpp))
 - Sample usage is demonstrated in [main.cpp](examples/main)
 - Sample real-time audio transcription from the microphone is demonstrated in [stream.cpp](examples/stream)
 - Various other examples are available in the [examples](examples) folder
@ -75,7 +74,7 @@ git clone https://github.com/ggerganov/whisper.cpp.git
 Then, download one of the Whisper [models](models/README.md) converted in [`ggml` format](#ggml-format). For example:
 ```bash
-sh ./models/download-ggml-model.sh base.en
+bash ./models/download-ggml-model.sh base.en
 ```
 Now build the [main](examples/main) example and transcribe an audio file like this:
@ -146,7 +145,7 @@ options:
  -ng,       --no-gpu            [false  ] disable GPU
-sh ./models/download-ggml-model.sh base.en
+bash ./models/download-ggml-model.sh base.en
 Downloading ggml model base.en ...
 ggml-base.en.bin               100%[========================>] 141.11M  6.34MB/s    in 24s
 Done! Model 'base.en' saved in 'models/ggml-base.en.bin'
@ -236,7 +235,6 @@ make medium
 make large-v1
 make large-v2
 make large-v3
 make large-v3-turbo
 ```
 ## Memory usage
@ -450,39 +448,6 @@ cmake -DWHISPER_MKL=ON ..
 WHISPER_MKL=1 make -j
 ```
 ## Ascend NPU support
 Ascend NPU provides inference acceleration via [`CANN`](https://www.hiascend.com/en/software/cann) and AI cores. 
 First, check if your Ascend NPU device is supported:
 **Verified devices**
 | Ascend NPU                    | Status  |
 |:-----------------------------:|:-------:|
 | Atlas 300T A2                 | Support |
 Then, make sure you have installed the [`CANN toolkit`](https://www.hiascend.com/en/software/cann/community). The latest version of CANN is recommended.
 Now build `whisper.cpp` with CANN support:
 ```
 mkdir build
 cd build
 cmake .. -D GGML_CANN=on
 make -j
 ```
 Run the inference examples as usual, for example:
 ```
 ./build/bin/main -f samples/jfk.wav -m models/ggml-base.en.bin -t 8
 ```
 *Notes:*
 - If you have trouble with your Ascend NPU device, please create an issue with the **[CANN]** prefix/tag.
 - If you run successfully with your Ascend NPU device, please help update the table `Verified devices`.
 ## Docker
 ### Prerequisites
@ -786,7 +751,7 @@ took to execute it. The results are summarized in the following Github issue:
 [Benchmark results](https://github.com/ggerganov/whisper.cpp/issues/89)
-Additionally a script to run whisper.cpp with different models and audio files is provided [bench.py](scripts/bench.py).
+Additionally a script to run whisper.cpp with different models and audio files is provided [bench.py](bench.py).
 You can run it with the following command, by default it will run against any standard model in the models folder.
@ -833,7 +798,6 @@ For more details, see the conversion script [models/convert-pt-to-ggml.py](model
  - [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
  - [AIWintermuteAI/whispercpp](https://github.com/AIWintermuteAI/whispercpp) (Updated fork of aarnphm/whispercpp)
  - [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
  - [abdeladim-s/pywhispercpp](https://github.com/abdeladim-s/pywhispercpp) (Pybind11)
 - [x] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
 - [x] Unity: [macoron/whisper.unity](https://github.com/Macoron/whisper.unity)
--- a/bindings/go/Makefile
+++ b/bindings/go/Makefile
@ -14,14 +14,9 @@ GGML_METAL_PATH_RESOURCES := $(abspath ../..)
 BUILD_DIR := build
 MODELS_DIR := models
 EXAMPLES_DIR := $(wildcard examples/*)
-INCLUDE_PATH := $(abspath ../../include):$(abspath ../../ggml/include)
+INCLUDE_PATH := $(abspath ../..)
 LIBRARY_PATH := $(abspath ../..)
 ifeq ($(GGML_CUDA),1)
 	LIBRARY_PATH := $(LIBRARY_PATH):$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib/
 	BUILD_FLAGS := -ldflags "-extldflags '-lcudart -lcuda -lcublas'"
 endif
 ifeq ($(UNAME_S),Darwin)
 	EXT_LDFLAGS := -framework Foundation -framework Metal -framework MetalKit
 endif
--- a/bindings/go/README.md
+++ b/bindings/go/README.md
@ -62,12 +62,6 @@ This will compile a static `libwhisper.a` in a `build` folder, download a model
 make examples
 ```
 To build using cuda support add `GGML_CUDA=1`:
 ```bash
 GGML_CUDA=1 make examples
 ```
 The examples are placed in the `build` directory. Once built, you can download all the models with the following command:
 ```bash
--- a/bindings/go/examples/go-model-download/main.go
+++ b/bindings/go/examples/go-model-download/main.go
@ -24,7 +24,7 @@ const (
 var (
 	// The models which will be downloaded, if no model is specified as an argument
-	modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large-v3", "large-v3-turbo"}
+	modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large-v3"}
 )
 var (
--- a/bindings/go/go.mod
+++ b/bindings/go/go.mod
@ -1,10 +1,10 @@
 module github.com/ggerganov/whisper.cpp/bindings/go
-go 1.23
+go 1.19
 require (
 	github.com/go-audio/wav v1.1.0
-	github.com/stretchr/testify v1.9.0
+	github.com/stretchr/testify v1.8.1
 )
 require (
--- a/bindings/go/go.sum
+++ b/bindings/go/go.sum
@ -1,3 +1,4 @@
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
@ -8,9 +9,15 @@ github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
 github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
-github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
-github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
+github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
 github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
 github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
 github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
 github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
 github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
--- a/bindings/go/params.go
+++ b/bindings/go/params.go
@ -119,28 +119,6 @@ func (p *Params) SetAudioCtx(n int) {
 	p.audio_ctx = C.int(n)
 }
 func (p *Params) SetMaxContext(n int) {
 	p.n_max_text_ctx = C.int(n)
 }
 func (p *Params) SetBeamSize(n int) {
 	p.beam_search.beam_size = C.int(n)
 }
 func (p *Params) SetEntropyThold(t float32) {
 	p.entropy_thold = C.float(t)
 }
 func (p *Params) SetTemperature(t float32) {
 	p.temperature = C.float(t)
 }
 // Sets the fallback temperature incrementation
 // Pass -1.0 to disable this feature
 func (p *Params) SetTemperatureFallback(t float32) {
 	p.temperature_inc = C.float(t)
 }
 // Set initial prompt
 func (p *Params) SetInitialPrompt(prompt string) {
 	p.initial_prompt = C.CString(prompt)
@ -171,10 +149,6 @@ func (p *Params) String() string {
 	str += fmt.Sprintf(" duration_ms=%d", p.duration_ms)
 	str += fmt.Sprintf(" audio_ctx=%d", p.audio_ctx)
 	str += fmt.Sprintf(" initial_prompt=%s", C.GoString(p.initial_prompt))
 	str += fmt.Sprintf(" entropy_thold=%f", p.entropy_thold)
 	str += fmt.Sprintf(" temperature=%f", p.temperature)
 	str += fmt.Sprintf(" temperature_inc=%f", p.temperature_inc)
 	str += fmt.Sprintf(" beam_size=%d", p.beam_search.beam_size)
 	if p.translate {
 		str += " translate"
 	}
--- a/bindings/go/pkg/whisper/context.go
+++ b/bindings/go/pkg/whisper/context.go
@ -125,32 +125,6 @@ func (context *context) SetAudioCtx(n uint) {
 	context.params.SetAudioCtx(int(n))
 }
 // Set maximum number of text context tokens to store
 func (context *context) SetMaxContext(n int) {
 	context.params.SetMaxContext(n)
 }
 // Set Beam Size
 func (context *context) SetBeamSize(n int) {
 	context.params.SetBeamSize(n)
 }
 // Set Entropy threshold
 func (context *context) SetEntropyThold(t float32) {
 	context.params.SetEntropyThold(t)
 }
 // Set Temperature
 func (context *context) SetTemperature(t float32) {
 	context.params.SetTemperature(t)
 }
 // Set the fallback temperature incrementation
 // Pass -1.0 to disable this feature
 func (context *context) SetTemperatureFallback(t float32) {
 	context.params.SetTemperatureFallback(t)
 }
 // Set initial prompt
 func (context *context) SetInitialPrompt(prompt string) {
 	context.params.SetInitialPrompt(prompt)
--- a/bindings/go/pkg/whisper/context_test.go
+++ b/bindings/go/pkg/whisper/context_test.go
@ -4,90 +4,52 @@ import (
 	"os"
 	"testing"
-	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
+	// Packages
-	"github.com/go-audio/wav"
+	whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
 	assert "github.com/stretchr/testify/assert"
 )
-func TestSetLanguage(t *testing.T) {
+const (
-	assert := assert.New(t)
+	ModelPath  = "../../models/ggml-tiny.bin"
 	SamplePath = "../../samples/jfk.wav"
 )
 func Test_Whisper_000(t *testing.T) {
 	assert := assert.New(t)
 	if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
 		t.Skip("Skipping test, model not found:", ModelPath)
 	}
 	if _, err := os.Stat(SamplePath); os.IsNotExist(err) {
 		t.Skip("Skipping test, sample not found:", SamplePath)
 	}
 	// Load model
 	model, err := whisper.New(ModelPath)
 	assert.NoError(err)
 	assert.NotNil(model)
 	assert.NoError(model.Close())
 	t.Log("languages=", model.Languages())
 }
 func Test_Whisper_001(t *testing.T) {
 	assert := assert.New(t)
 	if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
 		t.Skip("Skipping test, model not found:", ModelPath)
 	}
 	if _, err := os.Stat(SamplePath); os.IsNotExist(err) {
 		t.Skip("Skipping test, sample not found:", SamplePath)
 	}
 	// Load model
 	model, err := whisper.New(ModelPath)
 	assert.NoError(err)
 	assert.NotNil(model)
 	defer model.Close()
-	context, err := model.NewContext()
+	// Get context for decoding
 	ctx, err := model.NewContext()
 	assert.NoError(err)
 	assert.NotNil(ctx)
 	// This returns an error since
 	// the model 'models/ggml-small.en.bin'
 	// that is loaded is not multilingual
 	err = context.SetLanguage("en")
 	assert.Error(err)
 }
 func TestContextModelIsMultilingual(t *testing.T) {
 	assert := assert.New(t)
 	model, err := whisper.New(ModelPath)
 	assert.NoError(err)
 	assert.NotNil(model)
 	defer model.Close()
 	context, err := model.NewContext()
 	assert.NoError(err)
 	isMultilingual := context.IsMultilingual()
 	// This returns false since
 	// the model 'models/ggml-small.en.bin'
 	// that is loaded is not multilingual
 	assert.False(isMultilingual)
 }
 func TestLanguage(t *testing.T) {
 	assert := assert.New(t)
 	model, err := whisper.New(ModelPath)
 	assert.NoError(err)
 	assert.NotNil(model)
 	defer model.Close()
 	context, err := model.NewContext()
 	assert.NoError(err)
 	// This always returns en since
 	// the model 'models/ggml-small.en.bin'
 	// that is loaded is not multilingual
 	expectedLanguage := "en"
 	actualLanguage := context.Language()
 	assert.Equal(expectedLanguage, actualLanguage)
 }
 func TestProcess(t *testing.T) {
 	assert := assert.New(t)
 	fh, err := os.Open(SamplePath)
 	assert.NoError(err)
 	defer fh.Close()
 	// Decode the WAV file - load the full buffer
 	dec := wav.NewDecoder(fh)
 	buf, err := dec.FullPCMBuffer()
 	assert.NoError(err)
 	assert.Equal(uint16(1), dec.NumChans)
 	data := buf.AsFloat32Buffer().Data
 	model, err := whisper.New(ModelPath)
 	assert.NoError(err)
 	assert.NotNil(model)
 	defer model.Close()
 	context, err := model.NewContext()
 	assert.NoError(err)
 	err = context.Process(data, nil, nil)
 	assert.NoError(err)
 }
--- a/bindings/go/pkg/whisper/interface.go
+++ b/bindings/go/pkg/whisper/interface.go
@ -38,22 +38,17 @@ type Context interface {
 	IsMultilingual() bool     // Return true if the model is multilingual.
 	Language() string         // Get language
-	SetOffset(time.Duration)          // Set offset
+	SetOffset(time.Duration)        // Set offset
-	SetDuration(time.Duration)        // Set duration
+	SetDuration(time.Duration)      // Set duration
-	SetThreads(uint)                  // Set number of threads to use
+	SetThreads(uint)                // Set number of threads to use
-	SetSplitOnWord(bool)              // Set split on word flag
+	SetSplitOnWord(bool)            // Set split on word flag
-	SetTokenThreshold(float32)        // Set timestamp token probability threshold
+	SetTokenThreshold(float32)      // Set timestamp token probability threshold
-	SetTokenSumThreshold(float32)     // Set timestamp token sum probability threshold
+	SetTokenSumThreshold(float32)   // Set timestamp token sum probability threshold
-	SetMaxSegmentLength(uint)         // Set max segment length in characters
+	SetMaxSegmentLength(uint)       // Set max segment length in characters
-	SetTokenTimestamps(bool)          // Set token timestamps flag
+	SetTokenTimestamps(bool)        // Set token timestamps flag
-	SetMaxTokensPerSegment(uint)      // Set max tokens per segment (0 = no limit)
+	SetMaxTokensPerSegment(uint)    // Set max tokens per segment (0 = no limit)
-	SetAudioCtx(uint)                 // Set audio encoder context
+	SetAudioCtx(uint)               // Set audio encoder context
-	SetMaxContext(n int)              // Set maximum number of text context tokens to store
+	SetInitialPrompt(prompt string) // Set initial prompt
 	SetBeamSize(n int)                // Set Beam Size
 	SetEntropyThold(t float32)        // Set Entropy threshold
 	SetInitialPrompt(prompt string)   // Set initial prompt
 	SetTemperature(t float32)         // Set temperature
 	SetTemperatureFallback(t float32) // Set temperature incrementation
 	// Process mono audio data and return any errors.
 	// If defined, newly generated segments are passed to the
--- a/bindings/go/pkg/whisper/model_test.go
+++ b/bindings/go/pkg/whisper/model_test.go
@ -1,91 +0,0 @@
 package whisper_test
 import (
 	"testing"
 	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
 	assert "github.com/stretchr/testify/assert"
 )
 // TestNew checks that whisper.New returns a usable model for ModelPath and
 // returns an error (and no model) for an invalid path.
 func TestNew(t *testing.T) {
 	t.Run("valid model path", func(t *testing.T) {
 		// Bind the assertions to the subtest's own t so that failures are
 		// attributed to this subtest rather than to the parent test.
 		assert := assert.New(t)
 		model, err := whisper.New(ModelPath)
 		if !assert.NoError(err) || !assert.NotNil(model) {
 			// Loading failed: there is nothing to close, and deferring
 			// Close on a missing model must be avoided.
 			return
 		}
 		defer model.Close()
 	})
 	t.Run("invalid model path", func(t *testing.T) {
 		assert := assert.New(t)
 		invalidModelPath := "invalid-model-path.bin"
 		model, err := whisper.New(invalidModelPath)
 		assert.Error(err)
 		assert.Nil(model)
 	})
 }
 // TestClose checks that a successfully loaded model can be closed without error.
 func TestClose(t *testing.T) {
 	assert := assert.New(t)
 	model, err := whisper.New(ModelPath)
 	if !assert.NoError(err) || !assert.NotNil(model) {
 		// Stop here: the remaining steps need a loaded model.
 		return
 	}
 	err = model.Close()
 	assert.NoError(err)
 }
 // TestNewContext checks that a context can be created from a loaded model.
 func TestNewContext(t *testing.T) {
 	assert := assert.New(t)
 	model, err := whisper.New(ModelPath)
 	if !assert.NoError(err) || !assert.NotNil(model) {
 		// Stop here: NewContext and the deferred Close need a loaded model.
 		return
 	}
 	defer model.Close()
 	context, err := model.NewContext()
 	assert.NoError(err)
 	assert.NotNil(context)
 }
 // TestIsMultilingual checks the multilingual flag reported by the model at ModelPath.
 func TestIsMultilingual(t *testing.T) {
 	assert := assert.New(t)
 	model, err := whisper.New(ModelPath)
 	if !assert.NoError(err) || !assert.NotNil(model) {
 		// Stop here: querying a model that failed to load must be avoided.
 		return
 	}
 	defer model.Close()
 	isMultilingual := model.IsMultilingual()
 	// This returns false since
 	// the model 'models/ggml-small.en.bin'
 	// that is loaded is not multilingual
 	assert.False(isMultilingual)
 }
 // TestLanguages checks that model.Languages() returns exactly the expected
 // language codes, in the expected order.
 func TestLanguages(t *testing.T) {
 	assert := assert.New(t)
 	model, err := whisper.New(ModelPath)
 	if !assert.NoError(err) || !assert.NotNil(model) {
 		// Stop here: querying a model that failed to load must be avoided.
 		return
 	}
 	defer model.Close()
 	expectedLanguages := []string{
 		"en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl",
 		"ca", "nl", "ar", "sv", "it", "id", "hi", "fi", "vi", "he", "uk",
 		"el", "ms", "cs", "ro", "da", "hu", "ta", "no", "th", "ur", "hr",
 		"bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn",
 		"sr", "az", "sl", "kn", "et", "mk", "br", "eu", "is", "hy", "ne",
 		"mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si", "km", "sn",
 		"yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi",
 		"lo", "uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my",
 		"bo", "tl", "mg", "as", "tt", "haw", "ln", "ha", "ba", "jw", "su",
 	}
 	actualLanguages := model.Languages()
 	assert.Equal(expectedLanguages, actualLanguages)
 }
--- a/bindings/go/pkg/whisper/util_test.go
+++ b/bindings/go/pkg/whisper/util_test.go
@ -1,6 +0,0 @@
 package whisper_test
 // Fixture paths shared by the tests in this package.
 const (
 	// Model file that the tests pass to whisper.New.
 	ModelPath  = "../../models/ggml-small.en.bin"
 	// Sample audio file. NOTE(review): not referenced by the tests visible here — confirm usage.
 	SamplePath = "../../samples/jfk.wav"
 )
--- a/bindings/go/whisper.go
+++ b/bindings/go/whisper.go
@ -9,7 +9,7 @@ import (
 // CGO
 /*
-#cgo LDFLAGS: -lwhisper -lm -lstdc++ -fopenmp
+#cgo LDFLAGS: -lwhisper -lm -lstdc++
 #cgo darwin LDFLAGS: -framework Accelerate -framework Metal -framework Foundation -framework CoreGraphics
 #include <whisper.h>
 #include <stdlib.h>
--- a/bindings/javascript/package.json
+++ b/bindings/javascript/package.json
@ -1,6 +1,6 @@
 {
  "name": "whisper.cpp",
-  "version": "1.7.1",
+  "version": "1.6.2",
  "description": "Whisper speech recognition",
  "main": "whisper.js",
  "scripts": {
--- a/bindings/ruby/ext/extconf.rb
+++ b/bindings/ruby/ext/extconf.rb
@ -1,16 +1,15 @@
 require 'mkmf'
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.cpp')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.h')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper-mel.hpp')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.h')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.c')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-impl.h')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-aarch64.h')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-aarch64.c')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.h')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.c')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend-impl.h')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.h')} .")
-system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.cpp')} .")
+system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.c')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-common.h')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.h')} .")
 system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.c')} .")
--- a/cmake/DefaultTargetOptions.cmake
+++ b/cmake/DefaultTargetOptions.cmake
@ -13,5 +13,5 @@ set_target_properties(${TARGET}
    PROPERTIES
        EXPORT_COMPILE_COMMANDS ON
        RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
-        INSTALL_RPATH            "${CMAKE_INSTALL_PREFIX}/lib"
+        INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib"
 )
--- a/cmake/FindFFmpeg.cmake
+++ b/cmake/FindFFmpeg.cmake
@ -36,7 +36,7 @@ include(FindPackageHandleStandardArgs)
 # The default components were taken from a survey over other FindFFMPEG.cmake files
 if (NOT FFmpeg_FIND_COMPONENTS)
-  set(FFmpeg_FIND_COMPONENTS AVFORMAT AVCODEC AVUTIL SWRESAMPLE)
+  set(FFmpeg_FIND_COMPONENTS AVFORMAT AVCODEC AVUTIL SWRESAMPLE) 
 endif()
 #
@ -84,7 +84,7 @@ macro(find_component _component _pkgconfig _library _header)
  # CMake's default is to search first for shared libraries and then for static libraries.
  # Todo later: add option to prefer static libs over dynamic:
-  find_library(${_component}_LIBRARIES NAMES ${_library} lib${_library}.a
+  find_library(${_component}_LIBRARIES NAMES ${_library} lib${_library}.a  
      HINTS
      ${PC_${_component}_LIBDIR}
      ${PC_${_component}_LIBRARY_DIRS}
--- a/cmake/whisper-config.cmake.in
+++ b/cmake/whisper-config.cmake.in
@ -1,7 +1,7 @@
-set(WHISPER_VERSION      @WHISPER_INSTALL_VERSION@)
+set(LLAMA_VERSION      @LLAMA_INSTALL_VERSION@)
-set(WHISPER_BUILD_COMMIT @WHISPER_BUILD_COMMIT@)
+set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
-set(WHISPER_BUILD_NUMBER @WHISPER_BUILD_NUMBER@)
+set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
-set(WHISPER_SHARED_LIB   @BUILD_SHARED_LIBS@)
+set(LLAMA_SHARED_LIB   @BUILD_SHARED_LIBS@)
 set(GGML_BLAS       @GGML_BLAS@)
 set(GGML_CUDA       @GGML_CUDA@)
@ -11,9 +11,9 @@ set(GGML_ACCELERATE @GGML_ACCELERATE@)
@PACKAGE_INIT@
-set_and_check(WHISPER_INCLUDE_DIR "@PACKAGE_WHISPER_INCLUDE_INSTALL_DIR@")
+set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
-set_and_check(WHISPER_LIB_DIR     "@PACKAGE_WHISPER_LIB_INSTALL_DIR@")
+set_and_check(LLAMA_LIB_DIR     "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
-set_and_check(WHISPER_BIN_DIR     "@PACKAGE_WHISPER_BIN_INSTALL_DIR@")
+set_and_check(LLAMA_BIN_DIR     "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
 # Ensure transient dependencies satisfied
@ -43,23 +43,23 @@ if (GGML_HIPBLAS)
    find_package(rocblas REQUIRED)
 endif()
-find_library(whisper_LIBRARY whisper
+find_library(llama_LIBRARY llama
    REQUIRED
-    HINTS ${WHISPER_LIB_DIR})
+    HINTS ${LLAMA_LIB_DIR})
-set(_whisper_link_deps "Threads::Threads" "@WHISPER_EXTRA_LIBS@")
+set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
-set(_whisper_transient_defines "@WHISPER_TRANSIENT_DEFINES@")
+set(_llama_transient_defines "@LLAMA_TRANSIENT_DEFINES@")
-add_library(whisper UNKNOWN IMPORTED)
+add_library(llama UNKNOWN IMPORTED)
-set_target_properties(whisper
+set_target_properties(llama
    PROPERTIES
-    INTERFACE_INCLUDE_DIRECTORIES "${WHISPER_INCLUDE_DIR}"
+        INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
-        INTERFACE_LINK_LIBRARIES "${_whisper_link_deps}"
+        INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
-        INTERFACE_COMPILE_DEFINITIONS "${_whisper_transient_defines}"
+        INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
        IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
-        IMPORTED_LOCATION "${whisper_LIBRARY}"
+        IMPORTED_LOCATION "${llama_LIBRARY}"
        INTERFACE_COMPILE_FEATURES cxx_std_11
        POSITION_INDEPENDENT_CODE ON )
-check_required_components(whisper)
+check_required_components(Llama)
--- a/cmake/whisper.pc.in
+++ b/cmake/whisper.pc.in
@ -1,6 +1,6 @@
 prefix=@CMAKE_INSTALL_PREFIX@
 exec_prefix=${prefix}
-libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+libdir=${exec_prefix}/lib
 includedir=${prefix}/include
 Name: whisper
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -40,7 +40,7 @@ if (WHISPER_FFMPEG)
    message(STATUS "Found ffmpeg libs:       ${FFMPEG_LIBRARIES}")
    message(STATUS "Found ffmpeg headers in: ${FFMPEG_INCLUDE_DIRS}")
    message(STATUS "ffmpeg definitions:      ${FFMPEG_DEFINITIONS}")
-    message(STATUS "Found avformat           ${AVFORMAT_VERSION}")
+    message(STATUS "Found avformat ${AVFORMAT_VERSION}")
    include_directories(${FFMPEG_INCLUDE_DIRS})
    add_compile_definitions(WHISPER_FFMPEG)
@ -102,8 +102,8 @@ if (EMSCRIPTEN)
    set_target_properties(libstream PROPERTIES FOLDER "libs")
    add_subdirectory(command.wasm)
    set_target_properties(libcommand PROPERTIES FOLDER "libs")
-    #add_subdirectory(talk.wasm)
+    add_subdirectory(talk.wasm)
-    #set_target_properties(libtalk PROPERTIES FOLDER "libs")
+    set_target_properties(libtalk PROPERTIES FOLDER "libs")
    add_subdirectory(bench.wasm)
    set_target_properties(libbench PROPERTIES FOLDER "libs")
 elseif(CMAKE_JS_VERSION)
@ -127,10 +127,8 @@ endif (WHISPER_SDL2)
    add_subdirectory(quantize)
    set_target_properties(quantize PROPERTIES FOLDER "examples")
 if (WHISPER_SDL2)
-    # TODO: disabled until update
+    add_subdirectory(talk)
-    #       https://github.com/ggerganov/whisper.cpp/issues/1818
+    set_target_properties(talk PROPERTIES FOLDER "examples")
    #add_subdirectory(talk)
    #set_target_properties(talk PROPERTIES FOLDER "examples")
    add_subdirectory(talk-llama)
    set_target_properties(talk-llama PROPERTIES FOLDER "examples")
    add_subdirectory(lsp)
--- a/examples/common-ggml.cpp
+++ b/examples/common-ggml.cpp
@ -72,9 +72,6 @@ bool ggml_common_quantize_0(
        case GGML_FTYPE_MOSTLY_IQ4_XS:
        case GGML_FTYPE_MOSTLY_IQ1_M:
        case GGML_FTYPE_MOSTLY_BF16:
        case GGML_FTYPE_MOSTLY_Q4_0_4_4:
        case GGML_FTYPE_MOSTLY_Q4_0_4_8:
        case GGML_FTYPE_MOSTLY_Q4_0_8_8:
                {
                    fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
                    return false;
@ -212,11 +209,6 @@ bool ggml_common_quantize_0(
                case GGML_TYPE_IQ4_XS:
                case GGML_TYPE_IQ1_M:
                case GGML_TYPE_BF16:
                case GGML_TYPE_Q4_0_4_4:
                case GGML_TYPE_Q4_0_4_8:
                case GGML_TYPE_Q4_0_8_8:
                case GGML_TYPE_TQ1_0:
                case GGML_TYPE_TQ2_0:
                case GGML_TYPE_COUNT:
                    {
                        fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -147,6 +147,7 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
        case 7: return "He";
        case 8: return "She";
        case 9: return "They";
        default: return "To";
    }
    return "The";
--- a/examples/common.h
+++ b/examples/common.h
@ -9,7 +9,6 @@
 #include <thread>
 #include <ctime>
 #include <fstream>
 #include <sstream>
 #define COMMON_SAMPLE_RATE 16000
@ -287,43 +286,12 @@ void sam_print_usage(int argc, char ** argv, const sam_params & params);
 // Terminal utils
 //
 #define SQR(X)    ((X) * (X))
 #define UNCUBE(x) x < 48 ? 0 : x < 115 ? 1 : (x - 35) / 40
-/**
+// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
- * Quantizes 24-bit RGB to xterm256 code range [16,256).
+// Lowest is red, middle is yellow, highest is green.
 */
 static int rgb2xterm256(int r, int g, int b) {
     // Per-channel levels available to the color candidate.
     unsigned char cube[] = {0, 0137, 0207, 0257, 0327, 0377};

     // Weighted average of the three channels; the +.5 rounds before the
     // implicit truncation to int.
     const int av = r * .299 + g * .587 + b * .114 + .5;

     // Gray candidate: index il (0..23) with the same level ql on all channels.
     const int il = av > 238 ? 23 : (av - 3) / 10;
     const int ql = il * 10 + 8;

     // Color candidate: one cube index and level per channel.
     const int ir = UNCUBE(r);
     const int ig = UNCUBE(g);
     const int ib = UNCUBE(b);
     const int qr = cube[ir];
     const int qg = cube[ig];
     const int qb = cube[ib];

     // Pick whichever candidate is closer (squared distance); ties go to the
     // color candidate.
     const int dist_color = SQR(qr - r) + SQR(qg - g) + SQR(qb - b);
     const int dist_gray  = SQR(ql - r) + SQR(ql - g) + SQR(ql - b);
     if (dist_color <= dist_gray) {
         return ir * 36 + ig * 6 + ib + 020;
     }
     return il + 0350;
 }
 static std::string set_xterm256_foreground(int r, int g, int b) {
     // Build the escape sequence "\033[38;5;<code>m", where <code> is the
     // value returned by rgb2xterm256 for the requested color.
     std::ostringstream seq;
     seq << "\033[38;5;";
     seq << rgb2xterm256(r, g, b);
     seq << "m";
     return seq.str();
 }
 // Lowest is red, middle is yellow, highest is green. Color scheme from
 // Paul Tol; it is colorblind friendly https://personal.sron.nl/~pault/
 const std::vector<std::string> k_colors = {
-    set_xterm256_foreground(220,   5,  12),
+    "\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
-    set_xterm256_foreground(232,  96,  28),
+    "\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
    set_xterm256_foreground(241, 147,  45),
    set_xterm256_foreground(246, 193,  65),
    set_xterm256_foreground(247, 240,  86),
    set_xterm256_foreground(144, 201, 135),
    set_xterm256_foreground( 78, 178, 101),
 };
 //
--- a/examples/dr_wav.h
+++ b/examples/dr_wav.h
--- a/examples/ffmpeg-transcode.cpp
+++ b/examples/ffmpeg-transcode.cpp
@ -321,7 +321,7 @@ int ffmpeg_decode_audio(const std::string &ifname, std::vector<uint8_t>& owav_da
        LOG("Couldn't map input file %s\n", ifname.c_str());
        return err;
    }
-    LOG("Mapped input file: %s size: %d\n", ibuf, (int) ibuf_size);
+    LOG("Mapped input file: %x size: %d\n", ibuf, ibuf_size);
    struct audio_buffer inaudio_buf;
    inaudio_buf.ptr = ibuf;
    inaudio_buf.size = ibuf_size;
--- a/examples/livestream.sh
+++ b/examples/livestream.sh
@ -48,7 +48,7 @@ if [ -n "$3" ]; then
 fi
 # Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" "large-v3-turbo" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )
 # list available models
 function list_models {
--- a/examples/python/whisper_processor.py
+++ b/examples/python/whisper_processor.py
@ -21,7 +21,7 @@ def process_audio(wav_file, model_name="base.en"):
    if not os.path.exists(wav_file):
        raise FileNotFoundError(f"WAV file not found: {wav_file}")
-    full_command = f"./main -m {model} -f {wav_file} -nt"
+    full_command = f"./main -m {model} -f {wav_file} -np -nt"
    # Execute the command
    process = subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -34,7 +34,6 @@ struct server_params
    std::string hostname = "127.0.0.1";
    std::string public_path = "examples/server/public";
    std::string request_path = "";
    std::string inference_path = "/inference";
    int32_t port          = 8080;
    int32_t read_timeout  = 600;
@ -133,7 +132,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  --port PORT,                   [%-7d] Port number for the server\n", sparams.port);
    fprintf(stderr, "  --public PATH,                 [%-7s] Path to the public folder\n", sparams.public_path.c_str());
    fprintf(stderr, "  --request-path PATH,           [%-7s] Request path for all requests\n", sparams.request_path.c_str());
    fprintf(stderr, "  --inference-path PATH,         [%-7s] Inference path for all requests\n", sparams.inference_path.c_str());
    fprintf(stderr, "  --convert,                     [%-7s] Convert audio to WAV, requires ffmpeg on the server", sparams.ffmpeg_converter ? "true" : "false");
    fprintf(stderr, "\n");
 }
@ -184,7 +182,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
        else if (                  arg == "--host")            { sparams.hostname    = argv[++i]; }
        else if (                  arg == "--public")          { sparams.public_path = argv[++i]; }
        else if (                  arg == "--request-path")    { sparams.request_path = argv[++i]; }
        else if (                  arg == "--inference-path")  { sparams.inference_path = argv[++i]; }
        else if (                  arg == "--convert")         { sparams.ffmpeg_converter     = true; }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@ -219,7 +216,7 @@ void check_ffmpeg_availibility() {
 bool convert_to_wav(const std::string & temp_filename, std::string & error_resp) {
    std::ostringstream cmd_stream;
    std::string converted_filename_temp = temp_filename + "_temp.wav";
-    cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -y -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
+    cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
    std::string cmd = cmd_stream.str();
    int status = std::system(cmd.c_str());
@ -647,10 +644,10 @@ int main(int argc, char ** argv) {
        return false;
    });
-    svr.Options(sparams.request_path + sparams.inference_path, [&](const Request &, Response &){
+    svr.Options(sparams.request_path + "/inference", [&](const Request &, Response &){
    });
-    svr.Post(sparams.request_path + sparams.inference_path, [&](const Request &req, Response &res){
+    svr.Post(sparams.request_path + "/inference", [&](const Request &req, Response &res){
        // acquire whisper model mutex lock
        std::lock_guard<std::mutex> lock(whisper_mutex);
@ -677,8 +674,7 @@ int main(int argc, char ** argv) {
        if (sparams.ffmpeg_converter) {
            // if file is not wav, convert to wav
            // write to temporary file
-            const std::string temp_filename_base = std::tmpnam(nullptr);
+            const std::string temp_filename = "whisper_server_temp_file.wav";
            const std::string temp_filename = temp_filename_base + ".wav";
            std::ofstream temp_file{temp_filename, std::ios::binary};
            temp_file << audio_file.content;
            temp_file.close();
--- a/examples/talk-llama/CMakeLists.txt
+++ b/examples/talk-llama/CMakeLists.txt
@ -1,13 +1,7 @@
 if (WHISPER_SDL2)
    # talk-llama
    set(TARGET talk-llama)
-    add_executable(${TARGET} talk-llama.cpp
+    add_executable(${TARGET} talk-llama.cpp llama.cpp unicode.cpp unicode-data.cpp)
        llama.cpp
        llama-vocab.cpp
        llama-grammar.cpp
        llama-sampling.cpp
        unicode.cpp
        unicode-data.cpp)
    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
    if (WHISPER_CLBLAST)
--- a/examples/talk-llama/llama-grammar.cpp
+++ b/examples/talk-llama/llama-grammar.cpp
--- a/examples/talk-llama/llama-grammar.h
+++ b/examples/talk-llama/llama-grammar.h
@ -1,144 +0,0 @@
 #pragma once
 #include "llama-impl.h"
 #include <map>
 struct llama_vocab;
 // grammar element type
 // Each enumerator has an explicit numeric value (0..7); it is the tag stored in
 // llama_grammar_element::type.
 enum llama_gretype {
     // end of rule definition
     LLAMA_GRETYPE_END            = 0,
     // start of alternate definition for rule
     LLAMA_GRETYPE_ALT            = 1,
     // non-terminal element: reference to rule
     LLAMA_GRETYPE_RULE_REF       = 2,
     // terminal element: character (code point)
     LLAMA_GRETYPE_CHAR           = 3,
     // inverse char(s) ([^a], [^a-b] [^abc])
     LLAMA_GRETYPE_CHAR_NOT       = 4,
     // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
     // be an inclusive range ([a-z])
     LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
     // modifies a preceding LLAMA_GRETYPE_CHAR or
     // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
     LLAMA_GRETYPE_CHAR_ALT       = 6,
     // any character (.)
     LLAMA_GRETYPE_CHAR_ANY       = 7,
 };
 // One element of a grammar rule: a type tag plus a value whose meaning
 // depends on that tag (see the field comment below).
 typedef struct llama_grammar_element {
     enum llama_gretype type;
     uint32_t           value; // Unicode code point or rule ID
 } llama_grammar_element;
 // State of a UTF-8 sequence that has been only partially decoded so far.
 struct llama_partial_utf8 {
     uint32_t value;    // bit value so far (unshifted)
     int      n_remain; // num bytes remaining; -1 indicates invalid sequence
 };
 // A candidate that is checked against the grammar.
 // NOTE(review): field meanings below are inferred from names and types only —
 // confirm against the implementation.
 struct llama_grammar_candidate {
     size_t               index;        // presumably the candidate's position in the caller's array — TODO confirm
     const uint32_t     * code_points;  // non-owning pointer to code points; termination convention not visible here
     llama_partial_utf8   partial_utf8; // partial UTF-8 state associated with this candidate
 };
 using llama_grammar_rule  = std::vector<      llama_grammar_element>;
 using llama_grammar_stack = std::vector<const llama_grammar_element *>;
 using llama_grammar_rules      = std::vector<llama_grammar_rule>;
 using llama_grammar_stacks     = std::vector<llama_grammar_stack>;
 using llama_grammar_candidates = std::vector<llama_grammar_candidate>;
 const llama_grammar_rules  & llama_grammar_get_rules (const struct llama_grammar * grammar);
      llama_grammar_stacks & llama_grammar_get_stacks(      struct llama_grammar * grammar);
 // takes a set of possible pushdown stacks on a grammar, which are required to
 // be positioned at a character range (see `llama_grammar_advance_stack`), and
 // produces the N possible stacks if the given char is accepted at those
 // positions
 void llama_grammar_accept(
        const llama_grammar_rules  & rules,
        const llama_grammar_stacks & stacks,
                          uint32_t   chr,
              llama_grammar_stacks & stacks_new);
 std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
        const llama_grammar_rules      & rules,
        const llama_grammar_stack      & stack,
        const llama_grammar_candidates & candidates);
 // Parser state for a textual grammar: a map from symbol name to numeric id
 // and the list of rules collected so far.
 // NOTE(review): only declarations are visible in this header, so the method
 // notes below are inferred from names and signatures — confirm against the
 // implementation.
 struct llama_grammar_parser {
     std::map<std::string, uint32_t> symbol_ids; // symbol name -> id
     llama_grammar_rules rules;                  // one entry per rule
     // Returns a vector of pointers to llama_grammar_element (presumably one per rule — TODO confirm).
     llama_grammar_stack c_rules() const;
     // Looks up an id for the name given by (src, len); presumably creates one if missing — TODO confirm.
     uint32_t get_symbol_id(const char * src, size_t len);
     // Produces an id for a new symbol derived from base_name.
     uint32_t generate_symbol_id(const std::string & base_name);
     // Stores `rule` under `rule_id`.
     void add_rule(uint32_t rule_id, const llama_grammar_rule & rule);
     // The parse_* helpers take a position in the source text and return a
     // const char * (presumably the position after the parsed part — TODO confirm).
     const char * parse_alternates(
             const char        * src,
             const std::string & rule_name,
             uint32_t            rule_id,
             bool                is_nested);
     const char * parse_sequence(
             const char         * src,
             const std::string  & rule_name,
             llama_grammar_rule & rule,
             bool               is_nested);
     const char * parse_rule(const char * src);
     // Parses the grammar text in `src`; the bool result presumably signals success — TODO confirm.
     bool parse(const char * src);
     // Writes a representation of the parsed grammar to `file`.
     void print(FILE * file);
 };
 // A grammar instance: the (const) rule set together with the mutable
 // matching state (stacks and pending partial UTF-8 sequence).
 struct llama_grammar {
     // note: allow null vocab for testing (not great)
     const llama_vocab * vocab;
     const llama_grammar_rules  rules;  // TODO: shared ptr
           llama_grammar_stacks stacks;
     // buffer for partially generated UTF-8 sequence from accepted tokens
     llama_partial_utf8 partial_utf8;
 };
 //
 // internal API
 //
 // note: needed for tests (not great)
 struct llama_grammar * llama_grammar_init_impl(
        const struct llama_vocab * vocab,
        const llama_grammar_element ** rules,
        size_t n_rules,
        size_t start_rule_index);
 struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root);
 void llama_grammar_free_impl(struct llama_grammar * grammar);
 struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar);
 // TODO: move the API below as member functions of llama_grammar
 void llama_grammar_apply_impl(
        const struct llama_grammar & grammar,
            llama_token_data_array * cur_p);
 void llama_grammar_accept_impl(
              struct llama_grammar & grammar,
                       llama_token   token);
--- a/examples/talk-llama/llama-impl.h
+++ b/examples/talk-llama/llama-impl.h
@ -1,181 +0,0 @@
 #pragma once
 #include "llama.h"
 #include <string>
 #include <vector>
 #include <stdexcept>
 #ifdef __GNUC__
 #ifdef __MINGW32__
 #define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
 #else
 #define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
 #endif
 #else
 #define LLAMA_ATTRIBUTE_FORMAT(...)
 #endif
 //
 // logging
 //
 LLAMA_ATTRIBUTE_FORMAT(2, 3)
 void llama_log_internal        (ggml_log_level level, const char * format, ...);
 void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
 #define LLAMA_LOG(...)       llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
 #define LLAMA_LOG_INFO(...)  llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
 #define LLAMA_LOG_WARN(...)  llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
 #define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
 #define LLAMA_LOG_CONT(...)  llama_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
 //
 // helpers
 //
 // Scope timer: on destruction, adds the time elapsed since construction (as
 // measured by ggml_time_us()) to the accumulator passed to the constructor.
 // With disable == true nothing is measured or accumulated.
 struct time_meas {
     time_meas(int64_t & t_acc, bool disable = false)
         : t_start_us(disable ? -1 : ggml_time_us()),
           t_acc(t_acc) {
     }

     ~time_meas() {
         // A negative start time marks a disabled timer.
         if (t_start_us < 0) {
             return;
         }
         t_acc += ggml_time_us() - t_start_us;
     }

     const int64_t t_start_us; // start timestamp, or -1 when disabled
     int64_t & t_acc;          // caller-owned accumulator
 };
 // Replaces every non-overlapping occurrence of `search` in `s` with `replace`,
 // scanning left to right in a single pass (text inserted by a replacement is
 // never rescanned). An empty `search` leaves `s` unchanged.
 static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
     if (search.empty()) {
         return;
     }

     std::string result;
     result.reserve(s.length());

     // `cursor` is the start of the part of `s` not yet copied into `result`.
     size_t cursor = 0;
     for (size_t hit = s.find(search, cursor); hit != std::string::npos; hit = s.find(search, cursor)) {
         result.append(s, cursor, hit - cursor); // text before the match
         result.append(replace);
         cursor = hit + search.length();
     }

     // Copy whatever follows the last match.
     result.append(s, cursor, std::string::npos);
     s = std::move(result);
 }
 const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
    struct llama_context * ctx
 );
 // the ring buffer works similarly to std::deque, but with a fixed capacity
 //
 // Once `capacity` elements are stored, push_back() overwrites the oldest one.
 // Storage is a std::vector<T> of `capacity` value-initialized slots, so T must
 // be default-constructible and copy-assignable.
 template<typename T>
 struct ring_buffer {
     // Creates an empty buffer with room for `cap` elements.
     ring_buffer(size_t cap) : capacity(cap), data(cap) {}

     // Returns the oldest element. Throws std::runtime_error if the buffer is empty.
     T & front() {
         if (sz == 0) {
             throw std::runtime_error("ring buffer is empty");
         }
         return data[first];
     }

     const T & front() const {
         if (sz == 0) {
             throw std::runtime_error("ring buffer is empty");
         }
         return data[first];
     }

     // Returns the most recently pushed element (the same element as rat(0)).
     // Throws std::runtime_error if the buffer is empty.
     T & back() {
         if (sz == 0) {
             throw std::runtime_error("ring buffer is empty");
         }
         // `pos` is the slot the NEXT push_back() will write to, so the newest
         // element lives one slot before it (with wrap-around). sz > 0 implies
         // capacity > 0, so the modulo is well defined.
         return data[(pos + capacity - 1) % capacity];
     }

     const T & back() const {
         if (sz == 0) {
             throw std::runtime_error("ring buffer is empty");
         }
         return data[(pos + capacity - 1) % capacity];
     }

     // Appends `value`; when the buffer is full the oldest element is dropped.
     // Throws std::runtime_error if the buffer was created with capacity 0.
     void push_back(const T & value) {
         if (capacity == 0) {
             throw std::runtime_error("ring buffer: capacity is zero");
         }

         if (sz == capacity) {
             // advance the start when buffer is full
             first = (first + 1) % capacity;
         } else {
             sz++;
         }
         data[pos] = value;
         pos = (pos + 1) % capacity;
     }

     // Removes the oldest element and returns a copy of it.
     // Throws std::runtime_error if the buffer is empty.
     T pop_front() {
         if (sz == 0) {
             throw std::runtime_error("ring buffer is empty");
         }
         T value = data[first];
         first = (first + 1) % capacity;
         sz--;
         return value;
     }

     //T & operator[](size_t i) {
     //    if (i >= sz) {
     //        throw std::runtime_error("ring buffer: index out of bounds");
     //    }
     //    return data[(first + i) % capacity];
     //}

     //const T & at(size_t i) const {
     //    if (i >= sz) {
     //        throw std::runtime_error("ring buffer: index out of bounds");
     //    }
     //    return data[(first + i) % capacity];
     //}

     // Reverse access: rat(0) is the newest element, rat(size() - 1) the oldest.
     // Throws std::runtime_error if i >= size().
     const T & rat(size_t i) const {
         if (i >= sz) {
             throw std::runtime_error("ring buffer: index out of bounds");
         }
         return data[(first + sz - i - 1) % capacity];
     }

     // Returns a copy of the stored elements, ordered from oldest to newest.
     std::vector<T> to_vector() const {
         std::vector<T> result;
         result.reserve(sz);
         for (size_t i = 0; i < sz; i++) {
             result.push_back(data[(first + i) % capacity]);
         }
         return result;
     }

     void clear() {
         // here only reset the status of the buffer
         sz = 0;
         first = 0;
         pos = 0;
     }

     bool empty() const {
         return sz == 0;
     }

     size_t size() const {
         return sz;
     }

     size_t capacity = 0; // maximum number of stored elements
     size_t sz = 0;       // current number of stored elements
     size_t first = 0;    // index of the oldest element
     size_t pos = 0;      // index the next push_back() writes to
     std::vector<T> data;
 };
--- a/examples/talk-llama/llama-sampling.cpp
+++ b/examples/talk-llama/llama-sampling.cpp
--- a/examples/talk-llama/llama-sampling.h
+++ b/examples/talk-llama/llama-sampling.h
@ -1,29 +0,0 @@
 #pragma once
 // TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ?
 #include "llama-grammar.h"
 #include <unordered_map>
 struct llama_vocab;
 struct llama_grammar;
 // sampler chain
 // Groups the chain's parameters, the list of samplers in the chain, and
 // timing counters. The counters are `mutable`, so they can be updated through
 // a const reference to the chain.
 struct llama_sampler_chain {
     llama_sampler_chain_params params;
     std::vector<struct llama_sampler *> samplers; // raw pointers; ownership is not visible in this header
     // timing
     mutable int64_t t_sample_us; // presumably accumulated sampling time in microseconds — TODO confirm
     mutable int32_t n_sample;    // presumably the number of samples taken — TODO confirm
 };
 struct llama_sampler * llama_sampler_init_grammar_impl(
        const struct llama_vocab & vocab,
                      const char * grammar_str,
                      const char * grammar_root);
--- a/examples/talk-llama/llama-vocab.cpp
+++ b/examples/talk-llama/llama-vocab.cpp
--- a/examples/talk-llama/llama-vocab.h
+++ b/examples/talk-llama/llama-vocab.h
@ -1,146 +0,0 @@
 #pragma once
 #include "llama-impl.h"
 #include <string>
 #include <vector>
 #include <unordered_map>
 #include <map>
 #include <set>
 struct llm_tokenizer;
 // Vocabulary data: token <-> id tables, special token ids, tokenizer flags
 // and a pointer to the tokenizer object. Members carry in-class default
 // initializers; the destructor is declared here and defined elsewhere.
 struct llama_vocab {
     using id    = llama_token;
     using token = std::string;
     using tattr = llama_token_attr;
     // Per-token record stored in id_to_token.
     struct token_data {
         token text;
         float score;
         tattr attr;
     };
     uint32_t n_vocab = 0; // TODO: not great because has to keep in sync with hparams.n_vocab
     enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
     enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
     int max_token_len = 0; // used for optimizing longest token search
     // Lookup tables in both directions: text -> id and id -> token_data.
     std::unordered_map<token, id> token_to_id;
     std::vector<token_data>       id_to_token;
     std::vector<id>    cache_special_tokens;
     std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
     // Keyed by a pair of strings; queried through find_bpe_rank() below.
     std::map<std::pair<std::string, std::string>, int> bpe_ranks;
     // default LLaMA special tokens
     // (several ids below default to -1 — presumably meaning "not set"; TODO confirm)
     id special_bos_id  = 1;
     id special_eos_id  = 2;
     id special_unk_id  = 0;
     id special_sep_id  = -1;
     id special_pad_id  = -1;
     id special_cls_id  = -1;
     id special_mask_id = -1;
     id linefeed_id       = 13;
     id special_prefix_id = -1;
     id special_suffix_id = -1;
     id special_middle_id = -1;
     id special_eot_id    = -1; // TODO: move above after "eos_id", and here add "file separator" token
     id special_eom_id    = -1;
     // set of all tokens that cause "end of generation"
     std::set<id> special_eog_ids;
     // tokenizer flags
     bool tokenizer_add_space_prefix           = false;
     bool tokenizer_add_bos                    = false;
     bool tokenizer_add_eos                    = false;
     bool tokenizer_ignore_merges              = false;
     bool tokenizer_clean_spaces               = false;  // clean_up_tokenization_spaces
     bool tokenizer_remove_extra_whitespaces   = false;
     bool tokenizer_escape_whitespaces         = true;
     bool tokenizer_treat_whitespace_as_suffix = false;
     std::vector<char> precompiled_charsmap;
     // NOTE(review): presumably created by init_tokenizer() and released in ~llama_vocab() — confirm in the .cpp.
     llm_tokenizer * tokenizer = nullptr;
     llama_vocab() = default;
     ~llama_vocab();
     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
     void init_tokenizer();
 };
 //
 // internal API
 //
 // TODO: rename to llama_tokenize_impl
 // TODO: This should probably be in llama.h
 std::vector<llama_vocab::id> llama_tokenize_internal(
        const llama_vocab & vocab,
        std::string raw_text,
        bool add_special,
        bool parse_special = false);
 // TODO: move the API below as member functions of llama_vocab
 llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch);
 const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token);
 float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token);
 llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token);
 bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token);
 bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token);
 llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
 llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
 llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
 llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
 llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
 llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
 bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
 bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
 llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
 llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_eot_impl   (const struct llama_vocab & vocab);
 llama_token llama_token_eom_impl   (const struct llama_vocab & vocab);
 int32_t llama_tokenize_impl(
        const struct llama_vocab & vocab,
                      const char * text,
                         int32_t   text_len,
                     llama_token * tokens,
                         int32_t   n_tokens_max,
                            bool   add_special,
                            bool   parse_special);
 // does not write null-terminator to buf
 int32_t llama_token_to_piece_impl(
        const struct llama_vocab & vocab,
                     llama_token   token,
                            char * buf,
                         int32_t   length,
                         int32_t   lstrip,
                            bool   special);
 int32_t llama_detokenize_impl(
        const struct llama_vocab & vocab,
               const llama_token * tokens,
                         int32_t   n_tokens,
                            char * text,
                         int32_t   text_len_max,
                            bool   remove_special,
                            bool   unparse_special);
--- a/examples/talk-llama/llama.cpp
+++ b/examples/talk-llama/llama.cpp
--- a/examples/talk-llama/llama.h
+++ b/examples/talk-llama/llama.h
@ -33,18 +33,17 @@
 #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
-// TODO: use everywhere in the implementation
+#define LLAMA_MAX_RNG_STATE (64*1024)
 #define LLAMA_TOKEN_NULL -1
 #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
 #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
 #define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
-#define LLAMA_SESSION_VERSION 9
+#define LLAMA_SESSION_VERSION 6
 #define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
-#define LLAMA_STATE_SEQ_VERSION 2
+#define LLAMA_STATE_SEQ_VERSION 1
 #ifdef __cplusplus
 extern "C" {
@ -56,10 +55,8 @@ extern "C" {
    // TODO: show sample usage
    //
    // struct llama_vocab; // TODO: add in the future
    struct llama_model;
    struct llama_context;
    struct llama_sampler;
    typedef int32_t llama_pos;
    typedef int32_t llama_token;
@ -70,8 +67,6 @@ extern "C" {
        LLAMA_VOCAB_TYPE_SPM  = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
        LLAMA_VOCAB_TYPE_BPE  = 2, // GPT-2 tokenizer based on byte-level BPE
        LLAMA_VOCAB_TYPE_WPM  = 3, // BERT tokenizer based on WordPiece
        LLAMA_VOCAB_TYPE_UGM  = 4, // T5 tokenizer based on Unigram
        LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
    };
    // pre-tokenization types
@ -92,23 +87,15 @@ extern "C" {
        LLAMA_VOCAB_PRE_TYPE_DBRX           = 13,
        LLAMA_VOCAB_PRE_TYPE_SMAUG          = 14,
        LLAMA_VOCAB_PRE_TYPE_PORO           = 15,
        LLAMA_VOCAB_PRE_TYPE_CHATGLM3       = 16,
        LLAMA_VOCAB_PRE_TYPE_CHATGLM4       = 17,
        LLAMA_VOCAB_PRE_TYPE_VIKING         = 18,
        LLAMA_VOCAB_PRE_TYPE_JAIS           = 19,
        LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
        LLAMA_VOCAB_PRE_TYPE_SMOLLM         = 21,
        LLAMA_VOCAB_PRE_TYPE_CODESHELL      = 22,
        LLAMA_VOCAB_PRE_TYPE_BLOOM          = 23,
        LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH   = 24,
        LLAMA_VOCAB_PRE_TYPE_EXAONE         = 25,
        LLAMA_VOCAB_PRE_TYPE_CHAMELEON      = 26,
    };
    // note: these values should be synchronized with ggml_rope
    // TODO: maybe move this enum to ggml.h (ggml_rope_type)
    enum llama_rope_type {
        LLAMA_ROPE_TYPE_NONE = -1,
-        LLAMA_ROPE_TYPE_NORM = 0,
+        LLAMA_ROPE_TYPE_NORM =  0,
-        LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
+        LLAMA_ROPE_TYPE_NEOX =  2,
        LLAMA_ROPE_TYPE_GLM  =  4,
    };
    enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
@ -141,7 +128,7 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_F16           = 1,  // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_0          = 2,  // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_1          = 3,  // except 1d tensors
-        // LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,  // tok_embeddings.weight and output.weight are F16
+        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4,  // tok_embeddings.weight and output.weight are F16
        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5,  // support has been removed
        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6,  // support has been removed
        LLAMA_FTYPE_MOSTLY_Q8_0          = 7,  // except 1d tensors
@ -170,11 +157,6 @@ extern "C" {
        LLAMA_FTYPE_MOSTLY_IQ4_XS        = 30, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_IQ1_M         = 31, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_BF16          = 32, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_0_4_4      = 33, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_0_4_8      = 34, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_Q4_0_8_8      = 35, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_TQ1_0         = 36, // except 1d tensors
        LLAMA_FTYPE_MOSTLY_TQ2_0         = 37, // except 1d tensors
        LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
    };
@ -193,22 +175,14 @@ extern "C" {
        LLAMA_POOLING_TYPE_MEAN = 1,
        LLAMA_POOLING_TYPE_CLS  = 2,
        LLAMA_POOLING_TYPE_LAST = 3,
        LLAMA_POOLING_TYPE_RANK = 4, // used by reranking models to attach the classification head to the graph
    };
    enum llama_attention_type {
        LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1,
        LLAMA_ATTENTION_TYPE_CAUSAL      = 0,
        LLAMA_ATTENTION_TYPE_NON_CAUSAL  = 1,
    };
    enum llama_split_mode {
-        LLAMA_SPLIT_MODE_NONE  = 0, // single GPU
+        LLAMA_SPLIT_MODE_NONE    = 0, // single GPU
-        LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
+        LLAMA_SPLIT_MODE_LAYER   = 1, // split layers and KV across GPUs
-        LLAMA_SPLIT_MODE_ROW   = 2, // split rows across GPUs
+        LLAMA_SPLIT_MODE_ROW     = 2, // split rows across GPUs
    };
    // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
    typedef struct llama_token_data {
        llama_token id; // token id
        float logit;    // log-odds of the token
@ -216,10 +190,8 @@ extern "C" {
    } llama_token_data;
    typedef struct llama_token_data_array {
        // TODO: consider SoA
        llama_token_data * data;
        size_t size;
        int64_t selected; // this is the index in the data array (i.e. not the token id)
        bool sorted;
    } llama_token_data_array;
@ -280,9 +252,9 @@ extern "C" {
        enum llama_split_mode split_mode; // how to split the model across multiple GPUs
        // main_gpu interpretation depends on split_mode:
-        // LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model
+        // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
-        // LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results
+        // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
-        // LLAMA_SPLIT_MODE_LAYER: ignored
+        // LLAMA_SPLIT_LAYER: ignored
        int32_t main_gpu;
        // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
@ -312,16 +284,16 @@ extern "C" {
    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
    //       https://github.com/ggerganov/llama.cpp/pull/7544
    struct llama_context_params {
        uint32_t seed;              // RNG seed, -1 for random
        uint32_t n_ctx;             // text context, 0 = from model
        uint32_t n_batch;           // logical maximum batch size that can be submitted to llama_decode
        uint32_t n_ubatch;          // physical maximum batch size
        uint32_t n_seq_max;         // max number of sequences (i.e. distinct states for recurrent models)
-        int32_t  n_threads;         // number of threads to use for generation
+        uint32_t n_threads;         // number of threads to use for generation
-        int32_t  n_threads_batch;   // number of threads to use for batch processing
+        uint32_t n_threads_batch;   // number of threads to use for batch processing
        enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
        enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
        enum llama_attention_type    attention_type;    // attention type to use for embeddings
        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
        float    rope_freq_base;   // RoPE base frequency, 0 = from model
@ -339,13 +311,11 @@ extern "C" {
        enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
        enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
-        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
+        // Keep the booleans together to avoid misalignment during copy-by-value.
        // TODO: move at the end of the struct
        bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
        bool embeddings;  // if true, extract embeddings (together with logits)
        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
        bool no_perf;     // whether to measure performance timings
        // Abort callback
        // if it returns true, execution of llama_decode() will be aborted
@ -359,7 +329,7 @@ extern "C" {
        int32_t nthread;                     // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
        enum llama_ftype ftype;              // quantize to this llama_ftype
        enum ggml_type output_tensor_type;   // output tensor type
-        enum ggml_type token_embedding_type; // token embeddings tensor type
+        enum ggml_type token_embedding_type; // itoken embeddings tensor type
        bool allow_requantize;               // allow quantizing non-f32/f16 tensors
        bool quantize_output_tensor;         // quantize output.weight
        bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
@ -369,14 +339,56 @@ extern "C" {
        void * kv_overrides;                 // pointer to vector containing overrides
    } llama_model_quantize_params;
-    typedef struct llama_logit_bias {
+    // grammar types
-        llama_token token;
+    struct llama_grammar;
        float bias;
    } llama_logit_bias;
-    typedef struct llama_sampler_chain_params {
+    // grammar element type
-        bool no_perf; // whether to measure performance timings
+    enum llama_gretype {
-    } llama_sampler_chain_params;
+        // end of rule definition
        LLAMA_GRETYPE_END            = 0,
        // start of alternate definition for rule
        LLAMA_GRETYPE_ALT            = 1,
        // non-terminal element: reference to rule
        LLAMA_GRETYPE_RULE_REF       = 2,
        // terminal element: character (code point)
        LLAMA_GRETYPE_CHAR           = 3,
        // inverse char(s) ([^a], [^a-b] [^abc])
        LLAMA_GRETYPE_CHAR_NOT       = 4,
        // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
        // be an inclusive range ([a-z])
        LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
        // modifies a preceding LLAMA_GRETYPE_CHAR or
        // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
        LLAMA_GRETYPE_CHAR_ALT       = 6,
        // any character (.)
        LLAMA_GRETYPE_CHAR_ANY       = 7,
    };
    typedef struct llama_grammar_element {
        enum llama_gretype type;
        uint32_t           value; // Unicode code point or rule ID
    } llama_grammar_element;
    // performance timing information
    struct llama_timings {
        double t_start_ms;
        double t_end_ms;
        double t_load_ms;
        double t_sample_ms;
        double t_p_eval_ms;
        double t_eval_ms;
        int32_t n_sample;
        int32_t n_p_eval;
        int32_t n_eval;
    };
    // used in chat template
    typedef struct llama_chat_message {
@ -384,14 +396,9 @@ extern "C" {
        const char * content;
    } llama_chat_message;
    // lora adapter
    struct llama_lora_adapter;
    // Helpers for getting default parameters
-    // TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
+    LLAMA_API struct llama_model_params llama_model_default_params(void);
-    LLAMA_API struct llama_model_params          llama_model_default_params(void);
+    LLAMA_API struct llama_context_params llama_context_default_params(void);
    LLAMA_API struct llama_context_params        llama_context_default_params(void);
    LLAMA_API struct llama_sampler_chain_params  llama_sampler_chain_default_params(void);
    LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
    // Initialize the llama + ggml backend
@ -402,23 +409,15 @@ extern "C" {
    //optional:
    LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
    // Optional: an auto threadpool gets created in ggml if not passed explicitly
    LLAMA_API void llama_attach_threadpool(
               struct   llama_context * ctx,
            ggml_threadpool_t   threadpool,
            ggml_threadpool_t   threadpool_batch);
    LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
    // Call once at the end of the program - currently only used for MPI
    LLAMA_API void llama_backend_free(void);
    LLAMA_API struct llama_model * llama_load_model_from_file(
                             const char * path_model,
-              struct llama_model_params   params);
+            struct llama_model_params     params);
    LLAMA_API void llama_free_model(struct llama_model * model);
    // TODO: rename to llama_init_from_model
    LLAMA_API struct llama_context * llama_new_context_with_model(
                     struct llama_model * model,
            struct llama_context_params   params);
@ -434,22 +433,22 @@ extern "C" {
    LLAMA_API bool llama_supports_mlock      (void);
    LLAMA_API bool llama_supports_gpu_offload(void);
    LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
    LLAMA_API uint32_t llama_n_ctx      (const struct llama_context * ctx);
    LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
    LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
    LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);
    LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
    LLAMA_API enum llama_vocab_type   llama_vocab_type  (const struct llama_model * model);
    LLAMA_API enum llama_rope_type    llama_rope_type   (const struct llama_model * model);
    LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
    LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
    LLAMA_API int32_t llama_n_embd     (const struct llama_model * model);
    LLAMA_API int32_t llama_n_layer    (const struct llama_model * model);
    LLAMA_API int32_t llama_n_head     (const struct llama_model * model);
    LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
    LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
    LLAMA_API enum llama_vocab_type   llama_vocab_type  (const struct llama_model * model);
    LLAMA_API enum llama_rope_type    llama_rope_type   (const struct llama_model * model);
    // Get the model's RoPE frequency scaling factor
    LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
@ -483,51 +482,24 @@ extern "C" {
    // Get a llama model tensor
    LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
    // Returns true if the model contains an encoder that requires llama_encode() call
    LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
    // Returns true if the model contains a decoder that requires llama_decode() call
    LLAMA_API bool llama_model_has_decoder(const struct llama_model * model);
    // For encoder-decoder models, this function returns id of the token that must be provided
    // to the decoder to start generating output sequence. For other models, it returns -1.
    LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model);
    // Returns true if the model is recurrent (like Mamba, RWKV, etc.)
    LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);
    // Returns 0 on success
    LLAMA_API uint32_t llama_model_quantize(
            const char * fname_inp,
            const char * fname_out,
            const llama_model_quantize_params * params);
-    // Load a LoRA adapter from file
+    // Apply a LoRA adapter to a loaded model
-    // The loaded adapter will be associated to the given model, and will be free when the model is deleted
+    // path_base_model is the path to a higher quality model to use as a base for
-    LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
+    // the layers modified by the adapter. Can be NULL to use the current loaded model.
-            struct llama_model * model,
+    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
-            const char * path_lora);
+    // will be applied on top of the previous one
-
+    // Returns 0 on success
-    // Add a loaded LoRA adapter to given context
+    LLAMA_API int32_t llama_model_apply_lora_from_file(
-    // This will not modify model's weight
+            const struct llama_model * model,
-    LLAMA_API int32_t llama_lora_adapter_set(
+                          const char * path_lora,
-            struct llama_context * ctx,
+                               float   scale,
-            struct llama_lora_adapter * adapter,
+                          const char * path_base_model,
-            float scale);
+                             int32_t   n_threads);
    // Remove a specific LoRA adapter from given context
    // Return -1 if the adapter is not present in the context
    LLAMA_API int32_t llama_lora_adapter_remove(
            struct llama_context * ctx,
            struct llama_lora_adapter * adapter);
    // Remove all LoRA adapters from given context
    LLAMA_API void llama_lora_adapter_clear(
            struct llama_context * ctx);
    // Manually free a LoRA adapter
    // Note: loaded adapters will be free when the associated model is deleted
    LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
    // Apply a loaded control vector to a llama_context, or if data is NULL, clear
    // the currently loaded vector.
@ -677,11 +649,10 @@ extern "C" {
    // State / sessions
    //
-    // Returns the *actual* size in bytes of the state
+    // Returns the maximum size in bytes of the state (rng, logits, embedding
-    // (logits, embedding and kv_cache)
+    // and kv_cache) - will often be smaller after compacting tokens
-    // Only use when saving the state, not when restoring it, otherwise the size may be too small.
+    LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx);
-    LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
+    LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx),
    LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
        "use llama_state_get_size instead");
    // Copies the state to the specified destination address.
@ -689,8 +660,7 @@ extern "C" {
    // Returns the number of bytes copied
    LLAMA_API size_t llama_state_get_data(
            struct llama_context * ctx,
-                         uint8_t * dst,
+                         uint8_t * dst);
                          size_t   size);
    LLAMA_API DEPRECATED(size_t llama_copy_state_data(
            struct llama_context * ctx,
                         uint8_t * dst),
@ -700,8 +670,7 @@ extern "C" {
    // Returns the number of bytes read
    LLAMA_API size_t llama_state_set_data(
            struct llama_context * ctx,
-                   const uint8_t * src,
+                   const uint8_t * src);
                          size_t   size);
    LLAMA_API DEPRECATED(size_t llama_set_state_data(
            struct llama_context * ctx,
                   const uint8_t * src),
@ -743,7 +712,6 @@ extern "C" {
    LLAMA_API size_t llama_state_seq_get_data(
            struct llama_context * ctx,
                         uint8_t * dst,
                          size_t   size,
                    llama_seq_id   seq_id);
    // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
@ -753,7 +721,6 @@ extern "C" {
    LLAMA_API size_t llama_state_seq_set_data(
            struct llama_context * ctx,
                   const uint8_t * src,
                          size_t   size,
                    llama_seq_id   dest_seq_id);
    LLAMA_API size_t llama_state_seq_save_file(
@ -800,14 +767,6 @@ extern "C" {
    // Frees a batch of tokens allocated with llama_batch_init()
    LLAMA_API void llama_batch_free(struct llama_batch batch);
    // Processes a batch of tokens with the ecoder part of the encoder-decoder model.
    // Stores the encoder output internally for later use by the decoder cross-attention layers.
    //   0 - success
    // < 0 - error
    LLAMA_API int32_t llama_encode(
            struct llama_context * ctx,
              struct llama_batch   batch);
    // Positive return values does not mean a fatal error, but rather a warning.
    //   0 - success
    //   1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
@ -819,13 +778,13 @@ extern "C" {
    // Set the number of threads used for decoding
    // n_threads is the number of threads used for generation (single token)
    // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
-    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch);
+    LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
    // Get the number of threads used for generation of a single token.
-    LLAMA_API int32_t llama_n_threads(struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
    // Get the number of threads used for prompt and batch processing (multiple token).
-    LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
    // Set whether the model is in embeddings mode or not
    // If true, embeddings will be returned but logits will not
@ -873,8 +832,7 @@ extern "C" {
    // Get the embeddings for a sequence id
    // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
-    // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence
+    // shape: [n_embd] (1-dimensional)
    // otherwise: float[n_embd] (1-dimensional)
    LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
    //
@ -899,10 +857,12 @@ extern "C" {
    LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
    LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
    LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
    LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
-    LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
+    // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
+    LLAMA_API int32_t         llama_add_bos_token(const struct llama_model * model);
    // Returns -1 if unknown, 1 for true or 0 for false.
    LLAMA_API int32_t         llama_add_eos_token(const struct llama_model * model);
    // Codellama infill tokens
    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
@ -913,14 +873,11 @@ extern "C" {
    //
    // Tokenization
    //
    // The API is thread-safe.
    //
    /// @details Convert the provided text into tokens.
    /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
    /// @return Returns the number of tokens on success, no more than n_tokens_max
    /// @return Returns a negative number on failure - the number of tokens that would have been returned
    /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
    /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
    ///                      as plaintext. Does not insert a leading space.
    LLAMA_API int32_t llama_tokenize(
@ -935,35 +892,15 @@ extern "C" {
    // Token Id -> Piece.
    // Uses the vocabulary in the provided context.
    // Does not write null terminator to the buffer.
-    // User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
+    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
    // @param special If true, special tokens are rendered in the output.
    LLAMA_API int32_t llama_token_to_piece(
              const struct llama_model * model,
                           llama_token   token,
                                  char * buf,
                               int32_t   length,
                               int32_t   lstrip,
                                  bool   special);
    /// @details Convert the provided tokens into text (inverse of llama_tokenize()).
    /// @param text The char pointer must be large enough to hold the resulting text.
    /// @return Returns the number of chars/bytes on success, no more than text_len_max.
    /// @return Returns a negative number on failure - the number of chars/bytes that would have been returned.
    /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
    /// @param unparse_special If true, special tokens are rendered in the output.
    LLAMA_API int32_t llama_detokenize(
        const struct llama_model * model,
               const llama_token * tokens,
                         int32_t   n_tokens,
                            char * text,
                         int32_t   text_len_max,
                            bool   remove_special,
                            bool   unparse_special);
    //
    // Chat templates
    //
    /// Apply chat template. Inspired by hf apply_chat_template() on python.
    /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
    /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
@ -984,114 +921,104 @@ extern "C" {
                               int32_t   length);
    //
-    // Sampling API
+    // Grammar
    //
    // Sample usage:
    //
    //    // prepare the sampling chain at the start
    //    auto sparams = llama_sampler_chain_default_params();
    //
    //    llama_sampler * smpl = llama_sampler_chain_init(sparams);
    //
    //    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(50));
    //    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1));
    //    llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.8));
    //
    //    // typically, the chain should end with a sampler such as "greedy", "dist" or "mirostat"
    //    // this sampler will be responsible to select the actual token
    //    llama_sampler_chain_add(smpl, llama_sampler_init_dist(seed));
    //
    //    ...
    //
    //    // decoding loop:
    //    while (...) {
    //        ...
    //
    //        llama_decode(ctx, batch);
    //
    //        // sample from the logits of the last token in the batch
    //        const llama_token id = llama_sampler_sample(smpl, ctx, -1);
    //
    //        // accepting the token updates the internal state of certain samplers (e.g. grammar, repetition, etc.)
    //        llama_sampler_accept(smpl, id);
    //        ...
    //    }
    //
    //    llama_sampler_free(smpl);
    //
    // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
    // TODO: in the future, the entire sampling API that uses llama_model should start using llama_vocab
    //
-    typedef void * llama_sampler_context_t;
+    LLAMA_API struct llama_grammar * llama_grammar_init(
            const llama_grammar_element ** rules,
                                 size_t    n_rules,
                                 size_t    start_rule_index);
-    // user code can implement the interface below in order to create custom llama_sampler
+    LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
    struct llama_sampler_i {
        const char *           (*name)  (const struct llama_sampler * smpl);                                 // can be NULL
        void                   (*accept)(      struct llama_sampler * smpl, llama_token token);              // can be NULL
        void                   (*apply) (      struct llama_sampler * smpl, llama_token_data_array * cur_p); // required
        void                   (*reset) (      struct llama_sampler * smpl);                                 // can be NULL
        struct llama_sampler * (*clone) (const struct llama_sampler * smpl);                                 // can be NULL if ctx is NULL
        void                   (*free)  (      struct llama_sampler * smpl);                                 // can be NULL if ctx is NULL
-        // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph
+    LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
        //void (*apply_ggml) (struct llama_sampler * smpl, ...);
    };
-    struct llama_sampler {
+    //
-        struct llama_sampler_i  * iface;
+    // Sampling functions
-        llama_sampler_context_t   ctx;
+    //
    };
-    // mirror of llama_sampler_i:
+    // Sets the current rng seed.
-    LLAMA_API const char *           llama_sampler_name  (const struct llama_sampler * smpl);
+    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
    LLAMA_API void                   llama_sampler_accept(      struct llama_sampler * smpl, llama_token token);
    LLAMA_API void                   llama_sampler_apply (      struct llama_sampler * smpl, llama_token_data_array * cur_p);
    LLAMA_API void                   llama_sampler_reset (      struct llama_sampler * smpl);
    LLAMA_API struct llama_sampler * llama_sampler_clone (const struct llama_sampler * smpl);
    // important: do not free if the sampler has been added to a llama_sampler_chain (via llama_sampler_chain_add)
    LLAMA_API void                   llama_sampler_free  (      struct llama_sampler * smpl);
-    // llama_sampler_chain
+    /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
-    // a type of llama_sampler that can chain multiple samplers one after another
+    /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
    LLAMA_API void llama_sample_repetition_penalties(
            struct llama_context * ctx,
          llama_token_data_array * candidates,
               const llama_token * last_tokens,
                          size_t   penalty_last_n,
                           float   penalty_repeat,
                           float   penalty_freq,
                           float   penalty_present);
-    LLAMA_API struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params);
+    /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
-
+    /// @param logits Logits extracted from the original generation context.
-    // important: takes ownership of the sampler object and will free it when llama_sampler_free is called
+    /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
-    LLAMA_API void                   llama_sampler_chain_add(      struct llama_sampler * chain, struct llama_sampler * smpl);
+    /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
-    LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
+    LLAMA_API void llama_sample_apply_guidance(
-    LLAMA_API int                    llama_sampler_chain_n  (const struct llama_sampler * chain);
+              struct llama_context * ctx,
-
+                             float * logits,
-    // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
+                             float * logits_guidance,
-    LLAMA_API struct llama_sampler * llama_sampler_chain_remove(   struct llama_sampler * chain, int32_t i);
+                             float   scale);
    // available samplers:
    LLAMA_API struct llama_sampler * llama_sampler_init_greedy     (void);
    LLAMA_API struct llama_sampler * llama_sampler_init_dist       (uint32_t seed);
    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
-    /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
+    LLAMA_API void llama_sample_softmax(
-    LLAMA_API struct llama_sampler * llama_sampler_init_softmax    (void);
+            struct llama_context * ctx,
          llama_token_data_array * candidates);
    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API struct llama_sampler * llama_sampler_init_top_k      (int32_t k);
+    LLAMA_API void llama_sample_top_k(
            struct llama_context * ctx,
          llama_token_data_array * candidates,
                         int32_t   k,
                          size_t   min_keep);
    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
-    LLAMA_API struct llama_sampler * llama_sampler_init_top_p      (float   p, size_t min_keep);
+    LLAMA_API void llama_sample_top_p(
            struct llama_context * ctx,
          llama_token_data_array * candidates,
                           float   p,
                          size_t   min_keep);
    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
-    LLAMA_API struct llama_sampler * llama_sampler_init_min_p      (float   p, size_t min_keep);
+    LLAMA_API void llama_sample_min_p(
            struct llama_context * ctx,
          llama_token_data_array * candidates,
                           float   p,
                          size_t   min_keep);
    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
-    LLAMA_API struct llama_sampler * llama_sampler_init_tail_free  (float   z, size_t min_keep);
+    LLAMA_API void llama_sample_tail_free(
            struct llama_context * ctx,
          llama_token_data_array * candidates,
                           float   z,
                          size_t   min_keep);
    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
-    LLAMA_API struct llama_sampler * llama_sampler_init_typical    (float   p, size_t min_keep);
+    LLAMA_API void llama_sample_typical(
-    LLAMA_API struct llama_sampler * llama_sampler_init_temp       (float   t);
+            struct llama_context * ctx,
          llama_token_data_array * candidates,
                           float   p,
                          size_t   min_keep);
-    /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
+    /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
-    LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext   (float   t, float   delta, float exponent);
+    LLAMA_API void llama_sample_entropy(
            struct llama_context * ctx,
          llama_token_data_array * candidates_p,
                           float   min_temp,
                           float   max_temp,
                           float   exponent_val);
    LLAMA_API void llama_sample_temp(
            struct llama_context * ctx,
          llama_token_data_array * candidates,
                           float   temp);
    /// @details Apply constraints from grammar
    LLAMA_API void llama_sample_grammar(
            struct llama_context * ctx,
          llama_token_data_array * candidates,
      const struct llama_grammar * grammar);
    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
@ -1099,62 +1026,42 @@ extern "C" {
    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
    /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API struct llama_sampler * llama_sampler_init_mirostat(
+    LLAMA_API llama_token llama_sample_token_mirostat(
-                             int32_t   n_vocab,
+            struct llama_context * ctx,
-                            uint32_t   seed,
+          llama_token_data_array * candidates,
-                               float   tau,
+                           float   tau,
-                               float   eta,
+                           float   eta,
-                             int32_t   m);
+                         int32_t   m,
                           float * mu);
    /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
-    LLAMA_API struct llama_sampler * llama_sampler_init_mirostat_v2(
+    LLAMA_API llama_token llama_sample_token_mirostat_v2(
-                            uint32_t   seed,
+            struct llama_context * ctx,
-                               float   tau,
+          llama_token_data_array * candidates,
-                               float   eta);
+                           float   tau,
                           float   eta,
                           float * mu);
-    LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
+    /// @details Selects the token with the highest probability.
-            const struct llama_model * model,
+    ///          Does not compute the token probabilities. Use llama_sample_softmax() instead.
-                          const char * grammar_str,
+    LLAMA_API llama_token llama_sample_token_greedy(
-                          const char * grammar_root);
+            struct llama_context * ctx,
          llama_token_data_array * candidates);
-    LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
+    /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
-                             int32_t   n_vocab,         // llama_n_vocab()
+    LLAMA_API llama_token llama_sample_token(
-                         llama_token   special_eos_id,  // llama_token_eos()
+            struct llama_context * ctx,
-                         llama_token   linefeed_id,     // llama_token_nl()
+          llama_token_data_array * candidates);
                             int32_t   penalty_last_n,  // last n tokens to penalize (0 = disable penalty, -1 = context size)
                               float   penalty_repeat,  // 1.0 = disabled
                               float   penalty_freq,    // 0.0 = disabled
                               float   penalty_present, // 0.0 = disabled
                                bool   penalize_nl,     // consider newlines as a repeatable token
                                bool   ignore_eos);     // ignore the end-of-sequence token
-    LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
+    /// @details Accepts the sampled token into the grammar
-                             int32_t   n_vocab,
+    LLAMA_API void llama_grammar_accept_token(
-                             int32_t   n_logit_bias,
+            struct llama_context * ctx,
-              const llama_logit_bias * logit_bias);
+            struct llama_grammar * grammar,
-
+                     llama_token   token);
    // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
    LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
    /// @details Sample and accept a token from the idx-th output of the last evaluation
    //
    // Shorthand for:
    //    const auto * logits = llama_get_logits_ith(ctx, idx);
    //    llama_token_data_array cur_p = { ... init from logits ... };
    //    llama_sampler_apply(smpl, &cur_p);
    //    auto token = cur_p.data[cur_p.selected].id;
    //    llama_sampler_accept(smpl, token);
    //    return token;
    // Returns the sampled token
    LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);
    // TODO: extend in the future
    //LLAMA_API void llama_decode_with_sampler(struct llama_context * ctx, struct llama_sampler * smpl, struct llama_batch batch, ...);
    //
    // Model split
@ -1170,6 +1077,12 @@ extern "C" {
    //  Returns the split_prefix length.
    LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
    // Performance information
    LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
    LLAMA_API void llama_print_timings(struct llama_context * ctx);
    LLAMA_API void llama_reset_timings(struct llama_context * ctx);
    // Print system information
    LLAMA_API const char * llama_print_system_info(void);
@ -1177,41 +1090,58 @@ extern "C" {
    // If this is not called, or NULL is supplied, everything is output on stderr.
    LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
-    //
+    LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
    // Performance utils
    //
    // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
    //
    // Plain-data snapshot returned by value from llama_perf_context().
    // Holds four timing fields (milliseconds, per the `_ms` suffix) and two int32 counters.
    // NOTE(review): the per-field meanings below are inferred from the field names — confirm against llama.cpp.
    struct llama_perf_context_data {
        double t_start_ms;  // presumably the timestamp at which measurement started
        double t_load_ms;   // presumably time spent loading the model
        double t_p_eval_ms; // presumably time spent evaluating the prompt
        double t_eval_ms;   // presumably time spent evaluating generated tokens

        int32_t n_p_eval;   // counter paired by name with t_p_eval_ms
        int32_t n_eval;     // counter paired by name with t_eval_ms
    };
    // Plain-data snapshot returned by value from llama_perf_sampler().
    // NOTE(review): the per-field meanings below are inferred from the field names — confirm against llama.cpp.
    struct llama_perf_sampler_data {
        double t_sample_ms; // presumably accumulated sampling time, in milliseconds per the `_ms` suffix

        int32_t n_sample;   // counter paired by name with t_sample_ms
    };
    LLAMA_API struct llama_perf_context_data llama_perf_context      (const struct llama_context * ctx);
    LLAMA_API void                           llama_perf_context_print(const struct llama_context * ctx);
    LLAMA_API void                           llama_perf_context_reset(      struct llama_context * ctx);
    // NOTE: the following work only with samplers constructed via llama_sampler_chain_init
    LLAMA_API struct llama_perf_sampler_data llama_perf_sampler      (const struct llama_sampler * chain);
    LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
    LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);
    LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
 #ifdef __cplusplus
 }
 #endif
 // Internal API to be implemented by llama.cpp and used by tests/benchmarks only
 #ifdef LLAMA_API_INTERNAL
 #include <random>
 #include <string>
 #include <vector>
 struct ggml_tensor;
 // State of a UTF-8 sequence that has been only partially decoded so far:
 // the bits accumulated up to now plus how many bytes are still expected.
 struct llama_partial_utf8 {
    uint32_t value;    // bit value so far (unshifted)
    int      n_remain; // num bytes remaining; -1 indicates invalid sequence
 };
 // Grammar state exposed through the internal (LLAMA_API_INTERNAL) API:
 // a const rule table, a collection of element-pointer stacks, and a partial UTF-8 buffer.
 struct llama_grammar {
    // rule table: a vector of rules, each rule a vector of grammar elements; declared const
    const std::vector<std::vector<llama_grammar_element>>   rules;
    // collection of stacks, each a vector of pointers to const grammar elements
    // NOTE(review): presumably these pointers refer into `rules` — confirm against the implementation
    std::vector<std::vector<const llama_grammar_element *>> stacks;
    // buffer for partially generated UTF-8 sequence from accepted tokens
    llama_partial_utf8                                      partial_utf8;
 };
 // One candidate to be checked against a grammar.
 // NOTE(review): the field meanings below are inferred from names and types — confirm against llama.cpp.
 struct llama_grammar_candidate {
    size_t               index;        // presumably the candidate's position in its originating array
    const uint32_t     * code_points;  // read-only pointer to 32-bit code points (termination convention not shown here)
    llama_partial_utf8   partial_utf8; // partial UTF-8 decoding state carried along with this candidate
 };
 const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
    struct llama_context * ctx
 );
 void llama_grammar_accept(
        const std::vector<std::vector<llama_grammar_element>>         & rules,
        const std::vector<std::vector<const llama_grammar_element *>> & stacks,
        const uint32_t                                                  chr,
        std::vector<std::vector<const llama_grammar_element *>>       & new_stacks);
 std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
        const std::string & src,
        llama_partial_utf8   partial_start);
 // Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
 // This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
 llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng);
 #endif // LLAMA_API_INTERNAL
 #endif // LLAMA_H
--- a/examples/talk-llama/talk-llama.cpp
+++ b/examples/talk-llama/talk-llama.cpp
@ -35,10 +35,10 @@ static std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const
 static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false);
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
    if (n_tokens < 0) {
        result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
        GGML_ASSERT(check == -n_tokens);
    } else {
        result.resize(n_tokens);
@ -314,6 +314,7 @@ int main(int argc, char ** argv) {
    // tune these to your liking
    lcparams.n_ctx      = 2048;
    lcparams.seed       = 1;
    lcparams.n_threads  = params.n_threads;
    lcparams.flash_attn = params.flash_attn;
@ -401,26 +402,6 @@ int main(int argc, char ** argv) {
    llama_batch batch = llama_batch_init(llama_n_ctx(ctx_llama), 0, 1);
    // init sampler
    const float top_k = 5;
    const float top_p = 0.80f;
    const float temp  = 0.30f;
    const int seed = 0;
    auto sparams = llama_sampler_chain_default_params();
    llama_sampler * smpl = llama_sampler_chain_init(sparams);
    if (temp > 0.0f) {
        llama_sampler_chain_add(smpl, llama_sampler_init_top_k(top_k));
        llama_sampler_chain_add(smpl, llama_sampler_init_top_p(top_p, 1));
        llama_sampler_chain_add(smpl, llama_sampler_init_temp (temp));
        llama_sampler_chain_add(smpl, llama_sampler_init_dist (seed));
    } else {
        llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
    }
    // init session
    std::string path_session = params.path_session;
    std::vector<llama_token> session_tokens;
@ -436,7 +417,7 @@ int main(int argc, char ** argv) {
            session_tokens.resize(llama_n_ctx(ctx_llama));
            size_t n_token_count_out = 0;
-            if (!llama_state_load_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
+            if (!llama_load_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
                fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
                return 1;
            }
@ -719,13 +700,54 @@ int main(int argc, char ** argv) {
                    {
                        // out of user input, sample next token
                        const float top_k          = 5;
                        const float top_p          = 0.80f;
                        const float temp           = 0.30f;
                        const float repeat_penalty = 1.1764f;
                        const int repeat_last_n    = 256;
                        if (!path_session.empty() && need_to_save_session) {
                            need_to_save_session = false;
-                            llama_state_save_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size());
+                            llama_save_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size());
                        }
-                        const llama_token id = llama_sampler_sample(smpl, ctx_llama, -1);
+                        llama_token id = 0;
                        {
                            auto logits = llama_get_logits(ctx_llama);
                            auto n_vocab = llama_n_vocab(model_llama);
                            logits[llama_token_eos(model_llama)] = 0;
                            std::vector<llama_token_data> candidates;
                            candidates.reserve(n_vocab);
                            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
                                candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
                            }
                            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
                            // apply repeat penalty
                            const float nl_logit = logits[llama_token_nl(model_llama)];
                            llama_sample_repetition_penalties(ctx_llama, &candidates_p,
                                    embd_inp.data() + std::max(0, n_past - repeat_last_n),
                                    repeat_last_n, repeat_penalty, 0.0, 0.0f);
                            logits[llama_token_nl(model_llama)] = nl_logit;
                            if (temp <= 0) {
                                // Greedy sampling
                                id = llama_sample_token_greedy(ctx_llama, &candidates_p);
                            } else {
                                // Temperature sampling
                                llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
                                llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
                                llama_sample_temp (ctx_llama, &candidates_p, temp);
                                id = llama_sample_token(ctx_llama, &candidates_p);
                            }
                        }
                        if (id != llama_token_eos(model_llama)) {
                            // add it to the context
@ -775,14 +797,8 @@ int main(int argc, char ** argv) {
    whisper_print_timings(ctx_wsp);
    whisper_free(ctx_wsp);
-    llama_perf_sampler_print(smpl);
+    llama_print_timings(ctx_llama);
    llama_perf_context_print(ctx_llama);
    llama_sampler_free(smpl);
    llama_batch_free(batch);
    llama_free(ctx_llama);
    llama_backend_free();
    return 0;
 }
--- a/examples/talk-llama/unicode-data.cpp
+++ b/examples/talk-llama/unicode-data.cpp
@ -7,7 +7,7 @@
 #include <unordered_map>
 #include <unordered_set>
-const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = {  // start, flags // last=next_start-1
+const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = {  // start, flags // last=next_start-1
 {0x000000, 0x0080},
 {0x000020, 0x0008},
 {0x000021, 0x0020},
@ -2311,8 +2311,7 @@ const std::unordered_set<uint32_t> unicode_set_whitespace = {
 0x003000,
 };
-// list is always in ascending order, to enable binary search
+const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {
 const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase = {
 {0x000041, 0x000061},
 {0x000042, 0x000062},
 {0x000043, 0x000063},
@ -3748,8 +3747,7 @@ const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase
 {0x01E921, 0x01E943},
 };
-// list is always in ascending order, to enable binary search
+const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {
 const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase = {
 {0x000061, 0x000041},
 {0x000062, 0x000042},
 {0x000063, 0x000043},
@ -5202,7 +5200,7 @@ const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase
 {0x01E943, 0x01E921},
 };
-const std::initializer_list<range_nfd> unicode_ranges_nfd = {  // start, last, nfd
+const std::vector<range_nfd> unicode_ranges_nfd = {  // start, last, nfd
 {0x000000, 0x000000, 0x000000},
 {0x0000C0, 0x0000C5, 0x000041},
 {0x0000C7, 0x0000C7, 0x000043},
@ -7032,3 +7030,4 @@ const std::initializer_list<range_nfd> unicode_ranges_nfd = {  // start, last, n
 {0x02FA1C, 0x02FA1C, 0x009F3B},
 {0x02FA1D, 0x02FA1D, 0x02A600},
 };
--- a/examples/talk-llama/unicode-data.h
+++ b/examples/talk-llama/unicode-data.h
@ -13,8 +13,8 @@ struct range_nfd {
 static const uint32_t MAX_CODEPOINTS = 0x110000;
-extern const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
+extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
 extern const std::unordered_set<uint32_t> unicode_set_whitespace;
-extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase;
+extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
-extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase;
+extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
-extern const std::initializer_list<range_nfd> unicode_ranges_nfd;
+extern const std::vector<range_nfd> unicode_ranges_nfd;
--- a/examples/talk-llama/unicode.cpp
+++ b/examples/talk-llama/unicode.cpp
@ -1,11 +1,6 @@
 #if defined(_MSC_VER)
 #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
 #endif
 #include "unicode.h"
 #include "unicode-data.h"
 #include <algorithm>
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
@ -20,12 +15,6 @@
 #include <locale>
 #include <codecvt>
 // Returns the byte length of a UTF-8 sequence, judged solely from the upper
 // four bits of its first byte `src`:
 //   0x0..0xB -> 1, 0xC..0xD -> 2, 0xE -> 3, 0xF -> 4.
 // No validation is performed; any byte whose high nibble is below 0xC yields 1.
 size_t unicode_len_utf8(char src) {
    // go through uint8_t so a negative `char` does not produce a negative shift result
    const uint8_t nibble = static_cast<uint8_t>(src) >> 4;
    if (nibble < 0xC) {
        return 1;
    }
    if (nibble < 0xE) {
        return 2;
    }
    return nibble == 0xE ? 3 : 4;
 }
 static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
    std::string result;
    for (size_t i = 0; i < cps.size(); ++i) {
@ -34,7 +23,7 @@ static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
    return result;
 }
-uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
+static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
    assert(offset < utf8.size());
    if (!(utf8[offset + 0] & 0x80)) {
        auto result = utf8[offset + 0];
@ -123,11 +112,11 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
 static std::vector<codepoint_flags> unicode_cpt_flags_array() {
    std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
-    assert (unicode_ranges_flags.begin()[0].first == 0);
+    assert (unicode_ranges_flags.front().first == 0);
-    assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS);
+    assert (unicode_ranges_flags.back().first == MAX_CODEPOINTS);
    for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) {
-        const auto range_ini = unicode_ranges_flags.begin()[i-1];  // codepoint_ini, flags
+        const auto range_ini = unicode_ranges_flags[i-1];  // codepoint_ini, flags
-        const auto range_end = unicode_ranges_flags.begin()[i];    // codepoint_end, flags
+        const auto range_end = unicode_ranges_flags[i];    // codepoint_end, flags
        for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) {
            cpt_flags[cpt] = range_ini.second;
        }
@ -243,7 +232,8 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
        };
        auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
-            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
+            static const codepoint_flags undef(codepoint_flags::UNDEFINED);
            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
        };
        size_t _prev_end = offset_ini;
@ -305,9 +295,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
                continue;
            }
            // regex: <space>?[^\s\p{L}\p{N}]+
-            if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
+            if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
                pos += (cpt == ' ');
-                while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
+                while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
                    flags2 = _get_flags(++pos);
                }
                _add_token(pos);
@ -361,7 +351,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
        };
        auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
-            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
+            static const codepoint_flags undef(codepoint_flags::UNDEFINED);
            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
        };
        size_t _prev_end = offset_ini;
@ -403,8 +394,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
                }
            }
-            // regex: [^\r\n\p{L}\p{N}]?\p{L}+
+            // regex: [^\r\n\p{L}\p{N}]?\p{L}+  //####FIXME: the first \p{L} is correct?
-            if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) {
+            if (!(cpt == '\r' || cpt == '\n' || /*flags.is_letter |*/ flags.is_number)) {
                if (flags.is_letter || _get_flags(pos+1).is_letter) {  // one or more letters
                    pos++;
                    while (_get_flags(pos).is_letter) {
@ -430,9 +421,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
            // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
            auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
-            if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags.as_uint()) {
+            if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
                pos += (cpt == ' ');
-                while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
+                while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
                    flags2 = _get_flags(++pos);
                }
                uint32_t cpt2 = _get_cpt(pos);
@ -597,7 +588,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
    std::vector<uint32_t> result(cpts.size());
    for (size_t i = 0; i < cpts.size(); ++i) {
        const uint32_t cpt = cpts[i];
-        auto it = std::upper_bound(unicode_ranges_nfd.begin(), unicode_ranges_nfd.end(), cpt, comp) - 1;
+        auto it = std::upper_bound(unicode_ranges_nfd.cbegin(), unicode_ranges_nfd.cend(), cpt, comp) - 1;
        result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt;
    }
    return result;
@ -639,15 +630,8 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
 }
 uint32_t unicode_tolower(uint32_t cp) {
-    // binary search
+    auto it = unicode_map_lowercase.find(cp);
-    auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cp,
+    return it == unicode_map_lowercase.end() ? cp : it->second;
        [](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
            return pair.first < value;
        });
    if (it != unicode_map_lowercase.end() && it->first == cp) {
        return it->second;
    }
    return cp;  // Return the original code point if no lowercase mapping is found
 }
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
--- a/examples/talk-llama/unicode.h
+++ b/examples/talk-llama/unicode.h
@ -4,8 +4,6 @@
 #include <string>
 #include <vector>
 // TODO: prefix all symbols with "llama_"
 struct codepoint_flags {
    enum {
        UNDEFINED       = 0x0001,
@ -48,10 +46,8 @@ struct codepoint_flags {
    }
 };
 size_t unicode_len_utf8(char src);
 std::string unicode_cpt_to_utf8(uint32_t cp);
 uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
 std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
 std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
--- a/examples/twitch.sh
+++ b/examples/twitch.sh
@ -21,7 +21,7 @@ help()
    echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]"
    echo "options:"
    echo "-s       Step in seconds (default is $step)."
-    echo "-m       Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large-v3' 'large-v3-turbo' (default is '$model')."
+    echo "-m       Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large-v3' (default is '$model')."
    echo "-t       Number of threads to use."
    echo "-h       Print this help page."
    echo
--- a/examples/whisper.android.java/app/src/main/jni/whisper/CMakeLists.txt
+++ b/examples/whisper.android.java/app/src/main/jni/whisper/CMakeLists.txt
@ -7,9 +7,8 @@ set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../../)
 set(SOURCE_FILES
    ${WHISPER_LIB_DIR}/ggml/src/ggml.c
    ${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
    ${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
-    ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp
+    ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.c
    ${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
    ${WHISPER_LIB_DIR}/src/whisper.cpp
    ${CMAKE_SOURCE_DIR}/jni.c
--- a/examples/whisper.android/lib/src/main/jni/whisper/CMakeLists.txt
+++ b/examples/whisper.android/lib/src/main/jni/whisper/CMakeLists.txt
@ -19,9 +19,8 @@ if (NOT GGML_HOME)
        SOURCE_FILES
        ${SOURCE_FILES}
        ${WHISPER_LIB_DIR}/ggml/src/ggml.c
        ${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
        ${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
-        ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp
+        ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.c
        ${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
        )
 endif()
--- a/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
+++ b/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
@ -7,7 +7,6 @@
 	objects = {
 /* Begin PBXBuildFile section */
 		18133C802C64E342005CEAAC /* ggml-aarch64.c in Sources */ = {isa = PBXBuildFile; fileRef = 18133C7F2C64E342005CEAAC /* ggml-aarch64.c */; };
 		1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 184447182AB211A2007D6BFE /* ggml-alloc.c */; };
 		1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 1844471B2AB21655007D6BFE /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-framework Foundation -framework Metal -framework MetalKit -fno-objc-arc"; }; };
 		18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C7A29052BDF00BD2A04 /* AppDelegate.m */; };
@ -22,7 +21,7 @@
 		18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; };
 		18A276062C2A98A5001C8D37 /* ggml-metal.metal in Copy Files */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
 		18A2760B2C2A9B43001C8D37 /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
-		18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.cpp */; };
+		18ABE15A2AF556340044A204 /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.c */; };
 		18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; };
 		7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; };
 		7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */; };
@ -45,8 +44,6 @@
 /* End PBXCopyFilesBuildPhase section */
 /* Begin PBXFileReference section */
 		18133C7E2C64E342005CEAAC /* ggml-aarch64.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-aarch64.h"; path = "../../../ggml/src/ggml-aarch64.h"; sourceTree = "<group>"; };
 		18133C7F2C64E342005CEAAC /* ggml-aarch64.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-aarch64.c"; path = "../../../ggml/src/ggml-aarch64.c"; sourceTree = "<group>"; };
 		184447182AB211A2007D6BFE /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../../ggml/src/ggml-alloc.c"; sourceTree = "<group>"; };
 		184447192AB211A2007D6BFE /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../../ggml/include/ggml-alloc.h"; sourceTree = "<group>"; };
 		1844471B2AB21655007D6BFE /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../../ggml/src/ggml-metal.m"; sourceTree = "<group>"; };
@ -73,7 +70,7 @@
 		18ABE1542AF556340044A204 /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../../ggml/src/ggml-quants.h"; sourceTree = "<group>"; };
 		18ABE1552AF556340044A204 /* ggml-backend.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../../ggml/include/ggml-backend.h"; sourceTree = "<group>"; };
 		18ABE1562AF556340044A204 /* ggml-backend-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend-impl.h"; path = "../../../ggml/src/ggml-backend-impl.h"; sourceTree = "<group>"; };
-		18ABE1572AF556340044A204 /* ggml-backend.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.cpp; fileEncoding = 4; name = "ggml-backend.cpp"; path = "../../../ggml/src/ggml-backend.cpp"; sourceTree = "<group>"; };
+		18ABE1572AF556340044A204 /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../../ggml/src/ggml-backend.c"; sourceTree = "<group>"; };
 		18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
 		18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
 		7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-encoder-impl.m"; sourceTree = "<group>"; };
@ -115,12 +112,10 @@
 		18627C7829052BDF00BD2A04 /* whisper.objc */ = {
 			isa = PBXGroup;
 			children = (
 				18133C7F2C64E342005CEAAC /* ggml-aarch64.c */,
 				18133C7E2C64E342005CEAAC /* ggml-aarch64.h */,
 				18A275FF2C2A9563001C8D37 /* ggml-common.h */,
 				18A275FE2C2A94DE001C8D37 /* ggml-metal.h */,
 				18ABE1562AF556340044A204 /* ggml-backend-impl.h */,
-				18ABE1572AF556340044A204 /* ggml-backend.cpp */,
+				18ABE1572AF556340044A204 /* ggml-backend.c */,
 				18ABE1552AF556340044A204 /* ggml-backend.h */,
 				18ABE1582AF556340044A204 /* ggml-impl.h */,
 				18ABE1592AF556340044A204 /* ggml-quants.c */,
@ -241,14 +236,13 @@
 			files = (
 				18627C8129052BDF00BD2A04 /* ViewController.m in Sources */,
 				18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */,
 				18133C802C64E342005CEAAC /* ggml-aarch64.c in Sources */,
 				7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */,
 				18627C9429052C4900BD2A04 /* whisper.cpp in Sources */,
 				18627C9629052C5800BD2A04 /* ggml.c in Sources */,
 				18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */,
 				7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */,
 				1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */,
-				18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */,
+				18ABE15A2AF556340044A204 /* ggml-backend.c in Sources */,
 				18627C8C29052BE000BD2A04 /* main.m in Sources */,
 				18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */,
 				1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */,
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@ -50,24 +50,9 @@ else()
    set(GGML_BLAS_VENDOR_DEFAULT "Generic")
 endif()
 if (CMAKE_CROSSCOMPILING)
    set(GGML_NATIVE_DEFAULT OFF)
 else()
    set(GGML_NATIVE_DEFAULT ON)
 endif()
 # defaults
 if (NOT GGML_LLAMAFILE_DEFAULT)
    set(GGML_LLAMAFILE_DEFAULT OFF)
 endif()
 if (NOT GGML_CUDA_GRAPHS_DEFAULT)
    set(GGML_CUDA_GRAPHS_DEFAULT OFF)
 endif()
 # general
 option(GGML_STATIC "ggml: static link libraries"         OFF)
-option(GGML_NATIVE "ggml: enable -march=native flag"     ${GGML_NATIVE_DEFAULT})
+option(GGML_NATIVE "ggml: enable -march=native flag"     ON)
 option(GGML_LTO    "ggml: enable link time optimization" OFF)
 option(GGML_CCACHE "ggml: use ccache if available"       ON)
@ -85,7 +70,7 @@ option(GGML_SANITIZE_ADDRESS   "ggml: enable address sanitizer"   OFF)
 option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
 # instruction set specific
-if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT)
+if (GGML_NATIVE)
    set(INS_ENB OFF)
 else()
    set(INS_ENB ON)
@ -119,13 +104,11 @@ option(GGML_ACCELERATE                      "ggml: enable Accelerate framework"
 option(GGML_BLAS                            "ggml: use BLAS"                                  ${GGML_BLAS_DEFAULT})
 set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
                                            "ggml: BLAS library vendor")
-option(GGML_LLAMAFILE                       "ggml: use LLAMAFILE"                             ${GGML_LLAMAFILE_DEFAULT})
+option(GGML_LLAMAFILE                       "ggml: use ggml SGEMM"                            OFF)
 option(GGML_CUDA                            "ggml: use CUDA"                                  OFF)
 option(GGML_MUSA                            "ggml: use MUSA"                                  OFF)
 option(GGML_CUDA_FORCE_DMMV                 "ggml: use dmmv instead of mmvq CUDA kernels"     OFF)
 option(GGML_CUDA_FORCE_MMQ                  "ggml: use mmq kernels instead of cuBLAS"         OFF)
 option(GGML_CUDA_FORCE_CUBLAS               "ggml: always use cuBLAS instead of mmq kernels"  OFF)
 set   (GGML_CUDA_DMMV_X   "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
 set   (GGML_CUDA_MMV_Y     "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
 option(GGML_CUDA_F16                        "ggml: use 16 bit floats for some calculations"   OFF)
@ -136,16 +119,14 @@ set   (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
 option(GGML_CUDA_NO_PEER_COPY               "ggml: do not use peer to peer copies"            OFF)
 option(GGML_CUDA_NO_VMM                     "ggml: do not try to use CUDA VMM"                OFF)
 option(GGML_CUDA_FA_ALL_QUANTS              "ggml: compile all quants for FlashAttention"     OFF)
 option(GGML_CUDA_GRAPHS                     "ggml: use CUDA graphs (llama.cpp only)"          ${GGML_CUDA_GRAPHS_DEFAULT})
 option(GGML_CURL                            "ggml: use libcurl to download model from an URL" OFF)
 option(GGML_HIPBLAS                         "ggml: use hipBLAS"                               OFF)
 option(GGML_HIP_UMA                         "ggml: use HIP unified memory architecture"       OFF)
 option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)
 option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks"                      OFF)
 option(GGML_VULKAN_DEBUG                    "ggml: enable Vulkan debug output"                OFF)
 option(GGML_VULKAN_MEMORY_DEBUG             "ggml: enable Vulkan memory debug output"         OFF)
 option(GGML_VULKAN_SHADER_DEBUG_INFO        "ggml: enable Vulkan shader debug info"           OFF)
 option(GGML_VULKAN_PERF                     "ggml: enable Vulkan perf output"                 OFF)
 option(GGML_VULKAN_VALIDATE                 "ggml: enable Vulkan validation"                  OFF)
 option(GGML_VULKAN_RUN_TESTS                "ggml: run Vulkan tests"                          OFF)
 option(GGML_KOMPUTE                         "ggml: use Kompute"                               OFF)
@ -211,20 +192,13 @@ endif ()
 include(GNUInstallDirs)
 include(CMakePackageConfigHelpers)
 # all public headers
 set(GGML_PUBLIC_HEADERS
    include/ggml.h
    include/ggml-alloc.h
    include/ggml-backend.h
-    include/ggml-blas.h
+    "${GGML_HEADERS_CUDA}"
-    include/ggml-cann.h
+    "${GGML_HEADERS_METAL}"
-    include/ggml-cuda.h
+    "${GGML_HEADERS_EXTRA}")
    include/ggml.h
    include/ggml-kompute.h
    include/ggml-metal.h
    include/ggml-rpc.h
    include/ggml-sycl.h
    include/ggml-vulkan.h)
 set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
 #if (GGML_METAL)
--- a/ggml/include/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h
@ -7,8 +7,8 @@ extern "C" {
 #endif
 typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-typedef struct      ggml_backend_buffer * ggml_backend_buffer_t;
+typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
-typedef struct             ggml_backend * ggml_backend_t;
+typedef struct ggml_backend * ggml_backend_t;
 // Tensor allocator
 struct ggml_tallocr {
@ -24,7 +24,7 @@ GGML_API void                ggml_tallocr_alloc(struct ggml_tallocr * talloc, st
 // Graph allocator
 /*
  Example usage:
-    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
+    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());
    // optional: create a worst-case graph and reserve the buffers to avoid reallocations
    ggml_gallocr_reserve(galloc, build_graph(max_batch));
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@ -12,52 +12,41 @@ extern "C" {
    typedef struct ggml_backend_event * ggml_backend_event_t;
    typedef struct ggml_backend * ggml_backend_t;
    typedef void * ggml_backend_graph_plan_t;
    typedef struct ggml_backend_reg * ggml_backend_reg_t;
    typedef struct ggml_backend_device * ggml_backend_dev_t;
    //
    // Backend buffer type
    //
    GGML_API const char *          ggml_backend_buft_name          (ggml_backend_buffer_type_t buft);
    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer  (ggml_backend_buffer_type_t buft, size_t size);
    GGML_API size_t                ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
    GGML_API size_t                ggml_backend_buft_get_max_size  (ggml_backend_buffer_type_t buft);
    GGML_API size_t                ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
    GGML_API bool                  ggml_backend_buft_is_host       (ggml_backend_buffer_type_t buft);
    GGML_API ggml_backend_dev_t    ggml_backend_buft_get_device    (ggml_backend_buffer_type_t buft);
    //
    // Backend buffer
    //
    // buffer type
    GGML_API           const char *          ggml_backend_buft_name            (ggml_backend_buffer_type_t buft);
    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer    (ggml_backend_buffer_type_t buft, size_t size);
    GGML_API           size_t                ggml_backend_buft_get_alignment   (ggml_backend_buffer_type_t buft);
    GGML_API           size_t                ggml_backend_buft_get_max_size    (ggml_backend_buffer_type_t buft);
    GGML_API GGML_CALL size_t                ggml_backend_buft_get_alloc_size  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
    GGML_API           bool                  ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);
    // buffer
    enum ggml_backend_buffer_usage {
        GGML_BACKEND_BUFFER_USAGE_ANY = 0,
        GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
        GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
    };
-    GGML_API const char *                   ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
+    GGML_API           const char *               ggml_backend_buffer_name          (ggml_backend_buffer_t buffer);
-    GGML_API void                           ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
+    GGML_API           void                       ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
-    GGML_API void *                         ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
+    GGML_API           void *                     ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
-    GGML_API size_t                         ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
+    GGML_API           size_t                     ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
-    GGML_API void                           ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API GGML_CALL void                       ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API size_t                         ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API           size_t                     ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
-    GGML_API size_t                         ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
+    GGML_API           size_t                     ggml_backend_buffer_get_max_size  (ggml_backend_buffer_t buffer);
-    GGML_API size_t                         ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API           size_t                     ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API void                           ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
+    GGML_API           void                       ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
-    GGML_API bool                           ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
+    GGML_API           bool                       ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
-    GGML_API void                           ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+    GGML_API           void                       ggml_backend_buffer_set_usage     (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
-    GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage     (ggml_backend_buffer_t buffer);
+    GGML_API           ggml_backend_buffer_type_t ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
-    GGML_API ggml_backend_buffer_type_t     ggml_backend_buffer_get_type      (ggml_backend_buffer_t buffer);
+    GGML_API           void                       ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
    GGML_API void                           ggml_backend_buffer_reset         (ggml_backend_buffer_t buffer);
    // tensor copy between different backends
    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
    //
-    // Backend (stream)
+    // Backend
    //
    GGML_API ggml_guid_t  ggml_backend_guid(ggml_backend_t backend);
@ -72,10 +61,8 @@ extern "C" {
    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-    // "offset" refers to the offset of the tensor data for setting/getting data
+    GGML_API GGML_CALL void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
    GGML_API void ggml_backend_tensor_memset(   struct ggml_tensor * tensor,     uint8_t value, size_t offset, size_t size);
    GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
@ -85,118 +72,64 @@ extern "C" {
    GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
    GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
    GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
    // NOTE: will be removed, use device version instead
    GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
    GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
    GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
    // tensor copy between different backends
    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
    // asynchronous copy
    // the copy is performed after all the currently queued operations in backend_src
    // backend_dst will wait for the copy to complete before performing other operations
    // automatic fallback to sync copy if async is not supported
    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
-    GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);
+    // events
    GGML_API ggml_backend_event_t   ggml_backend_event_new        (ggml_backend_t backend);
    GGML_API void                   ggml_backend_event_free       (ggml_backend_event_t event);
    GGML_API void                   ggml_backend_event_record     (ggml_backend_event_t event);
    GGML_API void                   ggml_backend_event_synchronize(ggml_backend_event_t event);
    GGML_API void                   ggml_backend_event_wait       (ggml_backend_t backend, ggml_backend_event_t event);
    //
-    // Events
+    // CPU backend
    //
-    GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device);
+    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
    GGML_API void                 ggml_backend_event_free(ggml_backend_event_t event);
    GGML_API void                 ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend);
    GGML_API void                 ggml_backend_event_synchronize(ggml_backend_event_t event);
    GGML_API void                 ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event);
-    //
+    GGML_API GGML_CALL bool ggml_backend_is_cpu                (ggml_backend_t backend);
-    // Backend device
+    GGML_API           void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
-    //
+    GGML_API           void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
-    enum ggml_backend_dev_type {
+    // Create a backend buffer from an existing pointer
-        GGML_BACKEND_DEVICE_TYPE_CPU,
+    GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
        GGML_BACKEND_DEVICE_TYPE_GPU,
        // devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
        GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
        GGML_BACKEND_DEVICE_TYPE_GPU_FULL
    };
-    // functionality supported by the device
+    GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
    struct ggml_backend_dev_caps {
        // asynchronous operations
        bool async;
        // pinned host buffer
        bool host_buffer;
        // event synchronization
        bool events;
    };
-    // all the device properties
+#ifdef GGML_USE_CPU_HBM
-    struct ggml_backend_dev_props {
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
-        const char * name;
+#endif
        const char * description;
        size_t memory_free;
        size_t memory_total;
        enum ggml_backend_dev_type type;
        struct ggml_backend_dev_caps caps;
    };
    GGML_API const char *                  ggml_backend_dev_name(ggml_backend_dev_t device);
    GGML_API const char *                  ggml_backend_dev_description(ggml_backend_dev_t device);
    GGML_API void                          ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total);
    GGML_API enum ggml_backend_dev_type    ggml_backend_dev_type(ggml_backend_dev_t device);
    GGML_API void                          ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
    GGML_API ggml_backend_reg_t            ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
    GGML_API ggml_backend_t                ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
    GGML_API ggml_backend_buffer_type_t    ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
    GGML_API ggml_backend_buffer_type_t    ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
    GGML_API ggml_backend_buffer_t         ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
    GGML_API bool                          ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
    GGML_API bool                          ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft);
    GGML_API bool                          ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
    //
    // Backend (reg)
    //
    GGML_API const char *       ggml_backend_reg_name(ggml_backend_reg_t reg);
    GGML_API size_t             ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
    GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
    GGML_API void *             ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
    // Functions that may be obtained using ggml_backend_reg_get_proc_address
    typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
    //
    // Backend registry
    //
-    // Backend (reg) enumeration
+    // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
    GGML_API size_t             ggml_backend_reg_count(void);
    GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
    GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name);
-    // Device enumeration
+    GGML_API size_t                     ggml_backend_reg_get_count(void);
-    GGML_API size_t             ggml_backend_dev_count(void);
+    GGML_API size_t                     ggml_backend_reg_find_by_name(const char * name);
-    GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
+    GGML_API ggml_backend_t             ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
-    GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
+    GGML_API const char *               ggml_backend_reg_get_name(size_t i);
-    GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);
+    GGML_API ggml_backend_t             ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
-
+    GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
-    // Direct backend (stream) initialization
+    GGML_API ggml_backend_buffer_t      ggml_backend_reg_alloc_buffer(size_t i, size_t size);
    // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
    GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
    // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
    GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
    // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
    GGML_API ggml_backend_t ggml_backend_init_best(void);
    //
    // Backend scheduler
    //
-    // The backend scheduler allows for multiple backend devices to be used together
+    // The backend scheduler allows for multiple backends to be used together
    // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
    // The backends are selected based on:
    // - the backend that supports the operation
@ -231,9 +164,9 @@ extern "C" {
    }
    */
    struct ggml_backend_sched;
    typedef struct ggml_backend_sched * ggml_backend_sched_t;
    // Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback)
    // when ask == true, the scheduler wants to know if the user wants to observe this node
    // this allows the scheduler to batch nodes together in order to evaluate them in a single call
    //
@ -247,7 +180,7 @@ extern "C" {
    GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);
    // Initialize backend buffers from a measure graph
-    GGML_API bool                 ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
+    GGML_API bool                 ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
    GGML_API int                  ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
    GGML_API ggml_backend_t       ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
@ -262,7 +195,7 @@ extern "C" {
    GGML_API ggml_backend_t       ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
    // Allocate and compute graph on the backend scheduler
-    GGML_API bool                 ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
+    GGML_API bool                 ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
    GGML_API enum ggml_status     ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
    GGML_API enum ggml_status     ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
    GGML_API void                 ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
@ -288,7 +221,7 @@ extern "C" {
    GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
    GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
-    typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+    typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
    // Compare the output of two backends
    GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
@ -297,26 +230,6 @@ extern "C" {
    GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
    GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
    //
    // CPU backend
    //
    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
    GGML_API bool ggml_backend_is_cpu                (ggml_backend_t backend);
    GGML_API void ggml_backend_cpu_set_n_threads     (ggml_backend_t backend_cpu, int n_threads);
    GGML_API void ggml_backend_cpu_set_threadpool    (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
    GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
    // Create a backend buffer from an existing pointer
    GGML_API ggml_backend_buffer_t      ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
    GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
 #ifdef GGML_USE_CPU_HBM
    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
 #endif
 #ifdef  __cplusplus
 }
--- a/ggml/include/ggml-blas.h
+++ b/ggml/include/ggml-blas.h
@ -9,13 +9,13 @@ extern "C" {
 #endif
 // backend API
-GGML_API ggml_backend_t ggml_backend_blas_init(void);
+GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void);
-GGML_API bool ggml_backend_is_blas(ggml_backend_t backend);
+GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend);
 // number of threads used for conversion to float
 // for openblas and blis, this will also set the number of threads used for blas operations
-GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
+GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
 #ifdef  __cplusplus
--- a/ggml/include/ggml-cann.h
+++ b/ggml/include/ggml-cann.h
@ -1,121 +0,0 @@
 /*
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
 #pragma once
 #include "ggml-backend.h"
 #include "ggml.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 /**
 * @brief Maximum number of CANN devices supported.
 */
 #define GGML_CANN_MAX_DEVICES 16
 /**
 * @brief Initializes the CANN backend for a specified device.
 *
 * This function initializes the CANN backend for the given device.
 * It verifies the device index, allocates a context, and creates a backend
 * instance.
 *
 * @param device The index of the device to initialize.
 * @return A pointer to the initialized backend instance, or nullptr on failure.
 */
 GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
 /**
 * @brief Checks if a given backend is a CANN backend.
 *
 * This function verifies if the provided backend is a CANN backend by comparing
 * its GUID with the CANN backend's GUID.
 *
 * @param backend The backend instance to check.
 * @return True if the backend is a CANN backend, false otherwise.
 */
 GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
 /**
 * @brief Retrieves the CANN buffer type for a specified device.
 *
 * This function initializes and returns the buffer type interface associated
 * with the given device. It ensures thread-safe access using a mutex.
 *
 * @param device The device index for which to retrieve the buffer type.
 * @return A pointer to the buffer type interface for the specified device, or
 * nullptr if the device index is out of range.
 */
 GGML_API ggml_backend_buffer_type_t
 ggml_backend_cann_buffer_type(int32_t device);
 /**
 * @brief Retrieves the number of CANN devices available.
 *
 * This function returns the number of CANN devices available based on
 * information obtained from `ggml_cann_info()`.
 *
 * @return The number of CANN devices available.
 */
 GGML_API int32_t ggml_backend_cann_get_device_count(void);
 /**
 * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
 *
 * @return A pointer to the host buffer type interface.
 */
 GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
 /**
 * @brief Retrieves the description of a specific CANN device.
 *
 * This function sets the specified device, retrieves the SoC name,
 * and writes it into the provided description buffer.
 *
 * @param device The device index to retrieve the description for.
 * @param description Pointer to a buffer where the description will be written.
 * @param description_size Size of the description buffer.
 */
 GGML_API void ggml_backend_cann_get_device_description(
    int32_t device, char* description, size_t description_size);
 /**
 * @brief Retrieves the memory information of a specific CANN device.
 *
 * This function sets the specified device, retrieves the free and total
 * memory information of the specified type (ACL_HBM_MEM), and stores them
 * in the provided pointers.
 *
 * @param device The device index to retrieve memory information for.
 * @param free Pointer to a variable where the free memory size will be stored.
 * @param total Pointer to a variable where the total memory size will be
 * stored.
 */
 GGML_API void ggml_backend_cann_get_device_memory(int32_t device,
                                                  size_t* free,
                                                  size_t* total);
 #ifdef __cplusplus
 }
 #endif
--- a/ggml/include/ggml-cuda.h
+++ b/ggml/include/ggml-cuda.h
@ -3,45 +3,42 @@
 #include "ggml.h"
 #include "ggml-backend.h"
 #ifdef  __cplusplus
 extern "C" {
 #endif
 #ifdef GGML_USE_HIPBLAS
 #define GGML_CUDA_NAME "ROCm"
 #define GGML_CUBLAS_NAME "hipBLAS"
 #elif defined(GGML_USE_MUSA)
 #define GGML_CUDA_NAME "MUSA"
 #define GGML_CUBLAS_NAME "muBLAS"
 #else
 #define GGML_CUDA_NAME "CUDA"
 #define GGML_CUBLAS_NAME "cuBLAS"
 #endif
 #ifdef  __cplusplus
 extern "C" {
 #endif
 #define GGML_CUDA_MAX_DEVICES       16
 // backend API
-GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
+GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
-GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
+GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
 // device buffer
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
 // split tensor buffer that splits matrices by rows across multiple devices
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
-GGML_API int  ggml_backend_cuda_get_device_count(void);
+GGML_API GGML_CALL int  ggml_backend_cuda_get_device_count(void);
-GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
+GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
-GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
+GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
-GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
+GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
-GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
+GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
 GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
 GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
 #ifdef  __cplusplus
 }
 #endif
--- a/ggml/include/ggml-metal.h
+++ b/ggml/include/ggml-metal.h
@ -1,5 +1,3 @@
 // Note: this description is outdated
 //
 // An interface allowing to compute ggml_cgraph with Metal
 //
 // This is a fully functional interface that extends ggml with GPU support for Apple devices.
@ -27,6 +25,9 @@
 #include <stddef.h>
 #include <stdbool.h>
 // max memory buffers that can be mapped to the device
 #define GGML_METAL_MAX_BUFFERS 64
 struct ggml_tensor;
 struct ggml_cgraph;
@ -39,15 +40,17 @@ extern "C" {
 // user-code should use only these functions
 //
 GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
 GGML_API ggml_backend_t ggml_backend_metal_init(void);
 GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
-GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
+GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
-GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
+GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
-GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
 // helper to check if the device supports a specific family
 // ideally, the user code should be doing these checks
@ -60,3 +63,4 @@ GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
 #ifdef __cplusplus
 }
 #endif
--- a/ggml/include/ggml-rpc.h
+++ b/ggml/include/ggml-rpc.h
@ -10,14 +10,14 @@ extern "C" {
 #define GGML_RPC_MAX_SERVERS       16
 // backend API
-GGML_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
+GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
-GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend);
+GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);
-GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
-GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
+GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
-GGML_API void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
+GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
 #ifdef  __cplusplus
 }
--- a/ggml/include/ggml-sycl.h
+++ b/ggml/include/ggml-sycl.h
@ -23,20 +23,20 @@ GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
 // split tensor buffer that splits matrices by rows across multiple devices
-GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
-GGML_API void ggml_backend_sycl_print_sycl_devices(void);
+GGML_API void   ggml_backend_sycl_print_sycl_devices(void);
-GGML_API void ggml_sycl_get_gpu_list(int *id_list, int max_len);
+GGML_API GGML_CALL void   ggml_sycl_get_gpu_list(int *id_list, int max_len);
-GGML_API void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
+GGML_API GGML_CALL void   ggml_sycl_get_device_description(int device, char *description, size_t description_size);
-GGML_API int  ggml_backend_sycl_get_device_count();
+GGML_API GGML_CALL int   ggml_backend_sycl_get_device_count();
-GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
+GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
 // SYCL doesn't support registering host memory, keep here for reference
-// GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
+// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
-// GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
+// GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
 #ifdef  __cplusplus
 }
 #endif
--- a/ggml/include/ggml-vulkan.h
+++ b/ggml/include/ggml-vulkan.h
@ -13,16 +13,16 @@ extern "C" {
 GGML_API void ggml_vk_instance_init(void);
 // backend API
-GGML_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
+GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
-GGML_API bool ggml_backend_is_vk(ggml_backend_t backend);
+GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
-GGML_API int  ggml_backend_vk_get_device_count(void);
+GGML_API GGML_CALL int  ggml_backend_vk_get_device_count(void);
-GGML_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
+GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
-GGML_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
+GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
-GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
-GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
+GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
 #ifdef  __cplusplus
 }
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@ -187,6 +187,16 @@
 #    define GGML_API
 #endif
 #ifdef GGML_MULTIPLATFORM
 #    if defined(_WIN32)
 #        define GGML_CALL
 #    else
 #        define GGML_CALL __attribute__((__ms_abi__))
 #    endif
 #else
 #    define GGML_CALL
 #endif
 // TODO: support for clang
 #ifdef __GNUC__
 #    define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
@ -210,7 +220,7 @@
 #include <stdio.h>
 #define GGML_FILE_MAGIC   0x67676d6c // "ggml"
-#define GGML_FILE_VERSION 2
+#define GGML_FILE_VERSION 1
 #define GGML_QNT_VERSION        2    // bump this on quantization format changes
 #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
@ -219,16 +229,12 @@
 #define GGML_MAX_PARAMS         2048
 #define GGML_MAX_CONTEXTS       64
 #define GGML_MAX_SRC            10
 #define GGML_MAX_N_THREADS      512
 #define GGML_MAX_OP_PARAMS      64
 #ifndef GGML_MAX_NAME
-#   define GGML_MAX_NAME        64
+#define GGML_MAX_NAME           64
 #endif
-
+#define GGML_MAX_OP_PARAMS      64
 #define GGML_DEFAULT_N_THREADS  4
 #define GGML_DEFAULT_GRAPH_SIZE 2048
 #if UINTPTR_MAX == 0xFFFFFFFF
    #define GGML_MEM_ALIGN 4
 #else
@ -238,8 +244,6 @@
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1
 #define GGML_ROPE_TYPE_NEOX 2
 #define GGUF_MAGIC "GGUF"
 #define GGUF_VERSION 3
@ -250,27 +254,26 @@
 #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
 #define GGML_ASSERT(x) \
    do { \
        if (!(x)) { \
            fflush(stdout); \
            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
            ggml_print_backtrace(); \
            abort(); \
        } \
    } while (0)
 #ifndef NDEBUG
-#   define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
+#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
 #elif defined(__GNUC__)
-#   define GGML_UNREACHABLE() __builtin_unreachable()
+#define GGML_UNREACHABLE() __builtin_unreachable()
 #elif defined(_MSC_VER)
-#   define GGML_UNREACHABLE() __assume(0)
+#define GGML_UNREACHABLE() __assume(0)
 #else
-#   define GGML_UNREACHABLE() ((void) 0)
+#define GGML_UNREACHABLE() ((void) 0)
 #endif
 #ifdef __cplusplus
 #   define GGML_NORETURN [[noreturn]]
 #elif defined(_MSC_VER)
 #   define GGML_NORETURN __declspec(noreturn)
 #else
 #   define GGML_NORETURN _Noreturn
 #endif
 #define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
 #define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
 // used to copy the number of elements and stride in bytes of tensors into local variables.
 // main purpose is to reduce code duplication and improve readability.
 //
@ -319,9 +322,6 @@
 extern "C" {
 #endif
    GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
    GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
    enum ggml_status {
        GGML_STATUS_ALLOC_FAILED = -2,
        GGML_STATUS_FAILED = -1,
@ -330,7 +330,7 @@ extern "C" {
    };
    // get ggml_status name string
-    GGML_API const char * ggml_status_to_string(enum ggml_status status);
+    GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
    // ieee 754-2008 half-precision float16
    // todo: make this not an integral type
@ -345,12 +345,10 @@ extern "C" {
    GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
    GGML_API float       ggml_bf16_to_fp32(ggml_bf16_t);  // consider just doing << 16
    GGML_API void        ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
    GGML_API void        ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t);
    GGML_API void        ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
    struct ggml_object;
    struct ggml_context;
    struct ggml_cgraph;
    // NOTE: always add types at the end of the enum to keep backward compatibility
    enum ggml_type {
@ -385,11 +383,6 @@ extern "C" {
        GGML_TYPE_F64     = 28,
        GGML_TYPE_IQ1_M   = 29,
        GGML_TYPE_BF16    = 30,
        GGML_TYPE_Q4_0_4_4 = 31,
        GGML_TYPE_Q4_0_4_8 = 32,
        GGML_TYPE_Q4_0_8_8 = 33,
        GGML_TYPE_TQ1_0   = 34,
        GGML_TYPE_TQ2_0   = 35,
        GGML_TYPE_COUNT,
    };
@ -431,9 +424,6 @@ extern "C" {
        GGML_FTYPE_MOSTLY_IQ4_XS  = 22, // except 1d tensors
        GGML_FTYPE_MOSTLY_IQ1_M   = 23, // except 1d tensors
        GGML_FTYPE_MOSTLY_BF16    = 24, // except 1d tensors
        GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
        GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
        GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
    };
    // available tensor operations:
@ -450,13 +440,10 @@ extern "C" {
        GGML_OP_SQR,
        GGML_OP_SQRT,
        GGML_OP_LOG,
        GGML_OP_SIN,
        GGML_OP_COS,
        GGML_OP_SUM,
        GGML_OP_SUM_ROWS,
        GGML_OP_MEAN,
        GGML_OP_ARGMAX,
        GGML_OP_COUNT_EQUAL,
        GGML_OP_REPEAT,
        GGML_OP_REPEAT_BACK,
        GGML_OP_CONCAT,
@ -490,11 +477,9 @@ extern "C" {
        GGML_OP_CLAMP,
        GGML_OP_CONV_TRANSPOSE_1D,
        GGML_OP_IM2COL,
        GGML_OP_IM2COL_BACK,
        GGML_OP_CONV_TRANSPOSE_2D,
        GGML_OP_POOL_1D,
        GGML_OP_POOL_2D,
        GGML_OP_POOL_2D_BACK,
        GGML_OP_UPSCALE, // nearest interpolate
        GGML_OP_PAD,
        GGML_OP_ARANGE,
@ -510,7 +495,6 @@ extern "C" {
        GGML_OP_WIN_UNPART,
        GGML_OP_GET_REL_POS,
        GGML_OP_ADD_REL_POS,
        GGML_OP_RWKV_WKV,
        GGML_OP_UNARY,
@ -527,7 +511,6 @@ extern "C" {
        GGML_OP_CROSS_ENTROPY_LOSS,
        GGML_OP_CROSS_ENTROPY_LOSS_BACK,
        GGML_OP_OPT_STEP_ADAMW,
        GGML_OP_COUNT,
    };
@ -546,7 +529,6 @@ extern "C" {
        GGML_UNARY_OP_SILU,
        GGML_UNARY_OP_HARDSWISH,
        GGML_UNARY_OP_HARDSIGMOID,
        GGML_UNARY_OP_EXP,
        GGML_UNARY_OP_COUNT,
    };
@ -558,25 +540,35 @@ extern "C" {
    };
    enum ggml_log_level {
-        GGML_LOG_LEVEL_NONE  = 0,
+        GGML_LOG_LEVEL_ERROR = 2,
-        GGML_LOG_LEVEL_INFO  = 1,
+        GGML_LOG_LEVEL_WARN  = 3,
-        GGML_LOG_LEVEL_WARN  = 2,
+        GGML_LOG_LEVEL_INFO  = 4,
-        GGML_LOG_LEVEL_ERROR = 3,
+        GGML_LOG_LEVEL_DEBUG = 5
        GGML_LOG_LEVEL_DEBUG = 4,
        GGML_LOG_LEVEL_CONT  = 5, // continue previous log
    };
    // this tensor...
    enum ggml_tensor_flag {
-        GGML_TENSOR_FLAG_INPUT  =  1, // ...is an input for the GGML compute graph
+        GGML_TENSOR_FLAG_INPUT  = 1,
-        GGML_TENSOR_FLAG_OUTPUT =  2, // ...is an output for the GGML compute graph
+        GGML_TENSOR_FLAG_OUTPUT = 2,
-        GGML_TENSOR_FLAG_PARAM  =  4, // ...contains trainable parameters
+        GGML_TENSOR_FLAG_PARAM  = 4,
        GGML_TENSOR_FLAG_LOSS   =  8, // ...defines loss for numerical optimization (multiple loss tensors add up)
    };
    // ggml object
    struct ggml_object {
        size_t offs;
        size_t size;
        struct ggml_object * next;
        enum ggml_object_type type;
        char padding[4];
    };
    static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
    // n-dimensional tensor
    struct ggml_tensor {
-        enum ggml_type type;
+        enum ggml_type         type;
        GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
@ -619,29 +611,6 @@ extern "C" {
    // If it returns true, the computation is aborted
    typedef bool (*ggml_abort_callback)(void * data);
    // Scheduling priorities
    enum ggml_sched_priority {
        GGML_SCHED_PRIO_NORMAL,
        GGML_SCHED_PRIO_MEDIUM,
        GGML_SCHED_PRIO_HIGH,
        GGML_SCHED_PRIO_REALTIME
    };
    // Threadpool params
    // Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
    struct ggml_threadpool_params {
        bool                cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
        int                 n_threads;                   // number of threads
        enum ggml_sched_priority prio;                   // thread priority
        uint32_t            poll;                        // polling level (0 - no polling, 100 - aggressive polling)
        bool                strict_cpu;                  // strict cpu placement
        bool                paused;                      // start in paused state
    };
    struct ggml_threadpool;     // forward declaration, see ggml.c
    typedef struct ggml_threadpool * ggml_threadpool_t;
    // the compute plan that needs to be prepared for ggml_graph_compute()
    // since https://github.com/ggerganov/ggml/issues/287
    struct ggml_cplan {
@ -649,13 +618,38 @@ extern "C" {
        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
        int n_threads;
        struct ggml_threadpool * threadpool;
        // abort ggml_graph_compute when true
        ggml_abort_callback abort_callback;
        void *              abort_callback_data;
    };
    enum ggml_cgraph_eval_order {
        GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
        GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
        GGML_CGRAPH_EVAL_ORDER_COUNT
    };
    struct ggml_hash_set {
        size_t size;
        struct ggml_tensor ** keys;
    };
    // computation graph
    struct ggml_cgraph {
        int size;
        int n_nodes;
        int n_leafs;
        struct ggml_tensor ** nodes;
        struct ggml_tensor ** grads;
        struct ggml_tensor ** leafs;
        struct ggml_hash_set visited_hash_table;
        enum ggml_cgraph_eval_order order;
    };
    // scratch buffer
    struct ggml_scratch {
        size_t offs;
@ -698,6 +692,8 @@ extern "C" {
    GGML_API int64_t ggml_cycles(void);
    GGML_API int64_t ggml_cycles_per_ms(void);
    GGML_API void    ggml_print_backtrace(void);
    // accepts a UTF-8 path, even on Windows
    GGML_API FILE *  ggml_fopen(const char * fname, const char * mode);
@ -707,52 +703,50 @@ extern "C" {
    GGML_API void    ggml_print_object (const struct ggml_object * obj);
    GGML_API void    ggml_print_objects(const struct ggml_context * ctx);
-    GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL int64_t ggml_nelements   (const struct ggml_tensor * tensor);
-    GGML_API int64_t ggml_nrows     (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL int64_t ggml_nrows       (const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes    (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes_pad(const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
+    GGML_API           size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
-    GGML_API int64_t ggml_blck_size(enum ggml_type type);
+    GGML_API GGML_CALL int    ggml_blck_size(enum ggml_type type);
-    GGML_API size_t  ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
+    GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
-    GGML_API size_t  ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+    GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
    GGML_DEPRECATED(
    GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
    "use ggml_row_size() instead");
-    GGML_API const char * ggml_type_name(enum ggml_type type);
+    GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
-    GGML_API const char * ggml_op_name  (enum ggml_op   op);
+    GGML_API GGML_CALL const char * ggml_op_name  (enum ggml_op   op);
-    GGML_API const char * ggml_op_symbol(enum ggml_op   op);
+    GGML_API           const char * ggml_op_symbol(enum ggml_op   op);
-    GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
+    GGML_API           const char * ggml_unary_op_name(enum ggml_unary_op op);
-    GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
+    GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
-    GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL size_t  ggml_element_size(const struct ggml_tensor * tensor);
-    GGML_API bool    ggml_is_quantized(enum ggml_type type);
+    GGML_API GGML_CALL bool    ggml_is_quantized(enum ggml_type type);
    // TODO: temporary until model loading of ggml examples is refactored
    GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
-    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL bool ggml_is_permuted  (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_empty     (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL bool ggml_is_empty     (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_scalar    (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_scalar    (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_vector    (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_vector    (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_matrix    (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_matrix    (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
+    GGML_API           bool ggml_is_3d        (const struct ggml_tensor * tensor);
-    GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
+    GGML_API           int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
-    GGML_API bool ggml_is_contiguous  (const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL bool ggml_is_contiguous  (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
+    GGML_API GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
-    GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
+    GGML_API GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
-    GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
+    GGML_API GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
    GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
    GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
    GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
    // use this to compute the memory overhead of a tensor
    GGML_API size_t ggml_tensor_overhead(void);
@ -838,7 +832,7 @@ extern "C" {
    GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
    GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
-    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
+    GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
    GGML_API const char *         ggml_get_name   (const struct ggml_tensor * tensor);
    GGML_API struct ggml_tensor * ggml_set_name   (      struct ggml_tensor * tensor, const char * name);
@ -959,22 +953,6 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    GGML_API struct ggml_tensor * ggml_sin(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    GGML_API struct ggml_tensor * ggml_sin_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    GGML_API struct ggml_tensor * ggml_cos(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    GGML_API struct ggml_tensor * ggml_cos_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    // return scalar
    GGML_API struct ggml_tensor * ggml_sum(
            struct ggml_context * ctx,
@ -995,12 +973,6 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    // count number of equal elements in a and b
    GGML_API struct ggml_tensor * ggml_count_equal(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);
    // if a is the same shape as b, and a is not a parameter, return a
    // otherwise, return a new tensor: repeat(a) to fit in b
    GGML_API struct ggml_tensor * ggml_repeat(
@ -1131,14 +1103,6 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    GGML_API struct ggml_tensor * ggml_exp(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    GGML_API struct ggml_tensor * ggml_exp_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    // normalize along rows
    GGML_API struct ggml_tensor * ggml_norm(
            struct ggml_context * ctx,
@ -1162,17 +1126,16 @@ extern "C" {
    // group normalize along ne0*ne1*n_groups
    // used in stable-diffusion
    // TODO: eps is hardcoded to 1e-6 for now
    GGML_API struct ggml_tensor * ggml_group_norm(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            int                   n_groups,
+            int                   n_groups);
            float                 eps);
    GGML_API struct ggml_tensor * ggml_group_norm_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
-            int                   n_groups,
+            int                   n_groups);
            float                 eps);
    // a - x
    // b - dy
@ -1234,7 +1197,7 @@ extern "C" {
            size_t                nb1,
            size_t                nb2,
            size_t                nb3,
-            size_t                offset); // in bytes
+            size_t                offset);
    // b -> view(a,offset,nb1,nb2,3), return view(a)
    GGML_API struct ggml_tensor * ggml_set_inplace(
@ -1244,19 +1207,19 @@ extern "C" {
            size_t                nb1,
            size_t                nb2,
            size_t                nb3,
-            size_t                offset); // in bytes
+            size_t                offset);
    GGML_API struct ggml_tensor * ggml_set_1d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
-            size_t                offset); // in bytes
+            size_t                offset);
    GGML_API struct ggml_tensor * ggml_set_1d_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
-            size_t                offset); // in bytes
+            size_t                offset);
    // b -> view(a,offset,nb1,nb2,3), return modified a
    GGML_API struct ggml_tensor * ggml_set_2d(
@ -1264,7 +1227,7 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
            size_t                nb1,
-            size_t                offset); // in bytes
+            size_t                offset);
    // b -> view(a,offset,nb1,nb2,3), return view(a)
    GGML_API struct ggml_tensor * ggml_set_2d_inplace(
@ -1272,7 +1235,7 @@ extern "C" {
            struct ggml_tensor  * a,
            struct ggml_tensor  * b,
            size_t                nb1,
-            size_t                offset); // in bytes
+            size_t                offset);
    // a -> b, return view(b)
    GGML_API struct ggml_tensor * ggml_cpy(
@ -1407,14 +1370,14 @@ extern "C" {
    // supports 3D: a->ne[2] == b->ne[1]
    GGML_API struct ggml_tensor * ggml_get_rows(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,  // data
+            struct ggml_tensor  * a,
-            struct ggml_tensor  * b); // row indices
+            struct ggml_tensor  * b);
    GGML_API struct ggml_tensor * ggml_get_rows_back(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,  // gradients of ggml_get_rows result
+            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,  // row indices
+            struct ggml_tensor  * b,
-            struct ggml_tensor  * c); // data for ggml_get_rows, only used for its shape
+            struct ggml_tensor  * c);
    GGML_API struct ggml_tensor * ggml_diag(
        struct ggml_context     * ctx,
@ -1475,10 +1438,11 @@ extern "C" {
            struct ggml_tensor  * b);
    // rotary position embedding
-    // if (mode & 1) - skip n_past elements (NOT SUPPORTED)
+    // if (mode & 1) != 0, skip n_past elements (NOT SUPPORTED)
-    // if (mode & GGML_ROPE_TYPE_NEOX) - GPT-NeoX style
+    // if (mode & 2) != 0, GPT-NeoX style
    //
    // b is an int32 vector with size a->ne[2], it contains the positions
    // c is freq factors (e.g. phi3-128k), (optional)
    GGML_API struct ggml_tensor * ggml_rope(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -1495,7 +1459,6 @@ extern "C" {
            int                   mode);
    // custom RoPE
    // c is freq factors (e.g. phi3-128k), (optional)
    GGML_API struct ggml_tensor * ggml_rope_ext(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
@ -1558,16 +1521,16 @@ extern "C" {
        "use ggml_rope_ext_inplace instead");
    // compute correction dims for YaRN RoPE scaling
-    void ggml_rope_yarn_corr_dims(
+    GGML_CALL void ggml_rope_yarn_corr_dims(
        int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
    // rotary position embedding backward, i.e. compute dx from dy
    // a - dy
    GGML_API struct ggml_tensor * ggml_rope_back(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a, // gradients of ggml_rope result
+            struct ggml_tensor  * a,
-            struct ggml_tensor  * b, // positions
+            struct ggml_tensor  * b,
-            struct ggml_tensor  * c, // freq factors
+            struct ggml_tensor  * c,
            int                   n_dims,
            int                   mode,
            int                   n_ctx_orig,
@ -1586,49 +1549,34 @@ extern "C" {
            float                 min,
            float                 max);
    // im2col
    // converts data into a format that effectively results in a convolution when combined with matrix multiplication
    GGML_API struct ggml_tensor * ggml_im2col(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,  // convolution kernel
+            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,  // data
+            struct ggml_tensor  * b,
-            int                   s0, // stride dimension 0
+            int                  s0,
-            int                   s1, // stride dimension 1
+            int                  s1,
-            int                   p0, // padding dimension 0
+            int                  p0,
-            int                   p1, // padding dimension 1
+            int                  p1,
-            int                   d0, // dilation dimension 0
+            int                  d0,
-            int                   d1, // dilation dimension 1
+            int                  d1,
-            bool                  is_2D,
+            bool                 is_2D,
-            enum ggml_type        dst_type);
+            enum ggml_type       dst_type);
    GGML_API struct ggml_tensor * ggml_im2col_back(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,  // convolution kernel
        struct ggml_tensor  * b,  // gradient of im2col output
        int64_t             * ne, // shape of im2col input
        int                   s0, // stride dimension 0
        int                   s1, // stride dimension 1
        int                   p0, // padding dimension 0
        int                   p1, // padding dimension 1
        int                   d0, // dilation dimension 0
        int                   d1, // dilation dimension 1
        bool                  is_2D);
    GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,  // convolution kernel
+            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,  // data
+            struct ggml_tensor  * b,
-            int                  s0,  // stride dimension 0
+            int                  s0,
-            int                  s1,  // stride dimension 1
+            int                  s1,
-            int                  p0,  // padding dimension 0
+            int                  p0,
-            int                  p1,  // padding dimension 1
+            int                  p1,
-            int                  d0,  // dilation dimension 0
+            int                  d0,
-            int                  d1); // dilation dimension 1
+            int                  d1);
    GGML_API struct ggml_tensor * ggml_conv_1d(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,   // convolution kernel
+            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,   // data
+            struct ggml_tensor  * b,
            int                   s0,  // stride
            int                   p0,  // padding
            int                   d0); // dilation
@ -1637,29 +1585,29 @@ extern "C" {
    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,  // convolution kernel
+            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,  // data
+            struct ggml_tensor  * b,
-            int                   s,  // stride
+            int                   s,
-            int                   d); // dilation
+            int                   d);
    GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,   // convolution kernel
+            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,   // data
+            struct ggml_tensor  * b,
-            int                   s0,  // stride
+            int                   s0,
-            int                   p0,  // padding
+            int                   p0,
-            int                   d0); // dilation
+            int                   d0);
    GGML_API struct ggml_tensor * ggml_conv_2d(
            struct ggml_context * ctx,
-            struct ggml_tensor  * a,   // convolution kernel
+            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,   // data
+            struct ggml_tensor  * b,
-            int                   s0,  // stride dimension 0
+            int                   s0,
-            int                   s1,  // stride dimension 1
+            int                   s1,
-            int                   p0,  // padding dimension 0
+            int                   p0,
-            int                   p1,  // padding dimension 1
+            int                   p1,
-            int                   d0,  // dilation dimension 0
+            int                   d0,
-            int                   d1); // dilation dimension 1
+            int                   d1);
    // kernel size is a->ne[0] x a->ne[1]
@ -1721,18 +1669,6 @@ extern "C" {
            float                 p0,
            float                 p1);
    GGML_API struct ggml_tensor * ggml_pool_2d_back(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * af, // "a"/input used in forward pass
            enum ggml_op_pool     op,
            int                   k0,
            int                   k1,
            int                   s0,
            int                   s1,
            float                 p0,
            float                 p1);
    // nearest interpolate
    // multiplies ne0 and ne1 by scale factor
    // used in stable-diffusion
@ -1807,8 +1743,7 @@ extern "C" {
            struct ggml_tensor  * v,
            struct ggml_tensor  * mask,
            float                 scale,
-            float                 max_bias,
+            float                 max_bias);
            float                 logit_softcap);
    GGML_API void ggml_flash_attn_ext_set_prec(
            struct ggml_tensor * a,
@ -1825,8 +1760,10 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_ssm_conv(
            struct ggml_context * ctx,
-            struct ggml_tensor  * sx,
+            struct ggml_tensor  * s,
-            struct ggml_tensor  * c);
+            struct ggml_tensor  * x,
            struct ggml_tensor  * c,
            struct ggml_tensor  * sq);
    GGML_API struct ggml_tensor * ggml_ssm_scan(
            struct ggml_context * ctx,
@ -1835,7 +1772,8 @@ extern "C" {
            struct ggml_tensor  * dt,
            struct ggml_tensor  * A,
            struct ggml_tensor  * B,
-            struct ggml_tensor  * C);
+            struct ggml_tensor  * C,
            struct ggml_tensor  * sq);
    // partition into non-overlapping windows with padding if needed
    // example:
@ -1887,15 +1825,6 @@ extern "C" {
            struct ggml_tensor  * pw,
            struct ggml_tensor  * ph);
    GGML_API struct ggml_tensor * ggml_rwkv_wkv(
            struct ggml_context * ctx,
            struct ggml_tensor  * k,
            struct ggml_tensor  * v,
            struct ggml_tensor  * r,
            struct ggml_tensor  * tf,
            struct ggml_tensor  * td,
            struct ggml_tensor  * state);
    // custom operators
    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
@ -1979,8 +1908,7 @@ extern "C" {
    typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
    typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
-#define GGML_N_TASKS_MAX (-1)
+    #define GGML_N_TASKS_MAX -1
    // n_tasks == GGML_N_TASKS_MAX means to use max number of tasks
    GGML_API struct ggml_tensor * ggml_map_custom1(
            struct ggml_context   * ctx,
@ -2033,84 +1961,44 @@ extern "C" {
    // loss function
    GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
-            struct ggml_context * ctx,
+            struct ggml_context         * ctx,
-            struct ggml_tensor  * a,  // logits
+            struct ggml_tensor          * a,
-            struct ggml_tensor  * b); // labels
+            struct ggml_tensor          * b);
    GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
-            struct ggml_context * ctx,
+            struct ggml_context         * ctx,
-            struct ggml_tensor  * a,  // logits
+            struct ggml_tensor          * a,
-            struct ggml_tensor  * b,  // labels
+            struct ggml_tensor          * b,
-            struct ggml_tensor  * c); // gradients of cross_entropy_loss result
+            struct ggml_tensor          * c);
    // AdamW optimizer step
    // Paper: https://arxiv.org/pdf/1711.05101v3.pdf
    // PyTorch: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html
    GGML_API struct ggml_tensor * ggml_opt_step_adamw(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * grad,
            float                 alpha,
            float                 beta1,
            float                 beta2,
            float                 eps,
            float                 wd); // weight decay
    //
    // automatic differentiation
    //
-    GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API void ggml_set_param(
-    GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
+            struct ggml_context * ctx,
            struct ggml_tensor  * tensor);
    GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
-    GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate);
+    GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
    GGML_API void ggml_build_opt_adamw(
            struct ggml_context * ctx,
            struct ggml_cgraph  * gf,
            struct ggml_cgraph  * gb,
            float                 alpha,
            float                 beta1,
            float                 beta2,
            float                 eps,
            float                 wd); // weight decay
    // graph allocation in a context
-    GGML_API struct ggml_cgraph * ggml_new_graph       (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
+    GGML_API struct ggml_cgraph * ggml_new_graph         (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
-    GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
+    GGML_API struct ggml_cgraph * ggml_new_graph_custom  (struct ggml_context * ctx, size_t size, bool grads);
-    GGML_API struct ggml_cgraph * ggml_graph_dup       (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+    GGML_API struct ggml_cgraph * ggml_graph_dup         (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-    GGML_API void                 ggml_graph_cpy       (struct ggml_cgraph * src, struct ggml_cgraph * dst);
+    GGML_API struct ggml_cgraph   ggml_graph_view        (struct ggml_cgraph * cgraph, int i0, int i1);
-    GGML_API void                 ggml_graph_reset     (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
+    GGML_API void                 ggml_graph_cpy         (struct ggml_cgraph * src, struct ggml_cgraph * dst);
-    GGML_API void                 ggml_graph_clear     (struct ggml_cgraph * cgraph);
+    GGML_API void                 ggml_graph_reset       (struct ggml_cgraph * cgraph);  // zero grads
-
+    GGML_API void                 ggml_graph_clear       (struct ggml_cgraph * cgraph);
    GGML_API int                   ggml_graph_size   (struct ggml_cgraph * cgraph);
    GGML_API struct ggml_tensor *  ggml_graph_node   (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
    GGML_API struct ggml_tensor ** ggml_graph_nodes  (struct ggml_cgraph * cgraph);
    GGML_API int                   ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
    GGML_API void   ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
    GGML_API size_t ggml_graph_overhead(void);
    GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
    GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
    GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
    GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
    GGML_API struct ggml_threadpool *      ggml_threadpool_new          (struct ggml_threadpool_params  * params);
    GGML_API void                          ggml_threadpool_free         (struct ggml_threadpool * threadpool);
    GGML_API int                           ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
    GGML_API void                          ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
    GGML_API void                          ggml_threadpool_resume       (struct ggml_threadpool * threadpool);
    // ggml_graph_plan() has to be called before ggml_graph_compute()
    // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan(
+    GGML_API struct ggml_cplan ggml_graph_plan            (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-                  const struct ggml_cgraph * cgraph,
+    GGML_API enum ggml_status  ggml_graph_compute         (      struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
                                       int   n_threads, /* = GGML_DEFAULT_N_THREADS */
                    struct ggml_threadpool * threadpool /* = NULL */ );
    GGML_API enum ggml_status  ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
    // same as ggml_graph_compute() but the work data is allocated as a part of the context
    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
    GGML_API enum ggml_status  ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
@ -2174,10 +2062,6 @@ extern "C" {
    typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
    typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
    // Set callback for all future logging events.
    // If this is not called, or NULL is supplied, everything is output on stderr.
    GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
    // optimization parameters
    //
    //   see ggml.c (ggml_opt_default_params) for default values
@ -2503,16 +2387,10 @@ extern "C" {
    GGML_API int ggml_cpu_has_gpublas    (void);
    GGML_API int ggml_cpu_has_sse3       (void);
    GGML_API int ggml_cpu_has_ssse3      (void);
    GGML_API int ggml_cpu_has_riscv_v    (void);
    GGML_API int ggml_cpu_has_sycl       (void);
    GGML_API int ggml_cpu_has_rpc        (void);
    GGML_API int ggml_cpu_has_vsx        (void);
    GGML_API int ggml_cpu_has_matmul_int8(void);
    GGML_API int ggml_cpu_has_cann       (void);
    GGML_API int ggml_cpu_has_llamafile  (void);
    // get the sve vector length in bytes
    GGML_API int ggml_cpu_get_sve_cnt(void);
    //
    // Internal types and functions exposed for tests and benchmarks
@ -2526,31 +2404,20 @@ extern "C" {
 #endif
    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int64_t k);
-    typedef void (*ggml_from_float_to_mat_t)
+    typedef void (*ggml_vec_dot_t)   (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
-                                     (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
+                                      const void * GGML_RESTRICT y, size_t by, int nrc);
    typedef void (*ggml_vec_dot_t)  (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
                                       const void * GGML_RESTRICT y, size_t by, int nrc);
    typedef void (*ggml_gemv_t)     (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
                                       const void * GGML_RESTRICT y, int nr, int nc);
    typedef void (*ggml_gemm_t)     (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
                                       const void * GGML_RESTRICT y, int nr, int nc);
    typedef struct {
-        const char             * type_name;
+        const char      * type_name;
-        int64_t                  blck_size;
+        int               blck_size;
-        int64_t                  blck_size_interleave; // interleave elements in blocks
+        size_t            type_size;
-        size_t                   type_size;
+        bool              is_quantized;
-        bool                     is_quantized;
+        ggml_to_float_t   to_float;
-        ggml_to_float_t          to_float;
+        ggml_from_float_t from_float;
-        ggml_from_float_t        from_float;
+        ggml_from_float_t from_float_reference;
-        ggml_from_float_t        from_float_ref;
+        ggml_vec_dot_t    vec_dot;
-        ggml_from_float_to_mat_t from_float_to_mat;
+        enum ggml_type    vec_dot_type;
-        ggml_vec_dot_t           vec_dot;
+        int64_t           nrows; // number of rows to process simultaneously
        enum ggml_type           vec_dot_type;
        int64_t                  nrows; // number of rows to process simultaneously
        int64_t                  ncols; // number of columns to process simultaneously
        ggml_gemv_t              gemv;
        ggml_gemm_t              gemm;
    } ggml_type_traits_t;
    GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@ -26,9 +26,6 @@ if (NOT MSVC)
    endif()
 endif()
 unset(GGML_EXTRA_LIBS_PRIVATE)
 unset(GGML_EXTRA_LIBS_PUBLIC)
 if (APPLE AND GGML_ACCELERATE)
    find_library(ACCELERATE_FRAMEWORK Accelerate)
    if (ACCELERATE_FRAMEWORK)
@ -38,7 +35,7 @@ if (APPLE AND GGML_ACCELERATE)
        add_compile_definitions(ACCELERATE_NEW_LAPACK)
        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE ${ACCELERATE_FRAMEWORK})
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
    else()
        message(WARNING "Accelerate framework not found")
    endif()
@ -90,7 +87,7 @@ if (GGML_METAL)
            COMMENT "Generate assembly for embedded Metal library"
        )
-        list(APPEND GGML_SOURCES_METAL ${METALLIB_EMBED_ASM})
+        set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM})
    else()
        if (GGML_METAL_SHADER_DEBUG)
            # custom command to do the following:
@ -135,24 +132,13 @@ if (GGML_METAL)
            )
    endif() # GGML_METAL_EMBED_LIBRARY
-    list(APPEND GGML_EXTRA_LIBS_PRIVATE
+    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS}
        ${FOUNDATION_LIBRARY}
        ${METAL_FRAMEWORK}
        ${METALKIT_FRAMEWORK}
        )
 endif()
 if (GGML_MUSA)
    set(CMAKE_C_COMPILER clang)
    set(CMAKE_C_EXTENSIONS OFF)
    set(CMAKE_CXX_COMPILER clang++)
    set(CMAKE_CXX_EXTENSIONS OFF)
    set(GGML_CUDA ON)
    list(APPEND GGML_CDEF_PUBLIC GGML_USE_MUSA)
 endif()
 if (GGML_OPENMP)
    find_package(OpenMP)
    if (OpenMP_FOUND)
@ -160,12 +146,7 @@ if (GGML_OPENMP)
        add_compile_definitions(GGML_USE_OPENMP)
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
        if (GGML_MUSA)
            list(APPEND GGML_EXTRA_INCLUDES     "/usr/lib/llvm-10/include/openmp")
            list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-10/lib/libomp.so")
        endif()
    else()
        message(WARNING "OpenMP not found")
    endif()
@ -247,8 +228,8 @@ if (GGML_BLAS)
        set(GGML_HEADERS_BLAS ../include/ggml-blas.h)
        set(GGML_SOURCES_BLAS ggml-blas.cpp)
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE ${BLAS_LIBRARIES})
+        set(GGML_EXTRA_LIBS     ${GGML_EXTRA_LIBS}     ${BLAS_LIBRARIES})
-        list(APPEND GGML_EXTRA_INCLUDES     ${BLAS_INCLUDE_DIRS})
+        set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
    else()
        message(WARNING "BLAS not found, please refer to "
        "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
@ -257,24 +238,18 @@ if (GGML_BLAS)
 endif()
 if (GGML_LLAMAFILE)
-    message(STATUS "Using llamafile")
+    message(STATUS "Using ggml SGEMM")
    add_compile_definitions(GGML_USE_LLAMAFILE)
-    set(GGML_HEADERS_LLAMAFILE llamafile/sgemm.h)
+    set(GGML_HEADERS_LLAMAFILE sgemm.h)
-    set(GGML_SOURCES_LLAMAFILE llamafile/sgemm.cpp)
+    set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
 endif()
 if (GGML_CUDA)
    cmake_minimum_required(VERSION 3.18)  # for CMAKE_CUDA_ARCHITECTURES
-    if (GGML_MUSA)
+    find_package(CUDAToolkit)
        list(APPEND CMAKE_MODULE_PATH "/usr/local/musa/cmake/")
        find_package(MUSAToolkit)
        set(CUDAToolkit_FOUND ${MUSAToolkit_FOUND})
    else()
        find_package(CUDAToolkit)
    endif()
    if (CUDAToolkit_FOUND)
        message(STATUS "CUDA found")
@ -293,11 +268,7 @@ if (GGML_CUDA)
        endif()
        message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
-        if (GGML_MUSA)
+        enable_language(CUDA)
            set(CMAKE_CUDA_COMPILER ${MUSAToolkit_MCC_EXECUTABLE})
        else()
            enable_language(CUDA)
        endif()
        file(GLOB   GGML_HEADERS_CUDA "ggml-cuda/*.cuh")
        list(APPEND GGML_HEADERS_CUDA "../include/ggml-cuda.h")
@ -324,15 +295,21 @@ if (GGML_CUDA)
        list(APPEND GGML_CDEF_PUBLIC GGML_USE_CUDA)
        # TODO: for now CUDA graphs should be used only with llama.cpp
        #       https://github.com/ggerganov/whisper.cpp/issues/2258
        message(STATUS "CMAKE_PROJECT_NAME: ${CMAKE_PROJECT_NAME}")
        if (CMAKE_PROJECT_NAME STREQUAL "llama.cpp")
            add_compile_definitions(GGML_CUDA_USE_GRAPHS)
            message(STATUS "GGML_CUDA_USE_GRAPHS enabled")
        else()
            message(STATUS "GGML_CUDA_USE_GRAPHS disabled")
        endif()
        add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
        add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
        add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
        add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
        if (GGML_CUDA_GRAPHS)
            add_compile_definitions(GGML_CUDA_USE_GRAPHS)
        endif()
        if (GGML_CUDA_FORCE_DMMV)
            add_compile_definitions(GGML_CUDA_FORCE_DMMV)
        endif()
@ -361,40 +338,21 @@ if (GGML_CUDA)
            add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
        endif()
        if (GGML_MUSA)
            set_source_files_properties(${GGML_SOURCES_CUDA} PROPERTIES LANGUAGE CXX)
            foreach(SOURCE ${GGML_SOURCES_CUDA})
                set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22")
            endforeach()
        endif()
        if (GGML_STATIC)
            if (WIN32)
                # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
-                list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
+                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
            else ()
-                if (GGML_MUSA)
+                set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
                    list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musart_static MUSA::mublas_static)
                else()
                    list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
                endif()
            endif()
        else()
-            if (GGML_MUSA)
+            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
                list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musart MUSA::mublas)
            else()
                list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt)
            endif()
        endif()
        if (GGML_CUDA_NO_VMM)
            # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
        else()
-            if (GGML_MUSA)
+            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
                list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ...
            else()
                list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
            endif()
        endif()
    else()
        message(WARNING "CUDA not found")
@ -488,17 +446,13 @@ if (GGML_HIPBLAS)
        add_compile_definitions(GGML_CUDA_FORCE_MMQ)
    endif()
    if (GGML_CUDA_FORCE_CUBLAS)
        add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
    endif()
    if (GGML_CUDA_NO_PEER_COPY)
        add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
    endif()
    if (CXX_IS_HIPCC)
        set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE hip::device)
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} hip::device)
    else()
        set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
    endif()
@ -507,34 +461,27 @@ if (GGML_HIPBLAS)
        message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
    endif()
-    list(APPEND GGML_EXTRA_LIBS_PUBLIC hip::host roc::rocblas roc::hipblas)
+    set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas)
 endif()
 if (GGML_SYCL)
-    if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$")
+    if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$")
-        message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD")
+        message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA")
    endif()
-    check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
+    if ( NOT DEFINED ENV{ONEAPI_ROOT})
-
+        message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
    if (DEFINED ENV{ONEAPI_ROOT})
        message(STATUS "Using oneAPI Release SYCL compiler (icpx).")
    elseif(SUPPORTS_SYCL)
        message(WARNING "Using open-source SYCL compiler (clang++). Didn't detect ENV {ONEAPI_ROOT}.
         If you expected the oneAPI Release compiler, please install oneAPI & source it, like:
         source /opt/intel/oneapi/setvars.sh")
    else()
        message(FATAL_ERROR, "C++ compiler lacks SYCL support.")
    endif()
    message(STATUS "SYCL found")
    #todo: AOT
    find_package(IntelSYCL REQUIRED)
    find_package(MKL REQUIRED)
    message(STATUS "SYCL found")
    list(APPEND GGML_CDEF_PUBLIC GGML_USE_SYCL)
    if (GGML_SYCL_F16)
        if (GGML_SYCL_TARGET STREQUAL "AMD")
            message(WARNING "AMD target does not entirely support FP16 in the SYCL backend.")
        endif()
        add_compile_definitions(GGML_SYCL_F16)
    endif()
@ -542,18 +489,12 @@ if (GGML_SYCL)
        add_compile_definitions(GGML_SYCL_FORCE_MMQ)
    endif()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
+    add_compile_options(-I./) #include DPCT
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
    if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
-        add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
    elseif (GGML_SYCL_TARGET STREQUAL "AMD")
        # INFO: Allowed Sub_group_sizes are not consistent through all
        # hip targets. For example, 64 is used for certain models, but the backend
        # does not support it.
        # Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32)
        add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
    else()
        add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
    endif()
    file(GLOB   GGML_HEADERS_SYCL "ggml-sycl/*.hpp")
@ -562,35 +503,16 @@ if (GGML_SYCL)
    file(GLOB   GGML_SOURCES_SYCL "ggml-sycl/*.cpp")
    list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")
    find_package(DNNL)
    message("-- DNNL found:" ${DNNL_FOUND})
    if (GGML_SYCL_TARGET STREQUAL "INTEL")
        add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND})
    else()
        add_compile_definitions(GGML_SYCL_DNNL=0)
    endif()
    if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
        list(APPEND GGML_EXTRA_LIBS_PRIVATE DNNL::dnnl)
    endif()
    if (WIN32)
-        find_package(IntelSYCL REQUIRED)
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
        find_package(MKL REQUIRED)
        list(APPEND GGML_EXTRA_LIBS_PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
    else()
        add_compile_options(-I/${SYCL_INCLUDE_DIR})
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
        if (GGML_SYCL_TARGET STREQUAL "INTEL")
-            list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
+            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
        elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
+            set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
            list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl)
        elseif (GGML_SYCL_TARGET STREQUAL "AMD")
            if (GGML_SYCL_HIP_TARGET STREQUAL "")
                message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_HIP_TARGET has not been set.")
            endif()
            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${GGML_SYCL_HIP_TARGET}")
            list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl)
        endif()
    endif()
 endif()
@ -601,7 +523,7 @@ if (GGML_RPC)
    list(APPEND GGML_CDEF_PUBLIC GGML_USE_RPC)
    if (WIN32)
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE ws2_32)
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ws2_32)
    endif()
    set(GGML_HEADERS_RPC ../include/ggml-rpc.h)
@ -609,11 +531,14 @@ if (GGML_RPC)
 endif()
 if (GGML_VULKAN)
-    find_package(Vulkan COMPONENTS glslc REQUIRED)
+    find_package(Vulkan)
    if (Vulkan_FOUND)
        message(STATUS "Vulkan found")
        set(GGML_HEADERS_VULKAN ../include/ggml-vulkan.h)
        set(GGML_SOURCES_VULKAN ggml-vulkan.cpp)
        list(APPEND GGML_CDEF_PUBLIC GGML_USE_VULKAN)
        # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
@ -634,14 +559,6 @@ if (GGML_VULKAN)
            add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
        endif()
        if (GGML_VULKAN_SHADER_DEBUG_INFO)
            add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO)
        endif()
        if (GGML_VULKAN_PERF)
            add_compile_definitions(GGML_VULKAN_PERF)
        endif()
        if (GGML_VULKAN_VALIDATE)
            add_compile_definitions(GGML_VULKAN_VALIDATE)
        endif()
@ -650,37 +567,7 @@ if (GGML_VULKAN)
            add_compile_definitions(GGML_VULKAN_RUN_TESTS)
        endif()
-        add_subdirectory(vulkan-shaders)
+        set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} Vulkan::Vulkan)
        set (_ggml_vk_genshaders_cmd vulkan-shaders-gen)
        set (_ggml_vk_header     ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp)
        set (_ggml_vk_source     ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.cpp)
        set (_ggml_vk_input_dir  ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders)
        set (_ggml_vk_output_dir ${CMAKE_CURRENT_BINARY_DIR}/vulkan-shaders.spv)
        file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp")
        add_custom_command(
            OUTPUT ${_ggml_vk_header}
                   ${_ggml_vk_source}
            COMMAND ${_ggml_vk_genshaders_cmd}
                --glslc      ${Vulkan_GLSLC_EXECUTABLE}
                --input-dir  ${_ggml_vk_input_dir}
                --output-dir ${_ggml_vk_output_dir}
                --target-hpp ${_ggml_vk_header}
                --target-cpp ${_ggml_vk_source}
                --no-clean
            DEPENDS ${_ggml_vk_shader_deps}
            COMMENT "Generate vulkan shaders"
        )
        set(GGML_HEADERS_VULKAN ${CMAKE_CURRENT_SOURCE_DIR}/../include/ggml-vulkan.h ${_ggml_vk_header})
        set(GGML_SOURCES_VULKAN ggml-vulkan.cpp ${_ggml_vk_source})
        list(APPEND GGML_EXTRA_LIBS_PRIVATE Vulkan::Vulkan)
        list(APPEND GGML_EXTRA_INCLUDES     ${CMAKE_CURRENT_BINARY_DIR})
    else()
        message(WARNING "Vulkan not found")
    endif()
@ -839,8 +726,8 @@ if (GGML_KOMPUTE)
        list(APPEND GGML_CDEF_PUBLIC GGML_USE_KOMPUTE)
-        list(APPEND GGML_EXTRA_LIBS_PRIVATE kompute)
+        set(GGML_EXTRA_LIBS     ${GGML_EXTRA_LIBS}     kompute)
-        list(APPEND GGML_EXTRA_INCLUDES     ${CMAKE_CURRENT_BINARY_DIR})
+        set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CMAKE_CURRENT_BINARY_DIR})
    else()
        message(WARNING "Kompute not found")
    endif()
@ -856,71 +743,6 @@ if (GGML_CPU_HBM)
    target_link_libraries(ggml PUBLIC memkind)
 endif()
 if (GGML_CANN)
    if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME})
        set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME})
        message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
    endif()
    if (CANN_INSTALL_DIR)
        # Only Support Linux.
        if (GGML_CANN)
            if (NOT UNIX)
                set(GGML_CANN OFF)
                message(WARNING "CANN: CANN toolkit supports unix but not ${CMAKE_SYSTEM_NAME}. Turning off GGML_CANN")
            endif()
        endif()
        # Supported platforms: x86-64, arm64
        if (GGML_CANN)
            if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
            elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64")
            else()
                set(GGML_CANN OFF)
                message(WARNING "CANN: CANN toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}. Turning off GGML_CANN")
            endif()
        endif()
        # Set header and libs
        if(GGML_CANN)
            set(CANN_INCLUDE_DIRS
                ${CANN_INSTALL_DIR}/include
                ${CANN_INSTALL_DIR}/include/aclnn
                ${CANN_INSTALL_DIR}/acllib/include
            )
            add_subdirectory(ggml-cann/kernels)
            list(APPEND CANN_LIBRARIES
                ascendcl
                nnopbase
                opapi
                acl_op_compiler
                ascendc_kernels
            )
            set(GGML_HEADERS_CANN "../include/ggml-cann.h")
            file(GLOB GGML_SOURCES_CANN "ggml-cann/*.cpp")
            list(APPEND GGML_SOURCES_CANN "ggml-cann.cpp")
            message(STATUS "CANN: CANN_INCLUDE_DIRS =  ${CANN_INCLUDE_DIRS}")
            message(STATUS "CANN: CANN_LIBRARIES =  ${CANN_LIBRARIES}")
            list(APPEND GGML_EXTRA_LIBS_PRIVATE ${CANN_LIBRARIES} )
            list(APPEND GGML_EXTRA_INCLUDES     ${CANN_INCLUDE_DIRS})
            list(APPEND GGML_EXTRA_LIBDIRS      ${CANN_INSTALL_DIR}/lib64)
            list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN)
        endif()
    else()
        set(GGML_CANN OFF)
        message(WARNING "CANN: Can't find CANN_INSTALL_DIR, do you forget to source set_var.sh. Turning off GGML_CANN")
    endif()
    if(NOT GGML_CANN)
        message(WARNING "CANN: GGML_CANN is turned OFF, see above for details.")
    endif()
 endif()
 function(get_flags CCID CCVER)
    set(C_FLAGS "")
    set(CXX_FLAGS "")
@ -939,10 +761,8 @@ function(get_flags CCID CCVER)
        set(C_FLAGS   -Wdouble-promotion)
        set(CXX_FLAGS -Wno-array-bounds)
-        if (NOT GGML_MUSA)
+        if (CCVER VERSION_GREATER_EQUAL 7.1.0)
-            if (CCVER VERSION_GREATER_EQUAL 7.1.0)
+            list(APPEND CXX_FLAGS -Wno-format-truncation)
                list(APPEND CXX_FLAGS -Wno-format-truncation)
            endif()
        endif()
        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
            list(APPEND CXX_FLAGS -Wextra-semi)
@ -1201,7 +1021,6 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
        endif()
        if (GGML_AVX512)
            list(APPEND ARCH_FLAGS -mavx512f)
            list(APPEND ARCH_FLAGS -mavx512dq)
            list(APPEND ARCH_FLAGS -mavx512bw)
        endif()
        if (GGML_AVX512_VBMI)
@ -1275,7 +1094,7 @@ endif()
 # Data types, macros and functions related to controlling CPU affinity and
 # some memory allocation are available on Linux through GNU extensions in libc
-if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android")
+if (CMAKE_SYSTEM_NAME MATCHES "Linux")
    add_compile_definitions(_GNU_SOURCE)
 endif()
@ -1325,7 +1144,7 @@ add_library(ggml
            ../include/ggml-backend.h
            ggml.c
            ggml-alloc.c
-            ggml-backend.cpp
+            ggml-backend.c
            ggml-quants.c
            ggml-quants.h
            ${GGML_SOURCES_CUDA}      ${GGML_HEADERS_CUDA}
@ -1338,34 +1157,24 @@ add_library(ggml
            ${GGML_SOURCES_ROCM}      ${GGML_HEADERS_ROCM}
            ${GGML_SOURCES_BLAS}      ${GGML_HEADERS_BLAS}
            ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
            ${GGML_SOURCES_CANN}      ${GGML_HEADERS_CANN}
            ggml-aarch64.c            ggml-aarch64.h
            )
 if (EMSCRIPTEN)
    set_target_properties(ggml PROPERTIES COMPILE_FLAGS "-msimd128")
 endif()
-target_compile_definitions(ggml PUBLIC    ${GGML_CDEF_PUBLIC})
+target_compile_definitions(ggml PUBLIC  ${GGML_CDEF_PUBLIC})
-target_include_directories(ggml PUBLIC  ../include)
+target_include_directories(ggml PUBLIC ../include)
 target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES})
 target_link_directories   (ggml PRIVATE   ${GGML_EXTRA_LIBDIRS})
 target_compile_features   (ggml PRIVATE c_std_11) # don't bump
-list(APPEND GGML_EXTRA_LIBS_PRIVATE Threads::Threads)
+target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS})
 find_library(MATH_LIBRARY m)
 if (MATH_LIBRARY)
-    if (NOT WIN32 OR NOT GGML_SYCL)
+    target_link_libraries(ggml PRIVATE ${MATH_LIBRARY})
        list(APPEND GGML_EXTRA_LIBS_PRIVATE m)
    endif()
 endif()
 list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PRIVATE)
 list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PUBLIC)
 target_link_libraries(ggml PRIVATE ${GGML_EXTRA_LIBS_PRIVATE} PUBLIC ${GGML_EXTRA_LIBS_PUBLIC})
 if (BUILD_SHARED_LIBS)
    set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_compile_definitions(ggml PRIVATE GGML_SHARED GGML_BUILD)
 endif()
--- a/ggml/src/ggml-aarch64.c
+++ b/ggml/src/ggml-aarch64.c
--- a/ggml/src/ggml-aarch64.h
+++ b/ggml/src/ggml-aarch64.h
@ -1,39 +0,0 @@
 // SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
 #pragma once
 #define GGML_COMMON_DECL_C
 #include "ggml-common.h"
 #include "ggml.h"
 // GGML internal header
 #ifdef __cplusplus
 extern "C" {
 #endif
 // Quantization
 void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
 // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
 size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
 // GEMV
 void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 // GEMM
 void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
 #ifdef __cplusplus
 }
 #endif
--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
@ -91,7 +91,8 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
    if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
        fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
                __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
-        GGML_ABORT("not enough space in the buffer");
+        GGML_ASSERT(!"not enough space in the buffer");
        return;
    }
    void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
@ -132,7 +133,7 @@ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset,
            return;
        }
    }
-    GGML_ABORT("out of allocated_tensors");
+    GGML_ASSERT(!"out of allocated_tensors");
 }
 static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
    for (int i = 0; i < 1024; i++) {
@ -141,7 +142,8 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
            return;
        }
    }
-    GGML_ABORT("tried to free tensor %s not found\n", tensor->name);
+    fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
    GGML_ASSERT(!"tensor not found");
 }
 #endif
@ -174,7 +176,8 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
            // this should never happen
            fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
                    __func__, size, max_avail);
-            GGML_ABORT("not enough space in the buffer");
+            GGML_ASSERT(!"not enough space in the buffer");
            GGML_UNREACHABLE();
        }
    }
@ -294,12 +297,6 @@ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
    alloc->free_blocks[0].offset = 0;
    alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
    alloc->max_size = 0;
 #ifdef GGML_ALLOCATOR_DEBUG
    for (int i = 0; i < 1024; i++) {
        alloc->allocated_tensors[i].tensor = NULL;
    }
 #endif
 }
 static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
@ -446,7 +443,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
        }
    }
-    ggml_hash_set_free(&galloc->hash_set);
+    free(galloc->hash_set.keys);
    free(galloc->hash_values);
    free(galloc->bufts);
    free(galloc->buffers);
@ -459,7 +456,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
 typedef struct ggml_gallocr * ggml_gallocr_t;
 static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
-    size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t);
+    size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
    return &galloc->hash_values[i];
 }
@ -568,8 +565,8 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
 static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
    // clear hash tables
-    ggml_hash_set_reset(&galloc->hash_set);
+    memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
-    memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
+    memset(galloc->hash_values,   0, galloc->hash_set.size * sizeof(struct hash_node));
    // allocate leafs
    // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
@ -674,19 +671,21 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
 }
 bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
-    size_t min_hash_size = graph->n_nodes + graph->n_leafs;
+    size_t hash_size = graph->visited_hash_table.size;
    // add 25% margin to avoid hash collisions
    min_hash_size += min_hash_size / 4;
    // initialize hash table
-    if (galloc->hash_set.size < min_hash_size) {
+    if (galloc->hash_set.size < hash_size) {
-        ggml_hash_set_free(&galloc->hash_set);
+        free(galloc->hash_set.keys);
        galloc->hash_set = ggml_hash_set_new(min_hash_size);
        GGML_ASSERT(galloc->hash_set.keys != NULL);
        free(galloc->hash_values);
-        galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size);
+        galloc->hash_set.size = hash_size;
        galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
        galloc->hash_values   = calloc(hash_size, sizeof(struct hash_node));
        GGML_ASSERT(galloc->hash_set.keys != NULL);
        GGML_ASSERT(galloc->hash_values != NULL);
    } else {
        // reset hash table
        memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
        memset(galloc->hash_values,   0, sizeof(struct hash_node) * galloc->hash_set.size);
    }
    // reset allocators
@ -777,7 +776,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
                fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
                return false;
            }
            ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
        }
    }
@ -818,7 +816,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
 }
 static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
-    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
+    ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
    size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
    return talloc->size_max >= node_size;
 }
--- a/ggml/src/ggml-backend-impl.h
+++ b/ggml/src/ggml-backend-impl.h
@ -9,226 +9,144 @@ extern "C" {
 #endif
    //
-    // Backend buffer type
+    // Backend buffer
    //
    // buffer type
    typedef void * ggml_backend_buffer_type_context_t;
    struct ggml_backend_buffer_type_i {
-        const char *          (*get_name)      (ggml_backend_buffer_type_t buft);
+        const char *          (*GGML_CALL get_name)        (ggml_backend_buffer_type_t buft);
        // allocate a buffer of this type
-        ggml_backend_buffer_t (*alloc_buffer)  (ggml_backend_buffer_type_t buft, size_t size);
+        ggml_backend_buffer_t (*GGML_CALL alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
        // tensor alignment
-        size_t                (*get_alignment) (ggml_backend_buffer_type_t buft);
+        size_t                (*GGML_CALL get_alignment)   (ggml_backend_buffer_type_t buft);
-        // (optional) max buffer size that can be allocated (defaults to SIZE_MAX)
+        // max buffer size that can be allocated
-        size_t                (*get_max_size)  (ggml_backend_buffer_type_t buft);
+        size_t                (*GGML_CALL get_max_size)    (ggml_backend_buffer_type_t buft);
-        // (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
+        // data size needed to allocate the tensor, including padding
-        size_t                (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
+        size_t                (*GGML_CALL get_alloc_size)  (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
-        // (optional) check if tensor data is in host memory (defaults to false)
+        // check if tensor data is in host memory
-        bool                  (*is_host)       (ggml_backend_buffer_type_t buft);
+        bool                  (*GGML_CALL is_host)         (ggml_backend_buffer_type_t buft);
    };
    struct ggml_backend_buffer_type {
        struct ggml_backend_buffer_type_i  iface;
-        ggml_backend_dev_t device;
+        ggml_backend_buffer_type_context_t context;
        void * context;
    };
-    //
+    // buffer
-    // Backend buffer
+    typedef void * ggml_backend_buffer_context_t;
    //
    struct ggml_backend_buffer_i {
-        const char * (*get_name)     (ggml_backend_buffer_t buffer);
+        const char * (*GGML_CALL get_name)   (ggml_backend_buffer_t buffer);
-        // (optional) free the buffer
+        void         (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
-        void         (*free_buffer)  (ggml_backend_buffer_t buffer);
+        void *       (*GGML_CALL get_base)   (ggml_backend_buffer_t buffer);
-        // base address of the buffer
+        void         (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void *       (*get_base)     (ggml_backend_buffer_t buffer);
+        void         (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        // (optional) initialize a tensor in the buffer (eg. add tensor extras)
+        void         (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        void         (*init_tensor)  (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        bool         (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
-        // tensor data access
+        void         (*GGML_CALL clear)      (ggml_backend_buffer_t buffer, uint8_t value);
-        void         (*memset_tensor)(ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor,     uint8_t value, size_t offset, size_t size);
+        void         (*GGML_CALL reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
        void         (*set_tensor)   (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
        void         (*get_tensor)   (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
        // (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported)
        bool         (*cpy_tensor)   (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
        // clear the entire buffer
        void         (*clear)        (ggml_backend_buffer_t buffer, uint8_t value);
        // (optional) reset any internal state due to tensor initialization, such as tensor extras
        void         (*reset)        (ggml_backend_buffer_t buffer);
    };
    struct ggml_backend_buffer {
        struct ggml_backend_buffer_i  iface;
        ggml_backend_buffer_type_t    buft;
-        void * context;
+        ggml_backend_buffer_context_t context;
        size_t size;
        enum ggml_backend_buffer_usage usage;
    };
-    ggml_backend_buffer_t ggml_backend_buffer_init(
+    GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
-                   ggml_backend_buffer_type_t buft,
+                   ggml_backend_buffer_type_t      buft,
-            struct ggml_backend_buffer_i      iface,
+            struct ggml_backend_buffer_i           iface,
-                   void *                     context,
+                   ggml_backend_buffer_context_t   context,
-                   size_t                     size);
+                   size_t                          size);
    // do not use directly, use ggml_backend_tensor_copy instead
    bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
    // multi-buffer
    // buffer that contains a collection of buffers
-    ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
-    bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+    GGML_CALL bool                  ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
-    void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+    GGML_CALL void                  ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
    //
-    // Backend (stream)
+    // Backend
    //
    typedef void * ggml_backend_context_t;
    struct ggml_backend_i {
-        const char * (*get_name)(ggml_backend_t backend);
+        const char * (*GGML_CALL get_name)(ggml_backend_t backend);
-        void (*free)(ggml_backend_t backend);
+        void (*GGML_CALL free)(ggml_backend_t backend);
        // buffer allocation
-        ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
+        ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
        // (optional) asynchronous tensor data access
-        void (*set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*GGML_CALL set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+        void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
+        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
        // (optional) complete all pending operations
-        void (*synchronize)(ggml_backend_t backend);
+        void (*GGML_CALL synchronize)(ggml_backend_t backend);
-        // (optional) compute graph with a plan (not used currently)
+        // compute graph with a plan (not used currently)
-        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
+        // create a new plan for a graph
-        void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
        void                      (*GGML_CALL graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
        // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
-        void                      (*graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
+        void                      (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
        // compute the graph with the plan
-        enum ggml_status          (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        enum ggml_status          (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        // compute graph (always async if supported by the backend)
+        // compute graph without a plan (async)
-        enum ggml_status          (*graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        enum ggml_status (*GGML_CALL graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
        // IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
        //            new backends should implement the device interface instead
        // These functions are being moved to the device interface
        // check if the backend can compute an operation
-        bool (*supports_op)  (ggml_backend_t backend, const struct ggml_tensor * op);
+        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
        // check if the backend can use tensors allocated in a buffer type
-        bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
+        bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
        // these should be expensive operations with large batch sizes that may benefit from running on this backend
        // even if the weight has to be copied from the CPU temporarily
-        bool (*offload_op)   (ggml_backend_t backend, const struct ggml_tensor * op);
+        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
        // (optional) event synchronization
-        // record an event on this stream
+        // create a new event that can record events on this backend instance
-        void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
+        ggml_backend_event_t (*GGML_CALL event_new)         (ggml_backend_t backend);
-        // wait for an event on on a different stream
+        void                 (*GGML_CALL event_free)        (ggml_backend_event_t event);
-        void (*event_wait)  (ggml_backend_t backend, ggml_backend_event_t event);
+        // record an event on the backend instance that created it
        void                 (*GGML_CALL event_record)      (ggml_backend_event_t event);
        // wait for an event on a different backend instance
        void                 (*GGML_CALL event_wait)        (ggml_backend_t backend, ggml_backend_event_t event);
        // block until an event is recorded
        void                 (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
    };
    struct ggml_backend {
        ggml_guid_t guid;
        struct ggml_backend_i iface;
-        ggml_backend_dev_t device;
+        ggml_backend_context_t context;
        void * context;
    };
    struct ggml_backend_event {
-        struct ggml_backend_device * device;
+        ggml_backend_t backend;
        void * context;
    };
    //
-    // Backend device
+    // Backend registry
    //
-    // Note: if additional properties are needed, we should add a struct with all of them
+    typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
    //       the current functions to obtain the properties can remain, since they are more convenient for often used properties
    struct ggml_backend_device_i {
        // device name: short identifier for this device, such as "CPU" or "CUDA0"
        const char * (*get_name)(ggml_backend_dev_t dev);
-        // device description: short informative description of the device, could be the model name
+    GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
        const char * (*get_description)(ggml_backend_dev_t dev);
        // device memory in bytes
        void         (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
        // device type
        enum ggml_backend_dev_type (*get_type)(ggml_backend_dev_t dev);
        // device properties
        void (*get_props)(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props);
        // backend (stream) initialization
        ggml_backend_t (*init_backend)(ggml_backend_dev_t dev, const char * params);
        // preferred buffer type
        ggml_backend_buffer_type_t (*get_buffer_type)(ggml_backend_dev_t dev);
        // (optional) host buffer type (in system memory, typically this is a pinned memory buffer for faster transfers between host and device)
        ggml_backend_buffer_type_t (*get_host_buffer_type)(ggml_backend_dev_t dev);
        // (optional) buffer from pointer: create a buffer from a host pointer (useful for memory mapped models and importing data from other libraries)
        ggml_backend_buffer_t (*buffer_from_host_ptr)(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size);
        // check if the backend can compute an operation
        bool (*supports_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
        // check if the backend can use tensors allocated in a buffer type
        bool (*supports_buft)(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft);
        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
        // these should be expensive operations with large batch sizes that may benefit from running on this backend
        // even if the weight has to be copied from the CPU temporarily
        bool (*offload_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
        // (optional) event synchronization
        ggml_backend_event_t (*event_new)         (ggml_backend_dev_t dev);
        void                 (*event_free)        (ggml_backend_dev_t dev, ggml_backend_event_t event);
        void                 (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
    };
    struct ggml_backend_device {
        struct ggml_backend_device_i iface;
        ggml_backend_reg_t reg;
        void * context;
    };
    //
    // Backend (reg)
    //
    struct ggml_backend_reg_i {
        const char * (*get_name)(ggml_backend_reg_t reg);
        // enumerate available devices
        size_t             (*get_device_count)(ggml_backend_reg_t reg);
        ggml_backend_dev_t (*get_device)(ggml_backend_reg_t reg, size_t index);
        // (optional) get a pointer to a function in the backend
        // backends can add custom functions that are not part of the standard ggml-backend interface
        void * (*get_proc_address)(ggml_backend_reg_t reg, const char * name);
    };
    struct ggml_backend_reg {
        // int api_version; // TODO: for dynamic loading
        struct ggml_backend_reg_i iface;
        void * context;
    };
    // Internal backend registry API
    void ggml_backend_register(ggml_backend_reg_t reg);
    void ggml_backend_device_register(ggml_backend_dev_t device);
    // TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
    // typedef ggml_backend_register_t * (*ggml_backend_init)(void);
 #ifdef  __cplusplus
 }
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
--- a/ggml/src/ggml-blas.cpp
+++ b/ggml/src/ggml-blas.cpp
@ -1,4 +1,3 @@
 #include "ggml-impl.h"
 #include "ggml-blas.h"
 #include "ggml-backend-impl.h"
@ -9,12 +8,11 @@
 #   include <Accelerate/Accelerate.h>
 #elif defined(GGML_BLAS_USE_MKL)
 #   include <mkl.h>
 #elif defined(GGML_BLAS_USE_BLIS)
 #   include <blis.h>
 #elif defined(GGML_BLAS_USE_NVPL)
 #   include <nvpl_blas.h>
 #else
 #   include <cblas.h>
 #   ifdef BLIS_ENABLE_CBLAS
 #       include <blis.h>
 #   endif
 #endif
 struct ggml_backend_blas_context {
@ -142,14 +140,10 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
    openblas_set_num_threads(ctx->n_threads);
 #endif
-#if defined(GGML_BLAS_USE_BLIS)
+#if defined(BLIS_ENABLE_CBLAS)
    bli_thread_set_num_threads(ctx->n_threads);
 #endif
 #if defined(GGML_BLAS_USE_NVPL)
    nvpl_blas_set_num_threads(ctx->n_threads);
 #endif
    for (int64_t i13 = 0; i13 < ne13; i13++) {
        for (int64_t i12 = 0; i12 < ne12; i12++) {
            const int64_t i03 = i13/r3;
@ -235,25 +229,25 @@ static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct g
 // backend interface
-static const char * ggml_backend_blas_name(ggml_backend_t backend) {
+GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) {
    return "BLAS";
    GGML_UNUSED(backend);
 }
-static void ggml_backend_blas_free(ggml_backend_t backend) {
+GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) {
    ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
    delete ctx;
    delete backend;
 }
-static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
+GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
    return ggml_backend_cpu_buffer_type();
    GGML_UNUSED(backend);
 }
-static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
    for (int i = 0; i < cgraph->n_nodes; i++) {
@ -276,7 +270,8 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
                break;
            default:
-                GGML_ABORT("%s: unsupported op %s\n", __func__, ggml_op_desc(node));
+                fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node));
                GGML_ASSERT(false);
        }
    }
@ -285,7 +280,7 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
    GGML_UNUSED(backend);
 }
-static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    const struct ggml_tensor * src0 = op->src[0];
    const struct ggml_tensor * src1 = op->src[1];
@ -300,7 +295,7 @@ static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct g
    GGML_UNUSED(backend);
 }
-static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+GGML_CALL static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
    return ggml_backend_buft_is_host(buft);
    GGML_UNUSED(backend);
@ -322,8 +317,11 @@ static struct ggml_backend_i blas_backend_i = {
    /* .supports_op             = */ ggml_backend_blas_supports_op,
    /* .supports_buft           = */ ggml_backend_blas_supports_buft,
    /* .offload_op              = */ NULL,
    /* .event_new               = */ NULL,
    /* .event_free              = */ NULL,
    /* .event_record            = */ NULL,
    /* .event_wait              = */ NULL,
    /* .event_synchronize       = */ NULL,
 };
 static ggml_guid_t ggml_backend_blas_guid(void) {
@ -337,7 +335,6 @@ ggml_backend_t ggml_backend_blas_init(void) {
    ggml_backend_t backend = new ggml_backend {
        /* .guid      = */ ggml_backend_blas_guid(),
        /* .interface = */ blas_backend_i,
        /* .device    = */ nullptr,
        /* .context   = */ ctx,
    };
@ -354,7 +351,7 @@ ggml_backend_t ggml_backend_blas_init(void) {
    return backend;
 }
-bool ggml_backend_is_blas(ggml_backend_t backend) {
+GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) {
    return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid());
 }
--- a/ggml/src/ggml-cann.cpp
+++ b/ggml/src/ggml-cann.cpp
--- a/ggml/src/ggml-cann/Doxyfile
+++ b/ggml/src/ggml-cann/Doxyfile
--- a/ggml/src/ggml-cann/acl_tensor.cpp
+++ b/ggml/src/ggml-cann/acl_tensor.cpp
@ -1,175 +0,0 @@
 /*
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
 #include "acl_tensor.h"
 #include <algorithm>
 #include <cstring>
 aclDataType ggml_cann_type_mapping(ggml_type type) {
    // Translate a ggml element type into the ACL data type used for it.
    // Any type without a case below is reported as ACL_DT_UNDEFINED.
    switch (type) {
        case GGML_TYPE_F32:  return ACL_FLOAT;
        case GGML_TYPE_F16:  return ACL_FLOAT16;
        case GGML_TYPE_I8:   return ACL_INT8;
        case GGML_TYPE_I16:  return ACL_INT16;
        case GGML_TYPE_I32:  return ACL_INT32;
        case GGML_TYPE_Q4_0: return ACL_INT4;
        case GGML_TYPE_Q8_0: return ACL_INT8;
        default:             break;
    }
    return ACL_DT_UNDEFINED;
 }
 aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne,
                                   size_t* nb, int64_t dims, aclFormat format,
                                   size_t offset) {
    // Creates an aclTensor that refers to tensor->data.
    //  - ne == nullptr: the tensor's own GGML_MAX_DIMS dimensions and strides
    //    are used.
    //  - ne != nullptr: the first `dims` entries of ne/nb describe a custom
    //    (e.g. broadcast) shape.
    // Strides are divided by the element size before being handed to ACL, and
    // the dimension order is reversed.
    //
    // The scratch arrays below hold GGML_MAX_DIMS * 2 entries (a broadcast
    // shape can add up to GGML_MAX_DIMS extra dimensions). Reject anything
    // larger instead of writing past the end of the stack arrays.
    GGML_ASSERT(dims >= 0 && dims <= GGML_MAX_DIMS * 2);
    int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2];
    int64_t acl_storage_len = 0;
    if (ne == nullptr) {
        // Only GGML_MAX_DIMS entries are filled in this branch, so a larger
        // dims would make the reversal below read uninitialized values.
        GGML_ASSERT(dims <= GGML_MAX_DIMS);
        acl_storage_len = ggml_nbytes(tensor);
        for (int i = 0; i < GGML_MAX_DIMS; i++) {
            acl_ne[i] = tensor->ne[i];
            // The step size of acl is in elements.
            acl_stride[i] = tensor->nb[i] / ggml_element_size(tensor);
        }
    } else {
        // With bcast: a custom shape needs both arrays and a non-zero dims,
        // otherwise nothing is filled in and final_dims falls back to
        // GGML_MAX_DIMS over uninitialized data.
        GGML_ASSERT(nb != nullptr && dims > 0);
        for (int i = 0; i < dims; i++) {
            acl_storage_len += (ne[i] - 1) * nb[i];
            acl_ne[i] = ne[i];
            acl_stride[i] = nb[i] / ggml_element_size(tensor);
        }
    }
    // NOTE(review): acl_storage_len is accumulated from byte sizes/strides
    // while acl_stride and the offset are expressed in elements - confirm the
    // unit aclCreateTensor expects for its storage dimensions.
    // Reverse ne and stride.
    int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims);
    std::reverse(acl_ne, acl_ne + final_dims);
    std::reverse(acl_stride, acl_stride + final_dims);
    aclTensor* acl_tensor = aclCreateTensor(
        acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride,
        offset / ggml_element_size(tensor), format, &acl_storage_len, 1,
        tensor->data);
    return acl_tensor;
 }
 bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
    // Broadcasting is needed as soon as one dimension of t1 is neither 1 nor
    // equal to the matching dimension of t0.
    bool need_bcast = false;
    for (int dim = 0; dim < GGML_MAX_DIMS && !need_bcast; ++dim) {
        const auto t1_ne = t1->ne[dim];
        need_bcast = (t1_ne != t0->ne[dim]) && (t1_ne != 1);
    }
    return need_bcast;
 }
 int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
                                   const ggml_tensor* src1,
                                   int64_t* bcast_src0_ne,
                                   int64_t* bcast_src1_ne, size_t* bcast_src0_nb,
                                   size_t* bcast_src1_nb) {
    // src1 must be repeatable to the shape of src0.
    GGML_ASSERT(ggml_can_repeat(src1, src0));
    // Number of output dimensions written so far.
    int out = 0;
    for (int dim = 0; dim < GGML_MAX_DIMS; ++dim) {
        // How many times src1 has to be repeated along this dimension.
        const int64_t repeat = src0->ne[dim] / src1->ne[dim];
        // First entry: the part of the dimension both tensors share.
        bcast_src0_ne[out] = src0->ne[dim] / repeat;
        bcast_src1_ne[out] = src1->ne[dim];
        bcast_src0_nb[out] = src0->nb[dim];
        bcast_src1_nb[out] = src1->nb[dim];
        ++out;
        if (repeat == 1) {
            continue;
        }
        // Second entry: an extra dimension of size `repeat` for src0 and of
        // size 1 for src1; its stride follows from the entry just written.
        const int prev = out - 1;
        bcast_src0_ne[out] = repeat;
        bcast_src1_ne[out] = 1;
        bcast_src0_nb[out] = bcast_src0_nb[prev] * bcast_src0_ne[prev];
        bcast_src1_nb[out] = bcast_src1_nb[prev] * bcast_src1_ne[prev];
        ++out;
    }
    return out;
 }
 int64_t ggml_cann_get_mulmat_bcast_shape(
    const int64_t* input_ne, const int64_t* weight_ne, const int64_t* dst_ne,
    const size_t* input_nb, const size_t* weight_nb, const size_t* dst_nb,
    int64_t* bcast_input_ne, int64_t* bcast_weight_ne, int64_t* bcast_dst_ne,
    size_t* bcast_input_nb, size_t* bcast_weight_nb, size_t* bcast_dst_nb) {
    // input and dst should have the same shape, except for the first two dims.
    GGML_ASSERT(input_ne[2] == dst_ne[2]);
    GGML_ASSERT(input_ne[3] == dst_ne[3]);
    // Number of output dimensions written so far.
    int out = 0;
    // For mul_mat, a dimension needs to be added before the dimension that
    // weight needs to be expanded to satisfy the bcast rule of matrix
    // multiplication.
    for (int dim = 0; dim < GGML_MAX_DIMS; ++dim) {
        const int64_t repeat = input_ne[dim] / weight_ne[dim];
        // Do not use bcast in the first two dimensions because we only support
        // the bcast batch dimension. Just copy them.
        const bool copy_as_is = (dim < 2) || (repeat == 1);
        if (copy_as_is) {
            bcast_input_ne[out] = input_ne[dim];
            bcast_weight_ne[out] = weight_ne[dim];
            bcast_dst_ne[out] = dst_ne[dim];
            bcast_input_nb[out] = input_nb[dim];
            bcast_weight_nb[out] = weight_nb[dim];
            bcast_dst_nb[out] = dst_nb[dim];
            ++out;
            continue;
        }
        // Need to add an extra dim: first the repeat count (size 1 on the
        // weight side), carrying the original strides ...
        bcast_input_ne[out] = repeat;
        bcast_dst_ne[out] = repeat;
        bcast_weight_ne[out] = 1;
        bcast_input_nb[out] = input_nb[dim];
        bcast_dst_nb[out] = dst_nb[dim];
        bcast_weight_nb[out] = weight_nb[dim];
        ++out;
        // ... then the remaining extent, whose stride is derived from the
        // entry written just above.
        const int prev = out - 1;
        bcast_input_ne[out] = input_ne[dim] / repeat;
        bcast_dst_ne[out] = dst_ne[dim] / repeat;
        bcast_weight_ne[out] = weight_ne[dim];
        bcast_input_nb[out] = bcast_input_nb[prev] * bcast_input_ne[prev];
        bcast_dst_nb[out] = bcast_dst_nb[prev] * bcast_dst_ne[prev];
        bcast_weight_nb[out] = bcast_weight_nb[prev] * bcast_weight_ne[prev];
        ++out;
    }
    return out;
 }
--- a/ggml/src/ggml-cann/acl_tensor.h
+++ b/ggml/src/ggml-cann/acl_tensor.h
@ -1,258 +0,0 @@
 /*
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
 #ifndef CANN_ACL_TENSOR_H
 #define CANN_ACL_TENSOR_H
 #include <algorithm>
 #include <cstring>
 #include <aclnn/aclnn_base.h>
 #include "common.h"
 /**
 * @brief	Maps a ggml_type to its corresponding aclDataType.
 *
 * @details	This function takes a ggml_type as input and returns the corresponding
 *			aclDataType. It supports mapping for various ggml_types. If the input type
 *			does not match any of the predefined ggml_types, the function returns
 *          ACL_DT_UNDEFINED.
 *
 * @param	type    The ggml_type to be mapped.
 * @return	The corresponding aclDataType. If the input type is not recognized,
 *			ACL_DT_UNDEFINED is returned.
 */
 aclDataType ggml_cann_type_mapping(ggml_type type);
 /**
 * @brief   Creates an ACL tensor from a ggml_tensor with optional shape.
 *
 * @details This function creates an ACL tensor based on the properties of the
 *          provided ggml_tensor. It supports a custom shape by adjusting dimensions
 *          and strides accordingly. If a custom shape is applied, the dimensions
 *          and strides are taken from the provided parameters.
 *
 * @param   tensor      Pointer to the ggml_tensor to be converted to ACL tensor.
 * @param   ne          Pointer to an array containing dimensions. Defaults to nullptr
 *                      if no custom shape is applied.
 * @param   nb          Pointer to an array containing strides. Defaults to nullptr
 *                      if no custom shape is applied.
 * @param   dims        Number of dimensions in the tensor. Defaults to 0 if no custom
 *                      shape is applied.
 * @param   format      ACL tensor format. Defaults to ACL_FORMAT_ND.
 * @param   offset      Offset in bytes for the ACL tensor data. Defaults to 0.
 * @return  Pointer to the created ACL tensor.
 */
 aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = nullptr,
                             size_t* nb = nullptr, int64_t dims = 0,
                             aclFormat format = ACL_FORMAT_ND,
                             size_t offset = 0);
 /**
 * @brief   Template for creating an ACL tensor from provided parameters. typename TYPE
 *          should be size_t or float.
 *
 * @details This function creates an ACL tensor using the provided data pointer,
 *          data type, dimensions, strides, format, offset, and additional parameters.
 *          It calculates necessary dimensions and strides based on the provided ne and nb
 *          arrays, adjusting them for the ACL tensor creation. The ACL storage length
 *          is also calculated based on the provided dimensions and strides.
 *
 * @param   data_ptr    Pointer to the data buffer for the ACL tensor.
 * @param   dtype       ACL data type of the tensor.
 * @param   type_size   Size of each element in the tensor data buffer.
 * @param   ne          Pointer to an array containing tensor dimensions.
 * @param   nb          Pointer to an array containing tensor strides.
 * @param   dims        Number of dimensions of the tensor.
 * @param   format      ACL tensor format. Defaults to ACL_FORMAT_ND.
 * @param   offset      Offset in bytes for the ACL tensor data. Defaults to 0.
 * @return  Pointer to the created ACL tensor.
 */
 template<typename TYPE>
 aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
                                   TYPE type_size, int64_t* ne, TYPE* nb,
                                   int64_t dims,
                                   aclFormat format = ACL_FORMAT_ND,
                                   size_t offset = 0) {
    // Scratch copies of the shape and the per-dimension strides
    // (nb divided by type_size), stored in reversed dimension order.
    int64_t view_ne[GGML_MAX_DIMS * 2];
    int64_t view_stride[GGML_MAX_DIMS * 2];
    // Storage length accumulated as sum((ne[i] - 1) * nb[i]).
    int64_t acl_storage_len = 0;
    for (int i = 0; i < dims; i++) {
        // Write each entry straight into its mirrored slot instead of
        // copying first and reversing afterwards.
        const int64_t mirrored = dims - 1 - i;
        view_ne[mirrored] = ne[i];
        view_stride[mirrored] = nb[i] / type_size;
        acl_storage_len += (ne[i] - 1) * nb[i];
    }
    return aclCreateTensor(view_ne, dims, dtype, view_stride,
                           offset / type_size, format, &acl_storage_len, 1,
                           data_ptr);
 }
 /**
 * @brief   Checks if tensors require broadcasting based on their shapes.
 *
 * @details This function determines if two ggml_tensors need to be broadcasted for
 *          element-wise operations. Broadcasting is necessary if any dimension of
 *          t1 differs from the corresponding dimension of t0 and is not equal to 1.
 *
 * @param   t0      Pointer to the first ggml_tensor.
 * @param   t1      Pointer to the second ggml_tensor.
 * @return  True if broadcasting is needed, False otherwise.
 *
 * @remarks This function iterates over the dimensions of t0 and t1. It checks if each
 *          dimension in t1 differs from t0's corresponding dimension and is not equal
 *          to 1. If such a dimension is found, broadcasting is required to align t1
 *          with t0 for element-wise operations.
 */
 bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1);
 /**
 * @brief   Computes broadcast shapes and strides for two ggml_tensors.
 *
 * @details This function calculates the broadcast shapes and strides for two ggml_tensors,
 *          following the broadcasting rules similar to numpy. It adjusts dimensions and
 *          strides to ensure compatibility for element-wise operations where one tensor
 *          can be broadcasted to match the shape of another tensor.
 *
 * @param   src0                Pointer to the first ggml_tensor.
 * @param   src1                Pointer to the second ggml_tensor.
 * @param   bcast_ne_src0       Output array to store broadcasted dimensions for src0.
 * @param   bcast_ne_src1       Output array to store broadcasted dimensions for src1.
 * @param   bcast_nb_src0       Output array to store broadcasted strides for src0.
 * @param   bcast_nb_src1       Output array to store broadcasted strides for src1.
 * @return  Number of dimensions in the broadcasted shape.
 *
 * @pre     ggml_can_repeat(src1, src0) must return true, indicating src1 can be broadcasted
 *          to match src0.
 *
 * @remarks This function iterates over the dimensions of src0 and src1, calculating the
 *          necessary broadcast dimensions and strides. If a dimension requires broadcasting
 *          (i.e., its size in src1 is smaller than in src0), an additional dimension is
 *          added with size calculated to match src0's dimension. This adjustment ensures
 *          that src1 can be element-wise broadcasted to src0's shape.
 *
 *  How it works:
 *
 *  \code
 *  if dim0 has padding.
 *  a -> (2, 2) padding = 2
 *   a: [[1, 2, *, *]
 *       [2, 3, *, *]]
 *  nb = (8, 4, 2)
 *
 *  if a should bcast with b -> (2, 4)
 *  b' -> (2, 2, 2)
 *  b : [[1, 2, 3, 4, *, *]
 *       [5, 6, 7, 8, *, *]]
 *  nb = (12, 6, 1)
 *
 *  after bcast:
 *  a' -> (2, 1, 2)
 *  a': [[[1, 2], *, *]
 *       [[2, 3], *, *]]
 *  nb = (8, 4, 2, 1)
 *
 *  b' : [[[1, 2], [3, 4], *, *]
 *        [[5, 6], [7, 8], *, *]]
 *  nb = (12, 6, 2, 1)
 *  \endcode
 *
 *  dim1 in a' is the inserted dim, so an nb entry must be added for dim1,
 *  and all other nb entries move to the next position in order.
 */
 int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1,
                        int64_t* bcast_ne_src0, int64_t* bcast_ne_src1,
                        size_t* bcast_nb_src0, size_t* bcast_nb_src1);
 // Bcast macro to avoid duplicate code.
 // Declares the local arrays bcast_<src0>_ne, bcast_<src1>_ne, bcast_<src0>_nb
 // and bcast_<src1>_nb (GGML_MAX_DIMS * 2 entries each) plus the local variable
 // bcast_dims, and fills them by calling ggml_cann_get_bcast_shape(src0, src1).
 // The arguments are pasted into identifiers, so they must be plain variable
 // names. Because bcast_dims is declared here, the macro can be expanded only
 // once per scope.
 #define BCAST_SHAPE(src0, src1)                                              \
    int64_t bcast_##src0##_ne[GGML_MAX_DIMS * 2];                            \
    int64_t bcast_##src1##_ne[GGML_MAX_DIMS * 2];                            \
    size_t bcast_##src0##_nb[GGML_MAX_DIMS * 2];                             \
    size_t bcast_##src1##_nb[GGML_MAX_DIMS * 2];                             \
    int64_t bcast_dims = ggml_cann_get_bcast_shape(                          \
        src0, src1, bcast_##src0##_ne, bcast_##src1##_ne, bcast_##src0##_nb, \
        bcast_##src1##_nb);
 // Expands to the three comma-separated arguments (ne array, nb array, number
 // of dims) for `tensor`, using the names declared by BCAST_SHAPE in the same
 // scope.
 #define BCAST_PARAM(tensor) bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims
 /**
 * @brief Calculates broadcast shapes for matrix multiplication.
 *
 * @details This function computes the broadcast shapes required for matrix multiplication
 *          based on the input, weight, and destination tensor shapes. It ensures that the
 *          dimensions of weight tensors are expanded appropriately to satisfy matrix
 *          multiplication broadcast rules.
 *
 * @param input_ne      Array containing the dimensions of the input tensor.
 * @param weight_ne     Array containing the dimensions of the weight tensor.
 * @param dst_ne        Array containing the dimensions of the destination tensor.
 * @param input_nb      Array containing the strides of the input tensor.
 * @param weight_nb     Array containing the strides of the weight tensor.
 * @param dst_nb        Array containing the strides of the destination tensor.
 * @param bcast_input_ne    Output array for broadcasted input tensor dimensions.
 * @param bcast_weight_ne   Output array for broadcasted weight tensor dimensions.
 * @param bcast_dst_ne      Output array for broadcasted destination tensor dimensions.
 * @param bcast_input_nb    Output array for broadcasted input tensor strides.
 * @param bcast_weight_nb   Output array for broadcasted weight tensor strides.
 * @param bcast_dst_nb      Output array for broadcasted destination tensor strides.
 * @return The number of dimensions in the broadcasted tensors.
 *
 * @remarks This function iterates over the tensor dimensions and calculates the broadcast
 *          shapes needed for matrix multiplication. It ensures that dimensions where
 *          weight tensor requires expansion are appropriately handled to conform with
 *          broadcasting rules.
 * @note Compared with ggml_cann_get_bcast_shape, the mul_mat broadcast needs to add
 *       the new dim before the broadcast dim.
 * @sa ggml_cann_get_bcast_shape
 */
 int64_t ggml_cann_get_mulmat_bcast_shape(
    const int64_t* input_ne, const int64_t* weight_ne, const int64_t* dst_ne,
    const size_t* input_nb, const size_t* weight_nb, const size_t* dst_nb,
    int64_t* bcast_input_ne, int64_t* bcast_weight_ne, int64_t* bcast_dst_ne,
    size_t* bcast_input_nb, size_t* bcast_weight_nb, size_t* bcast_dst_nb);
 // Bcast macro to avoid duplicate code.
 // Declares the local arrays bcast_<input|weight|dst>_ne and
 // bcast_<input|weight|dst>_nb (GGML_MAX_DIMS * 2 entries each) plus the local
 // variable bcast_dims, and fills them by calling
 // ggml_cann_get_mulmat_bcast_shape with the ne/nb members of the three
 // arguments. The arguments are pasted into identifiers and dereferenced with
 // ->, so they must be plain pointer variable names. Because bcast_dims is
 // declared here, the macro can be expanded only once per scope.
 #define BCAST_MUL_MAT_SHAPE(input, weight, dst)                         \
    int64_t bcast_##input##_ne[GGML_MAX_DIMS * 2];                      \
    int64_t bcast_##weight##_ne[GGML_MAX_DIMS * 2];                     \
    int64_t bcast_##dst##_ne[GGML_MAX_DIMS * 2];                        \
    size_t bcast_##input##_nb[GGML_MAX_DIMS * 2];                       \
    size_t bcast_##weight##_nb[GGML_MAX_DIMS * 2];                      \
    size_t bcast_##dst##_nb[GGML_MAX_DIMS * 2];                         \
    int64_t bcast_dims = ggml_cann_get_mulmat_bcast_shape(              \
        input->ne, weight->ne, dst->ne, input->nb, weight->nb, dst->nb, \
        bcast_##input##_ne, bcast_##weight##_ne, bcast_##dst##_ne,      \
        bcast_##input##_nb, bcast_##weight##_nb, bcast_##dst##_nb);
 // Expands to the three comma-separated arguments (ne array, nb array, number
 // of dims) for `tensor`, using the names declared by BCAST_MUL_MAT_SHAPE in
 // the same scope.
 #define BCAST_MUL_MAT_PARAM(tensor) \
    bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims
 #endif  // CANN_ACL_TENSOR_H
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
--- a/ggml/src/ggml-cann/aclnn_ops.h
+++ b/ggml/src/ggml-cann/aclnn_ops.h
@ -1,592 +0,0 @@
 #ifndef CANN_ACLNN_OPS
 #define CANN_ACLNN_OPS
 /**
 * @file    aclnn_ops
 * @brief   This file declares the ggml operators of the CANN backend that are
 *          implemented with aclnn operators, together with the shared
 *          templates for element-wise binary operators and activations.
 * @author  hipudding <huafengchun@gmail.com>
 * @author  wangshuai09 <391746016@qq.com>
 * @date    July 15, 2024
 *
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
 #include <aclnnop/aclnn_add.h>
 #include <aclnnop/aclnn_arange.h>
 #include <aclnnop/aclnn_argsort.h>
 #include <aclnnop/aclnn_cat.h>
 #include <aclnnop/aclnn_clamp.h>
 #include <aclnnop/aclnn_div.h>
 #include <aclnnop/aclnn_gelu.h>
 #include <aclnnop/aclnn_hardsigmoid.h>
 #include <aclnnop/aclnn_hardswish.h>
 #include <aclnnop/aclnn_leaky_relu.h>
 #include <aclnnop/aclnn_mul.h>
 #include <aclnnop/aclnn_relu.h>
 #include <aclnnop/aclnn_silu.h>
 #include <aclnnop/aclnn_tanh.h>
 #include "acl_tensor.h"
 #include "common.h"
 /**
 * @brief   Repeats a ggml tensor along each dimension to match the dimensions
 *          of another tensor.
 *
 * @details This function repeats the elements of a source ggml tensor along
 *          each dimension to create a destination tensor with the specified
 *          dimensions. The operation is performed using the ACL backend and
 *          executed asynchronously on the device.
 *
 * @param   ctx The CANN context used for operations.
 * @param   dst The ggml tensor representing the destination, which op is
 *              GGML_OP_REPEAT and specifies the desired dimensions.
 */
 void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 /**
 * @brief   Adds two ggml tensors using the CANN backend.
 *
 * @details This function performs an element-wise addition of two tensors. In
 *          case the tensors do not have the same shape, one or both tensors
 *          will be broadcasted to match the shape of the other before the
 *          addition is performed. The formula for the operation is given by:
 *          \f[
 *              \text{dst} = \text{acl_src0} + \alpha \cdot \text{acl_src1}
 *          \f]
 *
 * @param ctx The CANN context used for operations.
 * @param dst The ggml tensor representing the destination, result of the
 *            addition is stored at dst->data, and dst->op is `GGML_OP_ADD`
 */
 void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 /**
 * @brief   Applies the Leaky ReLU activation function to a tensor using the CANN
 *          backend.
 *
 * @details This function computes the Leaky ReLU activation for each element of
 *          the input tensor. The Leaky ReLU function allows a small gradient
 *          when the unit is not active (i.e., when the input is negative). The
 *          Leaky ReLU function is defined as:
 *          \f[
 *              \text{dst} = \max(0, src) + \text{negativeSlope} \cdot \min(0,
 *               src)
 *          \f]
 *          `negativeSlope` is in dst->params.
 *
 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor where the result of the Leaky ReLU
 *            activation is stored, which op is `GGML_OP_LEAKY_RELU`
 */
 void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 /**
 * @brief    Concatenates multiple tensors along a specified dimension using the
 *           CANN backend.
 *
 * @param ctx        The CANN context used for operations.
 * @param dst        The destination tensor where the result of the
 *                   concatenation is stored. dst->op is `GGML_OP_CONCAT`.
 *
 * @attention The list of tensors to concatenate and the concat dimension are
 *            not parameters of this function. The tensor list length should
 *            be 2 and the dimension used for concat defaults to 1.
 */
 void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 /**
 * @brief   Generates a sequence of evenly spaced values within a specified
 *          interval for a ggml tensor using the CANN backend.
 *
 * @details This function creates a sequence of numbers over a specified
 *          interval, starting from `start`, ending before `stop`, and
 *          incrementing by `step`. The sequence is stored in the destination
 *          tensor `dst`.
 *
 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor where the generated sequence will be stored.
 *            `start`, 'stop' and 'step' are in dst->op_params and dst->op is
 *            `GGML_OP_ARANGE`.
 */
 void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 /**
 * @brief   Computes the square of the elements of a ggml tensor using the CANN
 *          backend.
 * @details The function sets the second source tensor of the destination
 *          tensor `dst` to be equal to the first source tensor. This is
 *          effectively squaring the elements since the multiplication becomes
 *          `element * element`.
 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor where the squared values will be stored,
 *            which dst->op is `GGML_OP_SQR`.
 */
 void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 /**
 * @brief   Applies a clamp operation to the elements of a ggml tensor using the
 *          CANN backend.
 *
 * @details This function clamps the elements of the input tensor `src` to a
 *          specified range defined by `min` and `max` values. The result is
 *          stored in the destination tensor `dst`. The operation is defined as:
 *          \f[
 *              y = \max(\min(x, max\_value), min\_value)
 *           \f]
 *          where `x` is an element of the input tensor, and `y` is the
 *          corresponding element in the output tensor.
 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor where the clamped values will be stored.
 *            dst->op is `GGML_OP_CLAMP`, `min` and `max` value is in dst->params.
 */
 void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 /**
 * @brief   Scales the elements of a ggml tensor by a constant factor using the
 *          CANN backend.
 *
 * @details This function multiplies each element of the input tensor `src` by
 *          a scaling factor `scale`, storing the result in the destination
 *          tensor `dst`. The operation is defined as:
 *          \f[
 *             dst = src \times scale
 *          \f]
 *
 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor where the scaled values will be stored.
 *            dst->op is `GGML_OP_SCALE` and `scale` value is in dst->params.
 */
 void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 /**
 * @brief   Sorts the elements of a ggml tensor and returns the indices that
 *          would sort the tensor using the CANN backend.
 *
 * @details This function performs an argsort operation on the input tensor
 *          `src`. It sorts the elements of `src` in either ascending or
 *          descending order, depending on the `GGML_SORT_ORDER_DESC`,
 *          and returns the indices that would sort the original tensor.
 *
 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor where the sorted indices will be stored.
 *            dst->op is `GGML_OP_ARGSORT`.
 */
 void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 /**
 * @brief   Computes the Layer Normalization for a ggml tensor using the CANN
 *          backend.
 *
 * @details This function applies the Layer Normalization operation on the
 *          input tensor `src` and stores the result in the destination tensor
 *          `dst`. Layer Normalization normalizes the features at each sample in
 *          a mini-batch independently. It is commonly used in neural networks
 *          to normalize the activations of a layer by adjusting and scaling
 *          the outputs.
 *          The operation is defined as:
 *          \f[
 *              \text { out }=\frac{x-\mathrm{E}[x]}{\sqrt{\text{Var}[x]+eps}}
 *          \f]
 *          `Var` defaults dst->ne[0]. `eps` is in dst->params.
 *
 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor where the normalized values will be stored.
 * @attention `Var` defaults to dst->ne[0].
 */
 void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 /**
 * @brief  Computes the Group Normalization for a ggml tensor using the CANN
 *         backend.
 *
 * @details This function applies the Group Normalization operation on the input
 *         tensor `src` and stores the result in the destination tensor `dst`.
 *         Group Normalization divides the channels into groups and normalizes
 *         the features within each group across spatial locations.
 *         It is commonly used in convolutional neural networks to improve
 *         training stability and performance.
 *         The operation is defined as:
 *         \f[
 *             \text { out }=\frac{x-\mathrm{E}[x]}{\sqrt{\text{Var}[x]+eps}}
 *         \f]
 *
 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor where the normalized values will be stored.
 *            `n_groups` is in dst->params, which split C channel to `n_groups`.
 *            dst->op is `GGML_OP_GROUP_NORM`.
 *
 * @attention eps defaults to 1e-6f.
 */
 void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 /**
 * @brief   Computes the accumulation of tensors using the CANN backend.
 *
 * @details This function performs an accumulation operation on two tensors.
 *          Depending on the `inplace` flag, it either updates the destination
 *          tensor `dst` in place by adding `alpha * src1` to it, or it creates
 *          a new tensor as the result of `src0 + alpha * src1` and stores it in
 *          `dst`.
 *          The operation is defined as:
 *          \f[
 *               dst = src0 + alpha \times src1
 *          \f]
 *          if `inplace` is `true`, `src0` is equal to 'dst'.
 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor where the accumulated values will be stored.
 *            `inplace` is in dst->params, and dst->op is `GGML_OP_ACC`.
 */
 void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 /**
 * @brief   Computes the sum of elements along the last dimension of a ggml tensor
 *          using the CANN backend.
 *
 * @details This function performs a reduction sum operation along the last
 *          dimension of the input tensor `src`. The result of the sum is stored
 *          in the destination tensor `dst`.
 *
 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor where the reduced values will be stored.
 *            dst->op is `GGML_OP_SUM_ROWS`.
 *
 * @attention `reduce_dims` defaults to 3, which means the last dimension.
 */
 void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 /**
 * @brief   Upsamples a ggml tensor using nearest neighbor interpolation using
 *          the CANN backend.
 *
 * @details This function performs upsampling of the input tensor `src` using
 *          nearest neighbor interpolation. The upsampling is applied to the
 *          height and width dimensions (last two dimensions) of the tensor. The
 *          result is stored in the destination tensor `dst`, which must have
 *          the appropriate dimensions for the upsampled output.
 *
 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor where the upsampled values will be stored.
 *            dst->op is `GGML_OP_UPSCALE`.
 */
 void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
                                  ggml_tensor* dst);
 /**
 * @brief   Pads a ggml tensor to match the dimensions of the destination tensor
 *          using the CANN backend.
 *
 * @details This function pads the input tensor `src` so that it matches the
 *          dimensions of the destination tensor `dst`. The amount of padding
 *          is calculated based on the difference in sizes between `src` and
 *          `dst` along each dimension. The padded tensor is stored in `dst`.
 *
 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor, which specifies the target dimensions for
 *            padding. dst->op is `GGML_OP_PAD`.
 */
 void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 /**
 * @brief   Executes a 2D pooling operation on a ggml tensor using the CANN
 *          backend.
 *
 * @details This function dispatches the execution of a 2D pooling operation on
 *          the input tensor `dst`. The type of pooling (average or max) is
 *          determined by the `op` parameter, which is read from the operation
 *          parameters of `dst`. The function supports average pooling
 *          (`GGML_OP_POOL_AVG`) and max pooling (`GGML_OP_POOL_MAX`). If an
 *          invalid operation is encountered, the function asserts a failure.
 *
 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor on which the pooling operation is to be
 *            performed. dst->op is `GGML_OP_POOL_2D`.
 */
 void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 /**
 * @brief   Duplicates a ggml tensor using the CANN backend.
 *
 * @details This function duplicates the contents of the source tensor `src` to
 *          the destination tensor `dst`. The function supports various tensor
 *          types and configurations, including handling of extra data, type
 *          conversions, and special cases for contiguous and non-contiguous
 *          tensors.
 *
 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor where the duplicated data will be stored.
 *            dst->op is `GGML_OP_DUP`
 *
 * @attention Only FP16 and FP32 are supported. The case where src and dst
 *            have different shapes and dst is non-contiguous is not supported.
 * @note      This function needs to be simplified.
 */
 void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 /**
 * @brief   Computes the Root Mean Square (RMS) normalization of a ggml tensor
 *          using the CANN backend.
 *
 * @details This function applies RMS normalization to the input tensor `src`
 *          and stores the result in the destination tensor `dst`. RMS
 *          normalization involves computing the root mean square of the input
 *          tensor along a specified dimension and then dividing each element of
 *          the tensor by this value, adjusted by a small epsilon value to
 *          prevent division by zero.
 *          The operation is defined as:
 *          \f[
 *               \text{RmsNorm}\left(x_i\right)=\frac{x_i}{\text{Rms}(\mathbf{x})} g_i,
 *               \quad \text { where } \text{Rms}(\mathbf{x})=\sqrt{\frac{1}{n} \sum_{i=1}^n x_i^2+e p s}
 *          \f]
 *          `eps` is in dst->op_params.
 * @param ctx The CANN context used for operations.
 * @param dst The destination tensor where the normalized values will be stored.
 *            dst->op is `GGML_OP_RMS_NORM`.
 */
 void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 /**
 * @brief   Applies a diagonal mask to the tensor with a specified value.
 *
 * @details This function creates a mask tensor filled with ones, then applies
 *          an upper triangular and lower triangular operation to it based on
 *          the number of past elements specified. Afterward, it adds the masked
 *          tensor to the destination tensor in-place.
 *
 * @param ctx The backend CANN context used for operations.
 * @param dst The destination tensor where the result will be stored. dst->op is
 *            `GGML_OP_DIAG_MASK`
 * @param value The value to use for masking.
 */
 void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float value);
 /**
 * @brief   Performs an image-to-column transformation on the input tensor.
 *
 * @details This function takes an input tensor and applies an image-to-column
 *          operation, converting spatial dimensions into column-like
 *          structures suitable for convolutional operations. It supports both
 *          half-precision (F16) and single-precision (F32) floating-point data
 *          types.
 *
 * @param ctx The backend CANN context for executing operations.
 * @param dst The destination tensor that stores the result of the operation.
 *            dst->op is `GGML_OP_IM2COL`.
 */
 void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 /**
 * @brief   Computes time step embeddings using sine and cosine functions.
 *
 * @details This function calculates time step embeddings by applying sine and
 *          cosine transformations to a given input tensor, which is typically
 *          used in temporal models like diffusion models or transformers to
 *          encode time information effectively.
 *
 * @param ctx The backend CANN context for executing operations.
 * @param dst The destination tensor where the result of the embedding operation
 *            will be stored. dst->op is `GGML_OP_TIMESTEP_EMBEDDING`.
 */
 void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 // @see ggml_cann_dup.
 void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 /**
 * @brief   Computes the softmax activation with optional masking.
 *
 * @details This function computes the softmax activation over the input tensor,
 *          optionally applying a mask and scaling factor. It supports both FP16
 *          and FP32 data types and can handle masking by broadcasting the mask
 *          across rows if necessary.
 *          The function performs the following steps:
 *          1. Multiplies the input tensor by a scale factor.
 *          2. Optionally casts the mask tensor to FP32 if it is in FP16 format.
 *          3. Broadcasts the mask tensor if its dimensions do not match the
 *             input tensor's dimensions.
 *          4. Adds the mask to the scaled input tensor.
 *          5. Applies the softmax activation function along the specified
 *             dimension.
 *
 * @param ctx The backend CANN context for executing operations.
 * @param dst The destination tensor where the result will be stored. dst->op is
 *            `GGML_OP_SOFTMAX`.
 */
 void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 /**
 * @brief   Extracts specific rows from a tensor based on indices.
 *
 * @details This function retrieves rows from a source tensor src0 according to
 *          the indices provided in another tensor src1 and stores the result in
 *          a destination tensor (\p dst). It supports different data types
 *          including F32, F16, Q4_0, and Q8_0.
 *
 * @param ctx The backend CANN context for executing operations.
 * @param dst The destination tensor where the extracted rows will be stored.
 *            dst->op is `GGML_OP_GET_ROWS`.
 */
 void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 /**
 * @brief   Executes matrix multiplication for the given tensor.
 *
 * @details This function performs matrix multiplication on the source tensors
 *          associated with the destination tensor. It supports matrix
 *          multiplication F32, F16, and Q8_0.
 *
 * @param ctx The backend CANN context for executing operations.
 * @param dst The destination tensor for storing the result of the matrix
 *            multiplication. dst->op is `GGML_OP_MUL_MAT`.
 */
 void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 /**
 * @brief Applies Rotary Positional Embedding (RoPE) to the input tensor.
 *
 * @details This function implements the RoPE mechanism, which is a method to
 *          encode positional information into sequence data, particularly
 *          useful in transformer models. It supports both F32 and F16 data
 *          types.
 *
 * @param ctx The backend CANN context for executing operations.
 * @param dst The destination tensor where the RoPE-transformed data will be
 *            stored. dst->op is `GGML_OP_ROPE`.
 *
 * @note The function currently does not support cases where the n_dims is less
 *       than the input tensor's first dimension.
 * @note The function currently does not support cases where the freq_factors is
 *       not NULL.
 * @note The function currently does not support cases where the ext_factor is
 *       not equal 0.
 * @note The function currently does not support cases where the freq_scale is
 *       not equal 1.
 */
 void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 /**
 * @brief   Runs a two-input, one-output aclnn operator on the sources of
 *          `dst` and writes the result to `dst`.
 *
 * @details The operator is supplied as a pair of template arguments:
 *          `getWorkspaceSize` reports the workspace size and creates the
 *          executor, `execute` enqueues the operator on the context's default
 *          stream. When src0 and src1 differ in shape and
 *          ggml_cann_need_bcast() reports that broadcasting is needed, the
 *          ACL tensors are created with the broadcast shapes produced by
 *          BCAST_SHAPE; otherwise they are created from the ggml tensors
 *          directly.
 *
 * @tparam getWorkspaceSize First-phase aclnn function of the operator.
 * @tparam execute          Second-phase aclnn function of the operator.
 * @param  ctx The CANN context used for operations.
 * @param  dst The destination tensor; its operands are dst->src[0] and
 *             dst->src[1]. src1 must be repeatable to src0 and src0 must have
 *             the same shape as dst (asserted).
 */
 template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
                                       aclTensor*, uint64_t*, aclOpExecutor**),
          aclnnStatus execute(void*, uint64_t, aclOpExecutor*, aclrtStream)>
 void ggml_cann_mul_div(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src0 = dst->src[0];
    ggml_tensor* src1 = dst->src[1];
    GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
    aclTensor* acl_src0;
    aclTensor* acl_src1;
    aclTensor* acl_dst;
    // Need bcast
    if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) {
        BCAST_SHAPE(src0, src1)
        acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0));
        acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1));
        acl_dst = ggml_cann_create_tensor(dst, BCAST_PARAM(src0));
    } else {
        acl_src0 = ggml_cann_create_tensor(src0);
        acl_src1 = ggml_cann_create_tensor(src1);
        acl_dst = ggml_cann_create_tensor(dst);
    }
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor = nullptr;
    void* workspaceAddr = nullptr;
    // The allocator lives at function scope on purpose: it is an RAII object
    // whose destructor hands the block back to the pool, so declaring it
    // inside the `if` below would release the workspace before `execute` is
    // even called.
    // NOTE(review): execution is asynchronous, so the workspace may still be
    // in use after this function returns - confirm the pool tolerates that.
    ggml_cann_pool_alloc workspace_allocator;
    ACL_CHECK(getWorkspaceSize(acl_src0, acl_src1, acl_dst, &workspaceSize,
                               &executor));
    if (workspaceSize > 0) {
        workspaceAddr = workspace_allocator.alloc(ctx.pool(), workspaceSize);
    }
    aclrtStream main_stream = ctx.stream();
    ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
    ACL_CHECK(aclDestroyTensor(acl_src0));
    ACL_CHECK(aclDestroyTensor(acl_src1));
    ACL_CHECK(aclDestroyTensor(acl_dst));
 }
 /**
 * @brief   Activation functions template.
 *
 * @details Runs a one-input, one-output aclnn operator on dst->src[0] and
 *          writes the result to `dst`. `getWorkspaceSize` reports the
 *          workspace size and creates the executor; `execute` enqueues the
 *          operator on the context's default stream. This overload is chosen
 *          for operators whose first-phase function takes a non-const output
 *          tensor.
 *
 * @tparam getWorkspaceSize First-phase aclnn function of the operator.
 * @tparam execute          Second-phase aclnn function of the operator.
 * @param  ctx The CANN context used for operations.
 * @param  dst The destination tensor. Both dst and dst->src[0] must be
 *             GGML_TYPE_F32 (asserted).
 */
 template <aclnnStatus getWorkspaceSize(const aclTensor*, aclTensor*, uint64_t*,
                                       aclOpExecutor**),
          aclnnStatus execute(void*, uint64_t, aclOpExecutor*,
                              const aclrtStream)>
 void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src = dst->src[0];
    GGML_ASSERT(src->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    aclTensor* acl_src = ggml_cann_create_tensor(src);
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor = nullptr;
    void* workspaceAddr = nullptr;
    // The allocator lives at function scope on purpose: it is an RAII object
    // whose destructor hands the block back to the pool, so declaring it
    // inside the `if` below would release the workspace before `execute` is
    // even called.
    // NOTE(review): execution is asynchronous, so the workspace may still be
    // in use after this function returns - confirm the pool tolerates that.
    ggml_cann_pool_alloc workspace_allocator;
    ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
    if (workspaceSize > 0) {
        workspaceAddr = workspace_allocator.alloc(ctx.pool(), workspaceSize);
    }
    aclrtStream main_stream = ctx.stream();
    ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
    ACL_CHECK(aclDestroyTensor(acl_src));
    ACL_CHECK(aclDestroyTensor(acl_dst));
 }
 /**
 * @brief   Activation functions template for const aclTensors.
 *
 * @details Runs a one-input, one-output aclnn operator on dst->src[0] and
 *          writes the result to `dst`. `getWorkspaceSize` reports the
 *          workspace size and creates the executor; `execute` enqueues the
 *          operator on the context's default stream. This overload is chosen
 *          for operators whose first-phase function declares both tensors as
 *          `const aclTensor*`.
 *
 * @tparam getWorkspaceSize First-phase aclnn function of the operator.
 * @tparam execute          Second-phase aclnn function of the operator.
 * @param  ctx The CANN context used for operations.
 * @param  dst The destination tensor. Both dst and dst->src[0] must be
 *             GGML_TYPE_F32 (asserted).
 */
 template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
                                       uint64_t*, aclOpExecutor**),
          aclnnStatus execute(void*, uint64_t, aclOpExecutor*,
                              const aclrtStream)>
 void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src = dst->src[0];
    GGML_ASSERT(src->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F32);
    aclTensor* acl_src = ggml_cann_create_tensor(src);
    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor = nullptr;
    void* workspaceAddr = nullptr;
    // The allocator lives at function scope on purpose: it is an RAII object
    // whose destructor hands the block back to the pool, so declaring it
    // inside the `if` below would release the workspace before `execute` is
    // even called.
    // NOTE(review): execution is asynchronous, so the workspace may still be
    // in use after this function returns - confirm the pool tolerates that.
    ggml_cann_pool_alloc workspace_allocator;
    ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
    if (workspaceSize > 0) {
        workspaceAddr = workspace_allocator.alloc(ctx.pool(), workspaceSize);
    }
    aclrtStream main_stream = ctx.stream();
    ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream));
    ACL_CHECK(aclDestroyTensor(acl_src));
    ACL_CHECK(aclDestroyTensor(acl_dst));
 }
 #endif  // CANN_ACLNN_OPS
--- a/ggml/src/ggml-cann/common.h
+++ b/ggml/src/ggml-cann/common.h
@ -1,283 +0,0 @@
 /*
 * Copyright (c) 2023-2024 The ggml authors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
 #ifndef CANN_COMMON_H
 #define CANN_COMMON_H
 #include <acl/acl.h>
 #include <cstdio>
 #include <iostream>
 #include <map>
 #include <memory>
 #include <string>
 #include <vector>
 #include "../include/ggml-cann.h"
 #include "../include/ggml.h"
 #define MATRIX_ROW_PADDING 512
 #define GGML_CANN_MAX_STREAMS 8
 /**
 * @brief Handles CANN-related errors by printing an error message and
 *        terminating the program.
 * @param stmt The statement that caused the error.
 * @param func The function in which the error occurred.
 * @param file The file in which the error occurred.
 * @param line The line number at which the error occurred.
 * @param msg The error message.
 */
 [[noreturn]] void ggml_cann_error(const char* stmt, const char* func,
                                  const char* file, int line, const char* msg);
 /**
 * @brief Checks the result of a CANN function call and invokes the error
 *        handler if the call fails.
 * @param stmt The CANN function call to check. It is evaluated exactly once
 *             and its result is stored in an `int`.
 * @param success The success code that indicates the call was successful.
 * @param error_fn The function to call to retrieve the error message. It is
 *                 called with no arguments, and only on failure.
 *
 * @note On failure the stringified statement, the enclosing function name,
 *       the file and the line are forwarded to ggml_cann_error().
 * @note NOTE(review): the expansion already ends in `while (0);`, so a call
 *       written as `ACL_CHECK(x);` produces an extra empty statement. That is
 *       harmless in a plain statement list but breaks an unbraced
 *       `if (...) ACL_CHECK(x); else ...`; always brace such uses.
 */
 #define ACL_CHECK_GEN(stmt, success, error_fn)                                \
    do {                                                                      \
        int err_code = (stmt);                                                \
        if (err_code != (success)) {                                          \
            ggml_cann_error(#stmt, __func__, __FILE__, __LINE__, error_fn()); \
        }                                                                     \
    } while (0);
 // Convenience form: success code 0, message taken from aclGetRecentErrMsg.
 #define ACL_CHECK(stmt) ACL_CHECK_GEN(stmt, 0, aclGetRecentErrMsg)
 /**
 * @brief Contains information about CANN devices.
 *
 * @details Plain data holder: a device count plus one fixed-size record per
 *          possible device. The code that fills it is not part of this
 *          header; a const reference to an instance is returned by
 *          ggml_cann_info().
 */
 struct ggml_cann_device_info {
    /**
     * @brief Number of CANN devices available.
     */
    int32_t device_count;
    /**
     * @brief Information about a single CANN device.
     */
    struct cann_device_info {
        int cc;                 /**< Compute capability.                   */
        size_t smpb;            /**< Maximum shared memory per block.      */
        bool vmm;               /**< Virtual memory support.               */
        size_t vmm_granularity; /**< Granularity of virtual memory.        */
        size_t total_vram;      /**< Total video RAM available on the device. */
    };
    // Value-initialized, so every record starts out zeroed. Presumably
    // indexed by device ID - TODO confirm against the code that fills it.
    cann_device_info devices[GGML_CANN_MAX_DEVICES] =
        {}; /**< Array of CANN device information. */
 };
 const ggml_cann_device_info& ggml_cann_info();
 void ggml_cann_set_device(int32_t device);
 int32_t ggml_cann_get_device();
 /**
 * @brief Abstract base class for memory pools used by CANN.
 *
 * @details Pure interface: concrete pools implement alloc() and free().
 *          ggml_cann_pool_alloc is the RAII helper that pairs the two calls.
 */
 struct ggml_cann_pool {
    /**
     * @brief Virtual destructor for the memory pool.
     */
    virtual ~ggml_cann_pool() = default;
    /**
     * @brief Allocates memory from the pool.
     *
     * @param size         The size of the memory block to allocate.
     * @param actual_size  Pointer to a variable where the actual allocated size
     *                     will be stored.
     * @return             Pointer to the allocated memory block.
     */
    virtual void* alloc(size_t size, size_t* actual_size) = 0;
    /**
     * @brief Frees a previously allocated memory block.
     *
     * @param ptr   Pointer to the memory block to free.
     * @param size  Size of the memory block to free.
     * @note All CANN operators run asynchronously. Make sure the memory is
     *       still available until the operator that uses it has finished.
     */
    virtual void free(void* ptr, size_t size) = 0;
 };
 /**
 * @brief Scope-bound owner of at most one block taken from a ggml_cann_pool.
 *
 * @details The block is requested through alloc() (or directly by the
 *          two-argument constructor) and handed back to the pool by the
 *          destructor. The object can be neither copied nor moved, so a block
 *          always has exactly one owner.
 */
 struct ggml_cann_pool_alloc {
    ggml_cann_pool* pool = nullptr; /**< Pool the block is taken from. */
    void* ptr = nullptr;            /**< Block currently held, or nullptr. */
    size_t actual_size = 0;         /**< Size the pool reported for ptr. */
    // Ownership is unique: copy and move operations are all disabled.
    ggml_cann_pool_alloc(const ggml_cann_pool_alloc&) = delete;
    ggml_cann_pool_alloc(ggml_cann_pool_alloc&&) = delete;
    ggml_cann_pool_alloc& operator=(const ggml_cann_pool_alloc&) = delete;
    ggml_cann_pool_alloc& operator=(ggml_cann_pool_alloc&&) = delete;
    /**
     * @brief Creates an owner with no pool and no block.
     */
    ggml_cann_pool_alloc() = default;
    /**
     * @brief Creates an owner bound to a pool, without allocating.
     * @param pool Pool that later alloc(size) calls will use.
     */
    explicit ggml_cann_pool_alloc(ggml_cann_pool& pool) : pool(&pool) {}
    /**
     * @brief Creates an owner bound to a pool and allocates immediately.
     * @param pool Pool to allocate from.
     * @param size Number of bytes to request.
     */
    ggml_cann_pool_alloc(ggml_cann_pool& pool, size_t size)
        : ggml_cann_pool_alloc(pool) {
        alloc(size);
    }
    /**
     * @brief Returns the held block, if any, to its pool.
     */
    ~ggml_cann_pool_alloc() {
        if (ptr == nullptr) {
            return;
        }
        pool->free(ptr, actual_size);
    }
    /**
     * @brief Requests a block from the bound pool.
     * @param size Number of bytes to request.
     * @return Pointer to the block; also stored in `ptr`.
     * @note Asserts that a pool is bound and that no block is held yet.
     */
    void* alloc(size_t size) {
        GGML_ASSERT(pool != nullptr);
        GGML_ASSERT(ptr == nullptr);
        void* block = pool->alloc(size, &actual_size);
        ptr = block;
        return block;
    }
    /**
     * @brief Binds the owner to `pool`, then requests a block from it.
     * @param pool Pool to allocate from; replaces any previously bound pool.
     * @param size Number of bytes to request.
     * @return Pointer to the block.
     */
    void* alloc(ggml_cann_pool& pool, size_t size) {
        this->pool = &pool;
        return this->alloc(size);
    }
    /**
     * @brief Accessor for the held block.
     * @return Pointer to the block, or nullptr when nothing is held.
     */
    void* get() { return ptr; }
 };
 /**
 * @brief Context for managing CANN backend operations.
 *
 * @details Holds the device ID and name, an optional copy event, a fixed-size
 *          table of lazily created streams and a lazily created memory pool.
 *          The destructor destroys the event and every stream that was
 *          created.
 */
 struct ggml_backend_cann_context {
    int32_t device;                  /**< Device ID. */
    std::string name;                /**< Name of the device. */
    aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
    aclrtStream streams[GGML_CANN_MAX_STREAMS] = {
        {nullptr}}; /**< Array of streams for the device. */
    /**
     * @brief Constructor for initializing the context with a given device.
     * @param device Device ID. The name is set to "CANN" followed by the ID.
     */
    explicit ggml_backend_cann_context(int device)
        : device(device), name("CANN" + std::to_string(device)) {}
    /**
     * @brief Destructor for cleaning up resources.
     *
     * Selects the context's device first, then destroys the copy event and
     * every stream slot that is not nullptr.
     */
    ~ggml_backend_cann_context() {
        ggml_cann_set_device(device);
        if (copy_event != nullptr) {
            ACL_CHECK(aclrtDestroyEvent(copy_event));
        }
        for (int i = 0; i < GGML_CANN_MAX_STREAMS; ++i) {
            if (streams[i] != nullptr) {
                ACL_CHECK(aclrtDestroyStream(streams[i]));
            }
        }
    }
    /**
     * @brief Get or create a stream for a given index.
     * @param stream Index of the stream; must be in
     *               [0, GGML_CANN_MAX_STREAMS) (asserted).
     * @return The stream corresponding to the given index.
     */
    aclrtStream stream(int stream) {
        // Guard the fixed-size table: an out-of-range index would otherwise
        // read or write past the end of `streams`.
        GGML_ASSERT(stream >= 0 && stream < GGML_CANN_MAX_STREAMS);
        if (streams[stream] == nullptr) {
            // Streams are created on first use, on this context's device.
            ggml_cann_set_device(device);
            ACL_CHECK(aclrtCreateStream(&streams[stream]));
        }
        return streams[stream];
    }
    /**
     * @brief Get or create the default stream (index 0).
     * @return The default stream.
     */
    aclrtStream stream() { return stream(0); }
    // TODO: each stream should have a memory pool.
    std::unique_ptr<ggml_cann_pool>
        mem_pool; /**< Memory pool for the device. */
    /**
     * @brief Create a new memory pool for a given device.
     * @param device Device ID.
     * @return A unique pointer to the new memory pool.
     */
    static std::unique_ptr<ggml_cann_pool> new_pool_for_device(int device);
    /**
     * @brief Get or create the memory pool for the context.
     * @return Reference to the memory pool. The pool is created through
     *         new_pool_for_device() on the first call.
     */
    ggml_cann_pool& pool() {
        if (mem_pool == nullptr) {
            mem_pool = new_pool_for_device(device);
        }
        return *mem_pool;
    }
 };
 #endif  // CANN_COMMON_H
--- a/ggml/src/ggml-cann/kernels/CMakeLists.txt
+++ b/ggml/src/ggml-cann/kernels/CMakeLists.txt
@ -1,33 +0,0 @@
 # Build script for the AscendC kernel static library (ascendc_kernels).
 # Fall back to a default SoC when the caller did not provide SOC_TYPE.
 if (NOT SOC_TYPE)
    set (SOC_TYPE "Ascend910B3")
 endif()
 # Kernel sources, listed by name.
 # NOTE(review): file(GLOB) only returns names that exist on disk, so a missing
 # file would be dropped silently rather than reported - confirm this is intended.
 file(GLOB SRC_FILES
    get_row_f32.cpp
    get_row_f16.cpp
    get_row_q4_0.cpp
    get_row_q8_0.cpp
    quantize_f32_q8_0.cpp
    quantize_f16_q8_0.cpp
    quantize_float_to_q4_0.cpp
    dup.cpp
 )
 # SOC_VERSION, ASCEND_CANN_PACKAGE_PATH and RUN_MODE are not read again in this
 # file; presumably they are inputs to the included ascendc.cmake - TODO confirm.
 string(TOLOWER ${SOC_TYPE} SOC_VERSION)
 set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
 set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
 # Locate the AscendC CMake helpers: try the compiler package layout first,
 # then the devkit samples layout, and stop the configure step if neither exists.
 if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
 elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
 else()
    message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
 endif()
 include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
 # ascendc_library() is expected to come from the ascendc.cmake included above.
 ascendc_library(ascendc_kernels STATIC
    ${SRC_FILES}
 )
 # Uncomment to define ASCENDC_DUMP for the kernels.
 # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
--- a/ggml/src/ggml-cann/kernels/ascendc_kernels.h
+++ b/ggml/src/ggml-cann/kernels/ascendc_kernels.h
@ -1,19 +0,0 @@
 #ifndef ASCENDC_KERNELS_H
 #define ASCENDC_KERNELS_H
 #include "aclrtlaunch_ascendc_get_row_f32.h"
 #include "aclrtlaunch_ascendc_get_row_f16.h"
 #include "aclrtlaunch_ascendc_get_row_q8_0.h"
 #include "aclrtlaunch_ascendc_get_row_q4_0.h"
 #include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
 #include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
 #include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
 #include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"
 #include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
 #include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"
 #include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h"
 #include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h"
 #endif  // ASCENDC_KERNELS_H
--- a/ggml/src/ggml-cann/kernels/dup.cpp
+++ b/ggml/src/ggml-cann/kernels/dup.cpp
@ -1,223 +0,0 @@
 #include "kernel_operator.h"
 #include <cmath>
 using namespace AscendC;
 #define BUFFER_NUM 2
 /**
  * Row-wise duplicate kernel helper: every block handles exactly one row
  * (one run of ne[0] elements), optionally converting SRC_T to DST_T.
  * The source row must be contiguous along dimension 0; the destination is
  * written as a fully contiguous tensor.
  */
 template <typename SRC_T, typename DST_T>
 class DupByRows {
   public:
    __aicore__ inline DupByRows() {}
    /**
     * Locate this block's row in src/dst and allocate the staging buffers.
     *
     * @param src          Source tensor base address.
     * @param dst          Destination tensor base address.
     * @param input_ne_ub  Source extents, four entries.
     * @param input_nb_ub  Source strides, four entries, used as byte offsets.
     */
    __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub,
                                size_t *input_nb_ub) {
        int64_t op_block_num = GetBlockNum();
        const int64_t block_idx = GetBlockIdx();

        // Extents of the three outer dimensions and of one (ne1 x ne2) plane.
        const int64_t ne1 = input_ne_ub[1];
        const int64_t ne2 = input_ne_ub[2];
        const int64_t plane = ne1 * ne2;

        num_rows = plane * input_ne_ub[3];
        num_elem = input_ne_ub[0];

        // Split the flat block index into (idx_ne1, idx_ne2, idx_ne3).
        idx_ne3 = block_idx / plane;
        const int64_t in_plane = block_idx - idx_ne3 * plane;
        idx_ne2 = in_plane / ne1;
        idx_ne1 = in_plane - idx_ne2 * ne1;

        // The source may be non-contiguous in dims 1..3, so its row offset
        // is built from the per-dimension strides.
        src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2
                     + input_nb_ub[1] * idx_ne1;
        // The destination is contiguous: rows are packed back to back.
        dst_stride = block_idx * (input_ne_ub[0] * sizeof(DST_T));

        src_gm.SetGlobalBuffer(
            reinterpret_cast<__gm__ SRC_T *>(src + src_stride));
        dst_gm.SetGlobalBuffer(
            reinterpret_cast<__gm__ DST_T *>(dst + dst_stride));

        // Each staging buffer holds one row, rounded up to 32 bytes.
        const auto src_bytes = (sizeof(SRC_T) * num_elem + 32 - 1) / 32 * 32;
        const auto dst_bytes = (sizeof(DST_T) * num_elem + 32 - 1) / 32 * 32;
        pipe.InitBuffer(src_queue, BUFFER_NUM, src_bytes);
        pipe.InitBuffer(dst_queue, BUFFER_NUM, dst_bytes);
    }
    /** Load this block's source row into a local tensor and enqueue it. */
    __aicore__ inline void copy_in() {
        LocalTensor<SRC_T> row = src_queue.AllocTensor<SRC_T>();
        DataCopyExtParams copy_params;
        copy_params.blockCount = 1;
        copy_params.blockLen = num_elem * sizeof(SRC_T);
        DataCopyPadExtParams<SRC_T> pad_params;
        DataCopyPad(row, src_gm, copy_params, pad_params);
        src_queue.EnQue(row);
    }
    /** Dequeue the finished row and store it to the destination. */
    __aicore__ inline void copy_out() {
        LocalTensor<DST_T> row = dst_queue.DeQue<DST_T>();
        DataCopyExtParams copy_params;
        copy_params.blockCount = 1;
        copy_params.blockLen = num_elem * sizeof(DST_T);
        DataCopyPad(dst_gm, row, copy_params);
        dst_queue.FreeTensor(row);
    }
    /** Copy one row from src to dst without changing the element type. */
    __aicore__ inline void dup() {
        copy_in();
        LocalTensor<SRC_T> src_row = src_queue.DeQue<SRC_T>();
        LocalTensor<DST_T> dst_row = dst_queue.AllocTensor<DST_T>();

        // Local-to-local copy of the row, with the element count rounded up
        // to a whole number of 32-byte groups.
        int32_t elems_per_group = 32 / sizeof(DST_T);
        const int64_t padded_elems =
            (num_elem + elems_per_group - 1) / elems_per_group
            * elems_per_group;
        DataCopy(dst_row, src_row, padded_elems);

        dst_queue.EnQue<DST_T>(dst_row);
        src_queue.FreeTensor(src_row);
        copy_out();
    }
    /** Copy one row from src to dst, casting each element SRC_T -> DST_T. */
    __aicore__ inline void dup_with_cast() {
        copy_in();
        LocalTensor<SRC_T> src_row = src_queue.DeQue<SRC_T>();
        LocalTensor<DST_T> dst_row = dst_queue.AllocTensor<DST_T>();

        Cast(dst_row, src_row, RoundMode::CAST_NONE, num_elem);

        dst_queue.EnQue<DST_T>(dst_row);
        src_queue.FreeTensor(src_row);
        copy_out();
    }
   private:
    TPipe pipe;
    GlobalTensor<SRC_T> src_gm;
    GlobalTensor<DST_T> dst_gm;
    // Number of rows in the tensor and number of elements per row.
    int64_t num_rows;
    int64_t num_elem;
    // Coordinates of this block's row in dimensions 3, 2 and 1.
    int64_t idx_ne3;
    int64_t idx_ne2;
    int64_t idx_ne1;
    // Byte offsets of this block's row in the source and destination.
    int64_t src_stride;
    int64_t dst_stride;
    TQue<QuePosition::VECIN, BUFFER_NUM> src_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> dst_queue;
 };
 /**
  * Copy `size` bytes from a global-memory address into a local array, one
  * byte at a time.
  *
  * @param gm    Source address in global memory.
  * @param ub    Destination array; must have room for `size` bytes.
  * @param size  Number of bytes to copy.
  */
 template <typename T>
 __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
    auto gm_ptr = (__gm__ uint8_t *)gm;
    auto ub_ptr = (uint8_t *)(ub);
    // The counter has the same type as `size`, avoiding a signed/unsigned
    // comparison and a 32-bit counter wrap for large sizes.
    for (size_t i = 0; i < size; ++i) {
        ub_ptr[i] = gm_ptr[i];
    }
 }
 /**
  * Kernel entry: duplicate a half tensor row by row (no type conversion).
  *
  * @param src_gm        Source tensor data.
  * @param dst_gm        Destination tensor data.
  * @param input_ne_gm   32-byte descriptor with the four source extents.
  * @param input_nb_gm   32-byte descriptor with the four source strides.
  * @param output_ne_gm  Unused; kept so the launch signature is unchanged.
  * @param output_nb_gm  Unused; kept so the launch signature is unchanged.
  */
 extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16(
                                                        GM_ADDR src_gm,
                                                        GM_ADDR dst_gm,
                                                        GM_ADDR input_ne_gm,
                                                        GM_ADDR input_nb_gm,
                                                        GM_ADDR output_ne_gm,
                                                        GM_ADDR output_nb_gm) {
    // DupByRows::init only takes the input descriptors, so the output
    // descriptors are not fetched from global memory.
    (void)output_ne_gm;
    (void)output_nb_gm;
    int64_t input_ne_ub[4];
    size_t input_nb_ub[4];
    copy_to_ub(input_ne_gm, input_ne_ub, 32);
    copy_to_ub(input_nb_gm, input_nb_ub, 32);
    DupByRows<half, half> op;
    op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
    op.dup();
 }
 /**
  * Kernel entry: duplicate a float tensor row by row (no type conversion).
  *
  * @param src_gm        Source tensor data.
  * @param dst_gm        Destination tensor data.
  * @param input_ne_gm   32-byte descriptor with the four source extents.
  * @param input_nb_gm   32-byte descriptor with the four source strides.
  * @param output_ne_gm  Unused; kept so the launch signature is unchanged.
  * @param output_nb_gm  Unused; kept so the launch signature is unchanged.
  */
 extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32(
                                                        GM_ADDR src_gm,
                                                        GM_ADDR dst_gm,
                                                        GM_ADDR input_ne_gm,
                                                        GM_ADDR input_nb_gm,
                                                        GM_ADDR output_ne_gm,
                                                        GM_ADDR output_nb_gm) {
    // DupByRows::init only takes the input descriptors, so the output
    // descriptors are not fetched from global memory.
    (void)output_ne_gm;
    (void)output_nb_gm;
    int64_t input_ne_ub[4];
    size_t input_nb_ub[4];
    copy_to_ub(input_ne_gm, input_ne_ub, 32);
    copy_to_ub(input_nb_gm, input_nb_ub, 32);
    DupByRows<float_t, float_t> op;
    op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
    op.dup();
 }
 /**
  * Kernel entry: duplicate a float tensor row by row, casting to half.
  *
  * @param src_gm        Source tensor data (float).
  * @param dst_gm        Destination tensor data (half).
  * @param input_ne_gm   32-byte descriptor with the four source extents.
  * @param input_nb_gm   32-byte descriptor with the four source strides.
  * @param output_ne_gm  Unused; kept so the launch signature is unchanged.
  * @param output_nb_gm  Unused; kept so the launch signature is unchanged.
  */
 extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16(
                                                        GM_ADDR src_gm,
                                                        GM_ADDR dst_gm,
                                                        GM_ADDR input_ne_gm,
                                                        GM_ADDR input_nb_gm,
                                                        GM_ADDR output_ne_gm,
                                                        GM_ADDR output_nb_gm) {
    // DupByRows::init only takes the input descriptors, so the output
    // descriptors are not fetched from global memory.
    (void)output_ne_gm;
    (void)output_nb_gm;
    int64_t input_ne_ub[4];
    size_t input_nb_ub[4];
    copy_to_ub(input_ne_gm, input_ne_ub, 32);
    copy_to_ub(input_nb_gm, input_nb_ub, 32);
    DupByRows<float_t, half> op;
    op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
    op.dup_with_cast();
 }
 /**
  * Kernel entry: duplicate a half tensor row by row, casting to float.
  *
  * @param src_gm        Source tensor data (half).
  * @param dst_gm        Destination tensor data (float).
  * @param input_ne_gm   32-byte descriptor with the four source extents.
  * @param input_nb_gm   32-byte descriptor with the four source strides.
  * @param output_ne_gm  Unused; kept so the launch signature is unchanged.
  * @param output_nb_gm  Unused; kept so the launch signature is unchanged.
  */
 extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32(
                                                        GM_ADDR src_gm,
                                                        GM_ADDR dst_gm,
                                                        GM_ADDR input_ne_gm,
                                                        GM_ADDR input_nb_gm,
                                                        GM_ADDR output_ne_gm,
                                                        GM_ADDR output_nb_gm) {
    // DupByRows::init only takes the input descriptors, so the output
    // descriptors are not fetched from global memory.
    (void)output_ne_gm;
    (void)output_nb_gm;
    // copy params from gm to ub.
    int64_t input_ne_ub[4];
    size_t input_nb_ub[4];
    copy_to_ub(input_ne_gm, input_ne_ub, 32);
    copy_to_ub(input_nb_gm, input_nb_ub, 32);
    DupByRows<half, float_t> op;
    op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub);
    op.dup_with_cast();
 }
--- a/ggml/src/ggml-cann/kernels/get_row_f16.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_f16.cpp
@ -1,186 +0,0 @@
 #include "kernel_operator.h"
 // optimize me. Use template to avoid copy code.
 using namespace AscendC;
 #define BUFFER_NUM 2
 /**
  * Row gather for half input: for every entry of the indices tensor, read the
  * selected input row, cast it from half to float and write it to the output.
  * The rows to gather are split across blocks.
  */
 class GET_ROW_F16 {
   public:
    __aicore__ inline GET_ROW_F16() {}
    /**
     * Record the tensor descriptors, choose this block's range of rows and
     * allocate the local staging buffers.
     *
     * @param input          Input tensor data (half).
     * @param indices        Indices tensor data (int32).
     * @param output         Output tensor data (float).
     * @param input_ne_ub    Input extents, four entries.
     * @param input_nb_ub    Input strides, four entries.
     * @param indices_ne_ub  Indices extents, four entries.
     * @param indices_nb_ub  Indices strides, four entries.
     * @param output_ne_ub   Output extents, four entries.
     * @param output_nb_ub   Output strides, four entries.
     */
    __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
                                int64_t *input_ne_ub, size_t *input_nb_ub,
                                int64_t *indices_ne_ub, size_t *indices_nb_ub,
                                int64_t *output_ne_ub, size_t *output_nb_ub) {
        // TODO, use template for F16/f32
        int64_t op_block_num = GetBlockNum();
        int64_t op_block_idx = GetBlockIdx();
        for (int i = 0; i < 4; i++) {
            // Strides are stored relative to nb[0] (presumably the element
            // size, which would make them element counts - TODO confirm).
            input_ne[i] = input_ne_ub[i];
            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
            indices_ne[i] = indices_ne_ub[i];
            indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
            output_ne[i] = output_ne_ub[i];
            output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
        }
        // n_elements = total number of rows to gather.
        // dr = number of rows this block gathers, ir = its first row.
        uint64_t n_elements =
            indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
        dr = n_elements / op_block_num;
        // The first `tails` blocks take one extra row each.
        uint64_t tails = n_elements % op_block_num;
        if (op_block_idx < tails) {
            dr += 1;
            ir = dr * op_block_idx;
        } else {
            ir = dr * op_block_idx + tails;
        }
        input_gm.SetGlobalBuffer((__gm__ half *)input);
        indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
        output_gm.SetGlobalBuffer((__gm__ float *)output);
        // One input row, rounded up to a multiple of 32 bytes.
        uint64_t input_local_buffer_size = ((input_ne[0] * sizeof(half) + 31)
                                             & ~31);
        local_buffer_elems = input_local_buffer_size / sizeof(half);
        // calculate_row() casts local_buffer_elems elements into the output
        // buffer, so it must hold that many floats. Sizing it from
        // input_ne[0] alone is too small whenever input_ne[0] is not a
        // multiple of 16. The result is still a multiple of 32 bytes.
        uint64_t output_local_buffer_size = local_buffer_elems * sizeof(float);
        // TODO, consider long row that can't put in UB.
        pipe.InitBuffer(input_queue, BUFFER_NUM, input_local_buffer_size);
        pipe.InitBuffer(output_queue, BUFFER_NUM, output_local_buffer_size);
    }
    /**
     * Load `len` half elements starting at element `offset` of the input.
     * The offset is 64-bit so large element offsets are not truncated.
     */
    __aicore__ inline void copy_in(uint64_t offset, size_t len) {
        LocalTensor<half> input_local = input_queue.AllocTensor<half>();
        // Bulk-copy the part that is a multiple of 32 elements, then copy
        // the remaining `tail` elements with a padded copy.
        size_t tail = len % 32;
        len = len & ~31;
        DataCopy(input_local, input_gm[offset], len);
        if(tail != 0) {
            DataCopyExtParams dataCopyParams;
            dataCopyParams.blockCount = 1;
            dataCopyParams.blockLen = tail * sizeof(half);
            DataCopyPadExtParams<half> padParams;
            DataCopyPad(input_local[len], input_gm[offset + len],
                        dataCopyParams, padParams);
        }
        input_queue.EnQue(input_local);
    }
    /**
     * Store `len` float elements to the output starting at element `offset`.
     * The offset is 64-bit so large element offsets are not truncated.
     */
    __aicore__ inline void copy_out(uint64_t offset, size_t len) {
        LocalTensor<float> output_local = output_queue.DeQue<float>();
        // Same split as copy_in: bulk part first, then the tail.
        size_t tail = len % 32;
        len = len & ~31;
        DataCopy(output_gm[offset], output_local, len);
        if(tail != 0) {
            DataCopyExtParams dataCopyParams;
            dataCopyParams.blockCount = 1;
            dataCopyParams.blockLen = tail * sizeof(float);
            DataCopyPad(output_gm[offset + len], output_local[len],
                        dataCopyParams);
        }
        output_queue.FreeTensor(output_local);
    }
    /**
     * Gather one row.
     * @param idx Flat position in the indices tensor of the row to gather.
     */
    __aicore__ inline void calculate_row(int64_t idx) {
        // Split idx into coordinates over indices dims 0, 1 and 2.
        const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
        const int64_t indices_ne1_idx =
            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
            indices_ne[0];
        const int64_t indices_ne0_idx =
            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
             indices_ne1_idx * indices_ne[0]);
        const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
                                       indices_ne1_idx * indices_stride[1] +
                                       indices_ne2_idx * indices_stride[2];
        const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
        // The index picks the row along input dim 1; the two outer indices
        // coordinates address input dims 2 and 3.
        const int64_t input_offset = selected_row_idx * input_stride[1] +
                                     indices_ne1_idx * input_stride[2] +
                                     indices_ne2_idx * input_stride[3];
        const int64_t output_offset = indices_ne0_idx * output_stride[1] +
                                      indices_ne1_idx * output_stride[2] +
                                      indices_ne2_idx * output_stride[3];
        copy_in(input_offset, input_ne[0]);
        LocalTensor<half> input_local = input_queue.DeQue<half>();
        LocalTensor<float> output_local = output_queue.AllocTensor<float>();
        Cast(output_local, input_local, RoundMode::CAST_NONE,
             local_buffer_elems);
        output_queue.EnQue(output_local);
        copy_out(output_offset, input_ne[0]);
        input_queue.FreeTensor(input_local);
    }
    /** Gather every row assigned to this block: [ir, ir + dr). */
    __aicore__ inline void calculate() {
        for (int64_t i = ir; i < ir + dr; i++) {
            calculate_row(i);
        }
    }
   private:
    int64_t input_ne[4];
    size_t input_stride[4];
    int64_t indices_ne[4];
    size_t indices_stride[4];
    int64_t output_ne[4];
    size_t output_stride[4];
    // Number of half elements that fit in one input staging buffer.
    size_t local_buffer_elems;
    // First row handled by this block and number of rows it handles.
    int64_t ir;
    int64_t dr;
    TPipe pipe;
    GlobalTensor<half> input_gm;
    GlobalTensor<int32_t> indices_gm;
    GlobalTensor<float> output_gm;
    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
 };
 /**
  * Copy `size` bytes from a global-memory address into a local array, one
  * byte at a time.
  *
  * @param gm    Source address in global memory.
  * @param ub    Destination array; must have room for `size` bytes.
  * @param size  Number of bytes to copy.
  */
 template <typename T>
 __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
    auto gm_ptr = (__gm__ uint8_t *)gm;
    auto ub_ptr = (uint8_t *)(ub);
    // The counter has the same type as `size`, avoiding a signed/unsigned
    // comparison and a 32-bit counter wrap for large sizes.
    for (size_t i = 0; i < size; ++i) {
        ub_ptr[i] = gm_ptr[i];
    }
 }
 /**
  * Kernel entry: gather rows from a half tensor into a float tensor.
  * The six *_ne_gm / *_nb_gm arguments each point to a 32-byte descriptor
  * that is copied into a local four-entry array before use.
  */
 extern "C" __global__ __aicore__ void ascendc_get_row_f16(
    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
    GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
    GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
    // Size in bytes of one descriptor (four entries; assumes 8-byte size_t).
    constexpr size_t kDescBytes = 32;

    int64_t src_ne[4];
    size_t src_nb[4];
    copy_to_ub(input_ne_gm, src_ne, kDescBytes);
    copy_to_ub(input_nb_gm, src_nb, kDescBytes);

    int64_t idx_ne[4];
    size_t idx_nb[4];
    copy_to_ub(indices_ne_gm, idx_ne, kDescBytes);
    copy_to_ub(indices_nb_gm, idx_nb, kDescBytes);

    int64_t dst_ne[4];
    size_t dst_nb[4];
    copy_to_ub(output_ne_gm, dst_ne, kDescBytes);
    copy_to_ub(output_nb_gm, dst_nb, kDescBytes);

    GET_ROW_F16 gather;
    gather.init(input_gm, indices_gm, output_gm, src_ne, src_nb, idx_ne,
                idx_nb, dst_ne, dst_nb);
    gather.calculate();
 }
--- a/ggml/src/ggml-cann/kernels/get_row_f32.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_f32.cpp
@ -1,180 +0,0 @@
 #include "kernel_operator.h"
 // optimize me. Use template to avoid copy code.
 using namespace AscendC;
 #define BUFFER_NUM 2
 /**
  * Row gather for float input: for every entry of the indices tensor, copy
  * the selected input row to the output. The rows to gather are split across
  * blocks.
  */
 class GET_ROW_F32 {
   public:
    __aicore__ inline GET_ROW_F32() {}
    /**
     * Record the tensor descriptors, choose this block's range of rows and
     * allocate the local staging buffers.
     *
     * @param input          Input tensor data (float).
     * @param indices        Indices tensor data (int32).
     * @param output         Output tensor data (float).
     * @param input_ne_ub    Input extents, four entries.
     * @param input_nb_ub    Input strides, four entries.
     * @param indices_ne_ub  Indices extents, four entries.
     * @param indices_nb_ub  Indices strides, four entries.
     * @param output_ne_ub   Output extents, four entries.
     * @param output_nb_ub   Output strides, four entries.
     */
    __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
                                int64_t *input_ne_ub, size_t *input_nb_ub,
                                int64_t *indices_ne_ub, size_t *indices_nb_ub,
                                int64_t *output_ne_ub, size_t *output_nb_ub) {
        int64_t op_block_num = GetBlockNum();
        int64_t op_block_idx = GetBlockIdx();
        for (int i = 0; i < 4; i++) {
            // Strides are stored relative to nb[0] (presumably the element
            // size, which would make them element counts - TODO confirm).
            input_ne[i] = input_ne_ub[i];
            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
            indices_ne[i] = indices_ne_ub[i];
            indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
            output_ne[i] = output_ne_ub[i];
            output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
        }
        // n_elements = total number of rows to gather.
        // dr = number of rows this block gathers, ir = its first row.
        uint64_t n_elements =
            indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
        dr = n_elements / op_block_num;
        // The first `tails` blocks take one extra row each.
        uint64_t tails = n_elements % op_block_num;
        if (op_block_idx < tails) {
            dr += 1;
            ir = dr * op_block_idx;
        } else {
            ir = dr * op_block_idx + tails;
        }
        input_gm.SetGlobalBuffer((__gm__ float *)input);
        indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
        output_gm.SetGlobalBuffer((__gm__ float *)output);
        // One row, rounded up to a multiple of 32 bytes; used for both queues.
        uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31);
        local_buffer_elems = local_buffer_size / sizeof(float);
        // TODO, consider long row that can't put in UB.
        pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size);
        pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size);
    }
    /**
     * Load `len` float elements starting at element `offset` of the input.
     * The offset is 64-bit so large element offsets are not truncated.
     */
    __aicore__ inline void copy_in(uint64_t offset, size_t len) {
        LocalTensor<float> input_local = input_queue.AllocTensor<float>();
        // Bulk-copy the part that is a multiple of 32 elements, then copy
        // the remaining `tail` elements with a padded copy.
        size_t tail = len % 32;
        len = len & ~31;
        DataCopy(input_local, input_gm[offset], len);
        if(tail != 0) {
            DataCopyExtParams dataCopyParams;
            dataCopyParams.blockCount = 1;
            dataCopyParams.blockLen = tail * sizeof(float);
            DataCopyPadExtParams<float> padParams;
            DataCopyPad(input_local[len], input_gm[offset + len],
                        dataCopyParams, padParams);
        }
        input_queue.EnQue(input_local);
    }
    /**
     * Store `len` float elements to the output starting at element `offset`.
     * The offset is 64-bit so large element offsets are not truncated.
     */
    __aicore__ inline void copy_out(uint64_t offset, size_t len) {
        LocalTensor<float> output_local = output_queue.DeQue<float>();
        // Same split as copy_in: bulk part first, then the tail.
        size_t tail = len % 32;
        len = len & ~31;
        DataCopy(output_gm[offset], output_local, len);
        if(tail != 0) {
            DataCopyExtParams dataCopyParams;
            dataCopyParams.blockCount = 1;
            dataCopyParams.blockLen = tail * sizeof(float);
            DataCopyPad(output_gm[offset + len], output_local[len],
                        dataCopyParams);
        }
        output_queue.FreeTensor(output_local);
    }
    /**
     * Gather one row.
     * @param idx Flat position in the indices tensor of the row to gather.
     */
    __aicore__ inline void calculate_row(int64_t idx) {
        // Split idx into coordinates over indices dims 0, 1 and 2.
        const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
        const int64_t indices_ne1_idx =
            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
            indices_ne[0];
        const int64_t indices_ne0_idx =
            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
             indices_ne1_idx * indices_ne[0]);
        const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
                                       indices_ne1_idx * indices_stride[1] +
                                       indices_ne2_idx * indices_stride[2];
        const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
        // The index picks the row along input dim 1; the two outer indices
        // coordinates address input dims 2 and 3.
        const int64_t input_offset = selected_row_idx * input_stride[1] +
                                     indices_ne1_idx * input_stride[2] +
                                     indices_ne2_idx * input_stride[3];
        const int64_t output_offset = indices_ne0_idx * output_stride[1] +
                                      indices_ne1_idx * output_stride[2] +
                                      indices_ne2_idx * output_stride[3];
        copy_in(input_offset, input_ne[0]);
        LocalTensor<float> input_local = input_queue.DeQue<float>();
        LocalTensor<float> output_local = output_queue.AllocTensor<float>();
        // Move the row from the input staging buffer to the output one.
        DataCopy(output_local, input_local, local_buffer_elems);
        output_queue.EnQue(output_local);
        copy_out(output_offset, input_ne[0]);
        input_queue.FreeTensor(input_local);
    }
    /** Gather every row assigned to this block: [ir, ir + dr). */
    __aicore__ inline void calculate() {
        for (int64_t i = ir; i < ir + dr; i++) {
            calculate_row(i);
        }
    }
   private:
    int64_t input_ne[4];
    size_t input_stride[4];
    int64_t indices_ne[4];
    size_t indices_stride[4];
    int64_t output_ne[4];
    size_t output_stride[4];
    // Number of float elements that fit in one staging buffer.
    size_t local_buffer_elems;
    // First row handled by this block and number of rows it handles.
    int64_t ir;
    int64_t dr;
    TPipe pipe;
    GlobalTensor<float> input_gm;
    GlobalTensor<int32_t> indices_gm;
    GlobalTensor<float> output_gm;
    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
 };
 /**
  * Copy `size` bytes from a global-memory address into a local array, one
  * byte at a time.
  *
  * @param gm    Source address in global memory.
  * @param ub    Destination array; must have room for `size` bytes.
  * @param size  Number of bytes to copy.
  */
 template <typename T>
 __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
    auto gm_ptr = (__gm__ uint8_t *)gm;
    auto ub_ptr = (uint8_t *)(ub);
    // The counter has the same type as `size`, avoiding a signed/unsigned
    // comparison and a 32-bit counter wrap for large sizes.
    for (size_t i = 0; i < size; ++i) {
        ub_ptr[i] = gm_ptr[i];
    }
 }
 /**
  * Kernel entry: gather rows from a float tensor into a float tensor.
  * The six *_ne_gm / *_nb_gm arguments each point to a 32-byte descriptor
  * that is copied into a local four-entry array before use.
  */
 extern "C" __global__ __aicore__ void ascendc_get_row_f32(
    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
    GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm,
    GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
    // Size in bytes of one descriptor (four entries; assumes 8-byte size_t).
    constexpr size_t kDescBytes = 32;

    int64_t src_ne[4];
    size_t src_nb[4];
    copy_to_ub(input_ne_gm, src_ne, kDescBytes);
    copy_to_ub(input_nb_gm, src_nb, kDescBytes);

    int64_t idx_ne[4];
    size_t idx_nb[4];
    copy_to_ub(indices_ne_gm, idx_ne, kDescBytes);
    copy_to_ub(indices_nb_gm, idx_nb, kDescBytes);

    int64_t dst_ne[4];
    size_t dst_nb[4];
    copy_to_ub(output_ne_gm, dst_ne, kDescBytes);
    copy_to_ub(output_nb_gm, dst_nb, kDescBytes);

    GET_ROW_F32 gather;
    gather.init(input_gm, indices_gm, output_gm, src_ne, src_nb, idx_ne,
                idx_nb, dst_ne, dst_nb);
    gather.calculate();
 }
--- a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
@ -1,193 +0,0 @@
 #include "kernel_operator.h"
 // optimize me. Use template to avoid copy code.
 using namespace AscendC;
 #define BUFFER_NUM 2
 #define QK4_0 32
 /**
  * Row gather for Q4_0 input: for every entry of the indices tensor, read the
  * selected row group by group (QK4_0 values per group), convert the 4-bit
  * values to float, multiply by the group's half scale and write the result
  * to the output. The rows to gather are split across blocks.
  */
 class GET_ROW_Q4_0 {
   public:
    __aicore__ inline GET_ROW_Q4_0() {}
    /**
     * Record the tensor descriptors, choose this block's range of rows and
     * allocate the local staging buffers.
     *
     * @param input          Input data: the 4-bit values, followed by the
     *                       half scales at byte offset (element count / 2).
     * @param indices        Indices tensor data (int32).
     * @param output         Output tensor data (float).
     * @param input_ne_ub    Input extents, four entries.
     * @param indices_ne_ub  Indices extents, four entries.
     * @param indices_nb_ub  Indices strides, four entries.
     * @param output_ne_ub   Output extents, four entries.
     * @param output_nb_ub   Output strides, four entries.
     */
    __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
                                int64_t *input_ne_ub, int64_t *indices_ne_ub,
                                size_t *indices_nb_ub, int64_t *output_ne_ub,
                                size_t *output_nb_ub) {
        int64_t op_block_num = GetBlockNum();
        int64_t op_block_idx = GetBlockIdx();
        for (int i = 0; i < 4; i++) {
            input_ne[i] = input_ne_ub[i];
            indices_ne[i] = indices_ne_ub[i];
            indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
            scale_ne[i] = input_ne_ub[i];
            output_ne[i] = output_ne_ub[i];
            output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
        }
        // one scale for a group.
        scale_ne[0] /= QK4_0;
        // Input and scale strides are derived from the extents, i.e. both
        // are treated as densely packed.
        input_stride[0] = 1;
        scale_stride[0] = 1;
        output_stride[0] = 1;
        for (int i = 1; i < 4; i++) {
            input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
        }
        group_size_in_row = input_ne[0] / QK4_0;
        // Byte offset of the scales: two 4-bit values are stored per byte.
        int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
                               input_ne[3] / 2;
        // n_elements = total number of rows to gather.
        // dr = number of rows this block gathers, ir = its first row.
        uint64_t n_elements =
            indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
        dr = n_elements / op_block_num;
        // The first `tails` blocks take one extra row each.
        uint64_t tails = n_elements % op_block_num;
        if (op_block_idx < tails) {
            dr += 1;
            ir = dr * op_block_idx;
        } else {
            ir = dr * op_block_idx + tails;
        }
        input_gm.SetGlobalBuffer((__gm__ int4b_t *)input);
        scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
        indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
        output_gm.SetGlobalBuffer((__gm__ float *)output);
        // One group per staging buffer at each conversion stage.
        pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t));
        pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half));
        pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float));
    }
    /**
     * Load one group of QK4_0 values starting at element `offset`.
     * The offset is 64-bit so large element offsets are not truncated.
     */
    __aicore__ inline void copy_in(uint64_t offset) {
        LocalTensor<int4b_t> input_local = input_queue.AllocTensor<int4b_t>();
        // 32 * sizeof(int4b_t) = 16, which is not aligned to 32, why no error?
        DataCopy(input_local, input_gm[offset], QK4_0);
        input_queue.EnQue(input_local);
    }
    /**
     * Store one group of QK4_0 floats starting at element `offset`.
     * The offset is 64-bit so large element offsets are not truncated.
     */
    __aicore__ inline void copy_out(uint64_t offset) {
        LocalTensor<float> output_local = output_queue.DeQue<float>();
        DataCopy(output_gm[offset], output_local, QK4_0);
        output_queue.FreeTensor(output_local);
    }
    /**
     * Dequantize one group of one gathered row.
     * @param idx    Flat position in the indices tensor of the row.
     * @param group  Group number within the row.
     */
    __aicore__ inline void calculate_group(int64_t idx, int64_t group) {
        // Split idx into coordinates over indices dims 0, 1 and 2.
        const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
        const int64_t indices_ne1_idx =
            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
            indices_ne[0];
        const int64_t indices_ne0_idx =
            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
             indices_ne1_idx * indices_ne[0]);
        const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
                                       indices_ne1_idx * indices_stride[1] +
                                       indices_ne2_idx * indices_stride[2];
        const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
        const int64_t input_offset = selected_row_idx * input_stride[1] +
                                     indices_ne1_idx * input_stride[2] +
                                     indices_ne2_idx * input_stride[3] +
                                     group * QK4_0;
        const int64_t scale_offset = selected_row_idx * scale_stride[1] +
                                     indices_ne1_idx * scale_stride[2] +
                                     indices_ne2_idx * scale_stride[3] + group;
        const int64_t output_offset = indices_ne0_idx * output_stride[1] +
                                      indices_ne1_idx * output_stride[2] +
                                      indices_ne2_idx * output_stride[3] +
                                      group * QK4_0;
        copy_in(input_offset);
        LocalTensor<int4b_t> input_local = input_queue.DeQue<int4b_t>();
        LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
        LocalTensor<float> output_local = output_queue.AllocTensor<float>();
        // TODO: cast more data to speed up.
        // Convert int4 -> half -> float in two steps.
        Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
        Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
        // Only the multiplication has to be done per group, because each
        // group has its own scale.
        half scale = scale_gm.GetValue(scale_offset);
        Muls(output_local, output_local, (float)scale, QK4_0);
        input_queue.FreeTensor(input_local);
        cast_queue.FreeTensor(cast_local);
        output_queue.EnQue(output_local);
        copy_out(output_offset);
    }
    /** Process every group of every row assigned to this block. */
    __aicore__ inline void calculate() {
        for (int64_t i = ir; i < ir + dr; i++) {
            for (int64_t j = 0; j < group_size_in_row; j++) {
                calculate_group(i, j);
            }
        }
    }
   private:
    int64_t input_ne[4];
    size_t input_stride[4];
    int64_t scale_ne[4];
    size_t scale_stride[4];
    int64_t indices_ne[4];
    size_t indices_stride[4];
    int64_t output_ne[4];
    size_t output_stride[4];
    // First row handled by this block and number of rows it handles.
    int64_t ir;
    int64_t dr;
    // Number of QK4_0-sized groups in one row.
    int64_t group_size_in_row;
    TPipe pipe;
    GlobalTensor<int4b_t> input_gm;
    GlobalTensor<half> scale_gm;
    GlobalTensor<int32_t> indices_gm;
    GlobalTensor<float> output_gm;
    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
    TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
 };
 /**
  * Copy `size` bytes from a global-memory address into a local array, one
  * byte at a time.
  *
  * @param gm    Source address in global memory.
  * @param ub    Destination array; must have room for `size` bytes.
  * @param size  Number of bytes to copy.
  */
 template <typename T>
 __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
    auto gm_ptr = (__gm__ uint8_t *)gm;
    auto ub_ptr = (uint8_t *)(ub);
    // The counter has the same type as `size`, avoiding a signed/unsigned
    // comparison and a 32-bit counter wrap for large sizes.
    for (size_t i = 0; i < size; ++i) {
        ub_ptr[i] = gm_ptr[i];
    }
 }
 /**
  * Kernel entry: gather rows from a Q4_0 tensor into a float tensor.
  * The five *_ne_gm / *_nb_gm arguments each point to a 32-byte descriptor
  * that is copied into a local four-entry array before use. No input stride
  * descriptor is passed to this kernel.
  */
 extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
    GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
    GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
    // Size in bytes of one descriptor (four entries; assumes 8-byte size_t).
    constexpr size_t kDescBytes = 32;

    int64_t src_ne[4];
    copy_to_ub(input_ne_gm, src_ne, kDescBytes);

    int64_t idx_ne[4];
    size_t idx_nb[4];
    copy_to_ub(indices_ne_gm, idx_ne, kDescBytes);
    copy_to_ub(indices_nb_gm, idx_nb, kDescBytes);

    int64_t dst_ne[4];
    size_t dst_nb[4];
    copy_to_ub(output_ne_gm, dst_ne, kDescBytes);
    copy_to_ub(output_nb_gm, dst_nb, kDescBytes);

    GET_ROW_Q4_0 gather;
    gather.init(input_gm, indices_gm, output_gm, src_ne, idx_ne, idx_nb,
                dst_ne, dst_nb);
    gather.calculate();
 }
--- a/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp
+++ b/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp
@ -1,191 +0,0 @@
 #include "kernel_operator.h"
 // TODO: optimize - use a template to avoid duplicating this code across kernels.
 using namespace AscendC;
 #define BUFFER_NUM 2
 #define QK8_0 32
 // Row-gather ("get rows") kernel for Q8_0 quantized input.
 // For each entry of the int32 indices tensor it reads the selected input
 // row, converts every group of QK8_0 int8 quants to float, multiplies the
 // group by its half scale and stores the result in the float output.
 // The scales are read from the same buffer as the quants, starting right
 // after the last int8 value. The index entries are split across blocks.
 class GET_ROW_Q8_0 {
   public:
    __aicore__ inline GET_ROW_Q8_0() {}
    // Record shapes and strides, pick the index range handled by this block
    // and set up the global tensors and queue buffers.
    //   input         - quantized input buffer (int8 quants, then scales).
    //   indices       - int32 row indices.
    //   output        - float output buffer.
    //   input_ne_ub   - 4 extents of the input tensor.
    //   indices_ne_ub - 4 extents of the indices tensor.
    //   indices_nb_ub - 4 byte strides of the indices tensor.
    //   output_ne_ub  - 4 extents of the output tensor.
    //   output_nb_ub  - 4 byte strides of the output tensor.
    __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output,
                                int64_t *input_ne_ub, int64_t *indices_ne_ub,
                                size_t *indices_nb_ub, int64_t *output_ne_ub,
                                size_t *output_nb_ub) {
        int64_t op_block_num = GetBlockNum();
        int64_t op_block_idx = GetBlockIdx();
        for (int i = 0; i < 4; i++) {
            input_ne[i] = input_ne_ub[i];
            indices_ne[i] = indices_ne_ub[i];
            // Byte strides are converted to element strides.
            indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0];
            scale_ne[i] = input_ne_ub[i];
            output_ne[i] = output_ne_ub[i];
            output_stride[i] = output_nb_ub[i] / output_nb_ub[0];
        }
        // One scale per group of QK8_0 quants.
        scale_ne[0] /= QK8_0;
        input_stride[0] = 1;
        scale_stride[0] = 1;
        output_stride[0] = 1;
        // Input and scale strides are derived from the extents (dense layout).
        for (int i = 1; i < 4; i++) {
            input_stride[i] = input_stride[i - 1] * input_ne[i - 1];
            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
        }
        group_size_in_row = input_ne[0] / QK8_0;
        // Byte offset of the first scale: just past all int8 quants.
        int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] *
                               input_ne[3] * sizeof(int8_t);
        // n_elements is the total number of index entries (rows to gather).
        // dr is how many of them this block handles, ir is the first one.
        uint64_t n_elements =
            indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3];
        dr = n_elements / op_block_num;
        uint64_t tails = n_elements % op_block_num;
        if (op_block_idx < tails) {
            // The first `tails` blocks take one extra entry each.
            dr += 1;
            ir = dr * op_block_idx;
        } else {
            ir = dr * op_block_idx + tails;
        }
        input_gm.SetGlobalBuffer((__gm__ int8_t *)input);
        scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset));
        indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices);
        output_gm.SetGlobalBuffer((__gm__ float *)output);
        pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
        pipe.InitBuffer(cast_queue, BUFFER_NUM, QK8_0 * sizeof(half));
        pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(float));
    }
    // Load one group (QK8_0 int8 values) starting at element `offset` of the
    // input and enqueue it on input_queue.
    // The offset is 64-bit so that the int64_t offsets computed by
    // calculate_group are not truncated for large tensors.
    __aicore__ inline void copy_in(uint64_t offset) {
        LocalTensor<int8_t> input_local = input_queue.AllocTensor<int8_t>();
        DataCopy(input_local, input_gm[offset], QK8_0);
        input_queue.EnQue(input_local);
    }
    // Dequeue one group of QK8_0 floats from output_queue and store it at
    // element `offset` of the output (64-bit for the same reason as copy_in).
    __aicore__ inline void copy_out(uint64_t offset) {
        LocalTensor<float> output_local = output_queue.DeQue<float>();
        DataCopy(output_gm[offset], output_local, QK8_0);
        output_queue.FreeTensor(output_local);
    }
    // Dequantize one group.
    //   idx   - flat position in the indices tensor (over its first 3 dims).
    //   group - group number inside the selected row.
    __aicore__ inline void calculate_group(int64_t idx, int64_t group) {
        // Split the flat index into coordinates of the indices tensor.
        const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]);
        const int64_t indices_ne1_idx =
            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) /
            indices_ne[0];
        const int64_t indices_ne0_idx =
            (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] -
             indices_ne1_idx * indices_ne[0]);
        const int64_t indices_offset = indices_ne0_idx * indices_stride[0] +
                                       indices_ne1_idx * indices_stride[1] +
                                       indices_ne2_idx * indices_stride[2];
        const int32_t selected_row_idx = indices_gm.GetValue(indices_offset);
        const int64_t input_offset = selected_row_idx * input_stride[1] +
                                     indices_ne1_idx * input_stride[2] +
                                     indices_ne2_idx * input_stride[3] +
                                     group * QK8_0;
        const int64_t scale_offset = selected_row_idx * scale_stride[1] +
                                     indices_ne1_idx * scale_stride[2] +
                                     indices_ne2_idx * scale_stride[3] + group;
        const int64_t output_offset = indices_ne0_idx * output_stride[1] +
                                      indices_ne1_idx * output_stride[2] +
                                      indices_ne2_idx * output_stride[3] +
                                      group * QK8_0;
        copy_in(input_offset);
        LocalTensor<int8_t> input_local = input_queue.DeQue<int8_t>();
        LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
        LocalTensor<float> output_local = output_queue.AllocTensor<float>();
        // TODO: cast more data to speed up.
        // int8 -> half -> float.
        Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
        Cast(output_local, cast_local, RoundMode::CAST_NONE, QK8_0);
        // Only the multiplication depends on the group (one scale per group).
        half scale = scale_gm.GetValue(scale_offset);
        Muls(output_local, output_local, (float)scale, QK8_0);
        input_queue.FreeTensor(input_local);
        cast_queue.FreeTensor(cast_local);
        output_queue.EnQue(output_local);
        copy_out(output_offset);
    }
    // Process every index entry assigned to this block, group by group.
    __aicore__ inline void calculate() {
        for (int64_t i = ir; i < ir + dr; i++) {
            for (int64_t j = 0; j < group_size_in_row; j++) {
                calculate_group(i, j);
            }
        }
    }
   private:
    // Extents and element strides of the input, scale, indices and output.
    int64_t input_ne[4];
    size_t input_stride[4];
    int64_t scale_ne[4];
    size_t scale_stride[4];
    int64_t indices_ne[4];
    size_t indices_stride[4];
    int64_t output_ne[4];
    size_t output_stride[4];
    // First index entry handled by this block, and how many it handles.
    int64_t ir;
    int64_t dr;
    // Number of QK8_0 groups in one input row.
    int64_t group_size_in_row;
    TPipe pipe;
    GlobalTensor<int8_t> input_gm;
    GlobalTensor<half> scale_gm;
    GlobalTensor<int32_t> indices_gm;
    GlobalTensor<float> output_gm;
    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
    TQue<QuePosition::VECIN, BUFFER_NUM> cast_queue;
 };
 // Copy `size` bytes, one byte at a time, from the buffer at `gm` into the
 // memory pointed to by `ub`.
 //   gm   - source address (reinterpreted as a __gm__ byte pointer).
 //   ub   - destination; must have room for at least `size` bytes.
 //   size - number of bytes to copy.
 template <typename T>
 __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
    auto gm_ptr = (__gm__ uint8_t *)gm;
    auto ub_ptr = (uint8_t *)(ub);
    // The counter has the same type as `size`, which avoids the
    // signed/unsigned comparison (and the wrap for sizes above INT32_MAX).
    for (size_t i = 0; i < size; ++i) {
        ub_ptr[i] = gm_ptr[i];
    }
 }
 // Kernel entry point for the Q8_0 get-row operator.
 // Each *_ne_gm / *_nb_gm argument is copied as a 32-byte block into a
 // 4-element local array, then GET_ROW_Q8_0 is initialised with those arrays
 // and run.
 extern "C" __global__ __aicore__ void ascendc_get_row_q8_0(
    GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
    GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
    GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
    // Number of bytes copied for each 4-entry descriptor array.
    const size_t desc_bytes = 32;
    int64_t src_ne[4];
    copy_to_ub(input_ne_gm, src_ne, desc_bytes);
    int64_t idx_ne[4];
    copy_to_ub(indices_ne_gm, idx_ne, desc_bytes);
    size_t idx_nb[4];
    copy_to_ub(indices_nb_gm, idx_nb, desc_bytes);
    int64_t dst_ne[4];
    copy_to_ub(output_ne_gm, dst_ne, desc_bytes);
    size_t dst_nb[4];
    copy_to_ub(output_nb_gm, dst_nb, desc_bytes);
    GET_ROW_Q8_0 kernel_op;
    kernel_op.init(input_gm, indices_gm, output_gm, src_ne, idx_ne, idx_nb,
                   dst_ne, dst_nb);
    kernel_op.calculate();
 }
--- a/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp
+++ b/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp
@ -1,208 +0,0 @@
 #include "kernel_operator.h"
 using namespace AscendC;
 #define BUFFER_NUM 2
 #define QK8_0 32
 // Quantizes a half-precision tensor to Q8_0.
 // Each group of QK8_0 input values becomes QK8_0 int8 quants plus one half
 // scale. All quants are written first; the scales are written to the same
 // output buffer starting right after the last quant. Input rows are split
 // across blocks.
 class QUANTIZE_F16_Q8_0 {
   public:
    __aicore__ inline QUANTIZE_F16_Q8_0() {}
    // Record shapes and strides, pick the rows handled by this block and set
    // up the global tensors and queue buffers.
    //   input        - half input buffer.
    //   output       - output buffer (int8 quants, then half scales).
    //   input_ne_ub  - 4 extents of the input tensor.
    //   input_nb_ub  - 4 byte strides of the input tensor.
    //   output_ne_ub - 4 extents of the output tensor.
    __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
                                int64_t *input_ne_ub, size_t *input_nb_ub,
                                int64_t *output_ne_ub) {
        int64_t op_block_num = GetBlockNum();
        int64_t op_block_idx = GetBlockIdx();
        for (int i = 0; i < 4; i++) {
            input_ne[i] = input_ne_ub[i];
            // Byte strides are converted to element strides.
            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
            output_ne[i] = output_ne_ub[i];
        }
        // Output strides are derived from the extents (dense layout).
        output_stride[0] = 1;
        for (int i = 1; i < 4; i++) {
            output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
        }
        // scale_ne aliases input_ne; only dimensions 1..3 are read below.
        scale_ne = input_ne;
        scale_stride[0] = 1;
        scale_stride[1] = input_ne[0] / QK8_0;
        for (int i = 2; i < 4; i++) {
            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
        }
        // split input tensor by rows.
        uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
        dr = nr / op_block_num;
        uint64_t tails = nr % op_block_num;
        if (op_block_idx < tails) {
            // The first `tails` blocks take one extra row each.
            dr += 1;
            ir = dr * op_block_idx;
        } else {
            ir = dr * op_block_idx + tails;
        }
        group_size_in_row = scale_stride[1];
        int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
                              output_ne[3] * sizeof(uint8_t);
        input_gm.SetGlobalBuffer((__gm__ half *)input);
        output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
        // scale_gm starts at this block's first scale: past all quants, then
        // past the scales of the rows that precede row `ir`.
        scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + ir *
                                                 group_size_in_row *
                                                 sizeof(half)));
        pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(half));
        pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
        pipe.InitBuffer(work_queue, 1, 32);
        pipe.InitBuffer(max_queue, 1, 32);
        pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
        pipe.InitBuffer(scale_queue, 1, 32);
        pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(float));
    }
    // Load one group (QK8_0 halves) starting at element `offset` of the input
    // and enqueue it on input_queue.
    // The offset is 64-bit so that the int64_t offsets computed by
    // calculate_group are not truncated for large tensors.
    __aicore__ inline void copy_in(uint64_t offset) {
        LocalTensor<half> input_local = input_queue.AllocTensor<half>();
        DataCopy(input_local, input_gm[offset], QK8_0);
        input_queue.EnQue(input_local);
    }
    // Dequeue one group of QK8_0 int8 quants from output_queue and store it
    // at element `offset` of the output.
    __aicore__ inline void copy_out(uint64_t offset) {
        LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
        DataCopy(output_gm[offset], output_local, QK8_0);
        output_queue.FreeTensor(output_local);
    }
    // Quantize one group and write its quants to the output.
    //   row   - flat row index over input dimensions 1..3.
    //   group - group number inside the row.
    // Returns the group's scale d = max(|x|) / 127 converted to half.
    __aicore__ inline half calculate_group(int64_t row, int64_t group) {
        // Split the flat row index into (i1, i2, i3) coordinates.
        const int64_t i3 = row / (input_ne[1] * input_ne[2]);
        const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
        const int64_t i1 =
            row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
        const int64_t input_offset = i1 * input_stride[1] +
                                     i2 * input_stride[2] +
                                     i3 * input_stride[3] + QK8_0 * group;
        const int64_t output_offset = i1 * output_stride[1] +
                                      i2 * output_stride[2] +
                                      i3 * output_stride[3] + QK8_0 * group;
        copy_in(input_offset);
        LocalTensor<half> input_local = input_queue.DeQue<half>();
        LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
        LocalTensor<float> work_local = work_queue.AllocTensor<float>();
        LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
        LocalTensor<float> max_local = max_queue.AllocTensor<float>();
        LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
        Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0);
        Abs(abs_local, cast_local, QK8_0);
        ReduceMax(max_local, abs_local, work_local, QK8_0);
        // Barrier before the reduction result is read back as a scalar.
        pipe_barrier(PIPE_ALL);
        float d = max_local.GetValue(0);
        d = d / ((1 << 7) - 1);
        // An all-zero group keeps d == 0 and is left unscaled.
        if (d != 0) {
            Muls(cast_local, cast_local, 1.0f / d, QK8_0);
        }
        // Round in float, then float -> half -> int8. input_local is reused
        // as the half staging buffer.
        Cast(cast_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
        Cast(input_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
        Cast(output_local, input_local, RoundMode::CAST_ROUND, QK8_0);
        output_queue.EnQue(output_local);
        copy_out(output_offset);
        input_queue.FreeTensor(input_local);
        work_queue.FreeTensor(work_local);
        abs_queue.FreeTensor(abs_local);
        max_queue.FreeTensor(max_local);
        cast_queue.FreeTensor(cast_local);
        return (half)d;
    }
    // Quantize every row assigned to this block. Scales are staged in a
    // 16-element local buffer and flushed to scale_gm whenever it fills up;
    // a final partial flush uses DataCopyPad.
    __aicore__ inline void calculate() {
        LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
        uint32_t scale_local_offset = 0;
        // 64-bit so that the running scale position cannot wrap.
        uint64_t scale_global_offset = 0;
        for (int64_t i = ir; i < ir + dr; i++) {
            for (int64_t j = 0; j < group_size_in_row; j++) {
                half scale = calculate_group(i, j);
                scale_local.SetValue(scale_local_offset++, scale);
                if (scale_local_offset == 16) {
                    scale_local_offset = 0;
                    // TODO: OPTIMIZE ME
                    pipe_barrier(PIPE_ALL);
                    DataCopy(scale_gm[scale_global_offset], scale_local, 16);
                    pipe_barrier(PIPE_ALL);
                    scale_global_offset += 16;
                }
            }
        }
        if (scale_local_offset != 0) {
            // Flush the remaining (fewer than 16) scales.
            pipe_barrier(PIPE_ALL);
            DataCopyExtParams dataCopyParams;
            dataCopyParams.blockCount = 1;
            dataCopyParams.blockLen = scale_local_offset * sizeof(half);
            DataCopyPad(scale_gm[scale_global_offset], scale_local,
                        dataCopyParams);
            pipe_barrier(PIPE_ALL);
        }
        // Return the staging buffer to its queue; it was previously leaked.
        scale_queue.FreeTensor(scale_local);
    }
   private:
    int64_t input_ne[4];
    size_t input_stride[4];
    // Points at input_ne (set in init).
    int64_t *scale_ne;
    size_t scale_stride[4];
    int64_t output_ne[4];
    size_t output_stride[4];
    // Number of QK8_0 groups in one input row.
    int64_t group_size_in_row;
    // First row handled by this block, and how many rows it handles.
    int64_t ir;
    int64_t dr;
    TPipe pipe;
    GlobalTensor<half> input_gm;
    GlobalTensor<half> scale_gm;
    GlobalTensor<int8_t> output_gm;
    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
    TQue<QuePosition::VECIN, 1> work_queue;
    TQue<QuePosition::VECOUT, 1> max_queue;
    TQue<QuePosition::VECIN, 1> abs_queue;
    TQue<QuePosition::VECOUT, 1> scale_queue;
    TQue<QuePosition::VECOUT, 1> cast_queue;
 };
 // Copy `size` bytes, one byte at a time, from the buffer at `gm` into the
 // memory pointed to by `ub`.
 //   gm   - source address (reinterpreted as a __gm__ byte pointer).
 //   ub   - destination; must have room for at least `size` bytes.
 //   size - number of bytes to copy.
 template <typename T>
 __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
    auto gm_ptr = (__gm__ uint8_t *)gm;
    auto ub_ptr = (uint8_t *)(ub);
    // The counter has the same type as `size`, which avoids the
    // signed/unsigned comparison (and the wrap for sizes above INT32_MAX).
    for (size_t i = 0; i < size; ++i) {
        ub_ptr[i] = gm_ptr[i];
    }
 }
 // Kernel entry point for the f16 -> Q8_0 quantizer.
 // Each *_ne_gm / *_nb_gm argument is copied as a 32-byte block into a
 // 4-element local array, then QUANTIZE_F16_Q8_0 is initialised and run.
 extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
    // Number of bytes copied for each 4-entry descriptor array.
    const size_t desc_bytes = 32;
    int64_t src_ne[4];
    copy_to_ub(input_ne_gm, src_ne, desc_bytes);
    size_t src_nb[4];
    copy_to_ub(input_nb_gm, src_nb, desc_bytes);
    int64_t dst_ne[4];
    copy_to_ub(output_ne_gm, dst_ne, desc_bytes);
    QUANTIZE_F16_Q8_0 kernel_op;
    kernel_op.init(input_gm, output_gm, src_ne, src_nb, dst_ne);
    kernel_op.calculate();
 }
--- a/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp
+++ b/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp
@ -1,206 +0,0 @@
 #include "kernel_operator.h"
 using namespace AscendC;
 #define BUFFER_NUM 2
 #define QK8_0 32
 // Quantizes a float tensor to Q8_0.
 // Each group of QK8_0 input values becomes QK8_0 int8 quants plus one half
 // scale. All quants are written first; the scales are written to the same
 // output buffer starting right after the last quant. Input rows are split
 // across blocks.
 class QUANTIZE_F32_Q8_0 {
   public:
    __aicore__ inline QUANTIZE_F32_Q8_0() {}
    // Record shapes and strides, pick the rows handled by this block and set
    // up the global tensors and queue buffers.
    //   input        - float input buffer.
    //   output       - output buffer (int8 quants, then half scales).
    //   input_ne_ub  - 4 extents of the input tensor.
    //   input_nb_ub  - 4 byte strides of the input tensor.
    //   output_ne_ub - 4 extents of the output tensor.
    __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
                                int64_t *input_ne_ub, size_t *input_nb_ub,
                                int64_t *output_ne_ub) {
        int64_t op_block_num = GetBlockNum();
        int64_t op_block_idx = GetBlockIdx();
        for (int i = 0; i < 4; i++) {
            input_ne[i] = input_ne_ub[i];
            // Byte strides are converted to element strides.
            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
            output_ne[i] = output_ne_ub[i];
        }
        // Output strides are derived from the extents (dense layout).
        output_stride[0] = 1;
        for (int i = 1; i < 4; i++) {
            output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
        }
        // scale_ne aliases input_ne; only dimensions 1..3 are read below.
        scale_ne = input_ne;
        scale_stride[0] = 1;
        scale_stride[1] = input_ne[0] / QK8_0;
        for (int i = 2; i < 4; i++) {
            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
        }
        // split input tensor by rows.
        uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
        dr = nr / op_block_num;
        uint64_t tails = nr % op_block_num;
        if (op_block_idx < tails) {
            // The first `tails` blocks take one extra row each.
            dr += 1;
            ir = dr * op_block_idx;
        } else {
            ir = dr * op_block_idx + tails;
        }
        group_size_in_row = scale_stride[1];
        int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] *
                              output_ne[3] * sizeof(uint8_t);
        input_gm.SetGlobalBuffer((__gm__ float *)input);
        output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
        // scale_gm starts at this block's first scale: past all quants, then
        // past the scales of the rows that precede row `ir`.
        scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size +
                                                 ir * group_size_in_row *
                                                 sizeof(half)));
        pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(float));
        pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t));
        pipe.InitBuffer(work_queue, 1, 32);
        pipe.InitBuffer(max_queue, 1, 32);
        pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float));
        pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(half));
        pipe.InitBuffer(scale_queue, 1, 32);
    }
    // Load one group (QK8_0 floats) starting at element `offset` of the input
    // and enqueue it on input_queue.
    // The offset is 64-bit so that the int64_t offsets computed by
    // calculate_group are not truncated for large tensors.
    __aicore__ inline void copy_in(uint64_t offset) {
        LocalTensor<float> input_local = input_queue.AllocTensor<float>();
        DataCopy(input_local, input_gm[offset], QK8_0);
        input_queue.EnQue(input_local);
    }
    // Dequeue one group of QK8_0 int8 quants from output_queue and store it
    // at element `offset` of the output.
    __aicore__ inline void copy_out(uint64_t offset) {
        LocalTensor<int8_t> output_local = output_queue.DeQue<int8_t>();
        DataCopy(output_gm[offset], output_local, QK8_0);
        output_queue.FreeTensor(output_local);
    }
    // Quantize one group and write its quants to the output.
    //   row   - flat row index over input dimensions 1..3.
    //   group - group number inside the row.
    // Returns the group's scale d = max(|x|) / 127 converted to half.
    __aicore__ inline half calculate_group(int64_t row, int64_t group) {
        // Split the flat row index into (i1, i2, i3) coordinates.
        const int64_t i3 = row / (input_ne[1] * input_ne[2]);
        const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
        const int64_t i1 =
            row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
        const int64_t input_offset = i1 * input_stride[1] +
                                     i2 * input_stride[2] +
                                     i3 * input_stride[3] + QK8_0 * group;
        const int64_t output_offset = i1 * output_stride[1] +
                                      i2 * output_stride[2] +
                                      i3 * output_stride[3] + QK8_0 * group;
        copy_in(input_offset);
        LocalTensor<float> input_local = input_queue.DeQue<float>();
        LocalTensor<int8_t> output_local = output_queue.AllocTensor<int8_t>();
        LocalTensor<float> work_local = work_queue.AllocTensor<float>();
        LocalTensor<float> abs_local = abs_queue.AllocTensor<float>();
        LocalTensor<float> max_local = max_queue.AllocTensor<float>();
        LocalTensor<half> cast_local = cast_queue.AllocTensor<half>();
        Abs(abs_local, input_local, QK8_0);
        ReduceMax(max_local, abs_local, work_local, QK8_0);
        // Barrier before the reduction result is read back as a scalar.
        pipe_barrier(PIPE_ALL);
        float d = max_local.GetValue(0);
        d = d / ((1 << 7) - 1);
        // An all-zero group keeps d == 0 and is left unscaled.
        if (d != 0) {
            Muls(input_local, input_local, 1.0f / d, QK8_0);
        }
        // Round in float (in place), then float -> half -> int8.
        Cast(input_local, input_local, RoundMode::CAST_ROUND, QK8_0);
        Cast(cast_local, input_local, RoundMode::CAST_ROUND, QK8_0);
        Cast(output_local, cast_local, RoundMode::CAST_ROUND, QK8_0);
        output_queue.EnQue(output_local);
        copy_out(output_offset);
        input_queue.FreeTensor(input_local);
        work_queue.FreeTensor(work_local);
        abs_queue.FreeTensor(abs_local);
        max_queue.FreeTensor(max_local);
        cast_queue.FreeTensor(cast_local);
        return (half)d;
    }
    // Quantize every row assigned to this block. Scales are staged in a
    // 16-element local buffer and flushed to scale_gm whenever it fills up;
    // a final partial flush uses DataCopyPad.
    __aicore__ inline void calculate() {
        LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
        uint32_t scale_local_offset = 0;
        // 64-bit so that the running scale position cannot wrap.
        uint64_t scale_global_offset = 0;
        for (int64_t i = ir; i < ir + dr; i++) {
            for (int64_t j = 0; j < group_size_in_row; j++) {
                half scale = calculate_group(i, j);
                scale_local.SetValue(scale_local_offset++, scale);
                if (scale_local_offset == 16) {
                    scale_local_offset = 0;
                    // TODO: OPTIMIZE ME
                    pipe_barrier(PIPE_ALL);
                    DataCopy(scale_gm[scale_global_offset], scale_local, 16);
                    pipe_barrier(PIPE_ALL);
                    scale_global_offset += 16;
                }
            }
        }
        if (scale_local_offset != 0) {
            // Flush the remaining (fewer than 16) scales.
            pipe_barrier(PIPE_ALL);
            DataCopyExtParams dataCopyParams;
            dataCopyParams.blockCount = 1;
            dataCopyParams.blockLen = scale_local_offset * sizeof(half);
            DataCopyPad(scale_gm[scale_global_offset], scale_local,
                        dataCopyParams);
            pipe_barrier(PIPE_ALL);
        }
        // Return the staging buffer to its queue; it was previously leaked.
        scale_queue.FreeTensor(scale_local);
    }
   private:
    int64_t input_ne[4];
    size_t input_stride[4];
    // Points at input_ne (set in init).
    int64_t *scale_ne;
    size_t scale_stride[4];
    int64_t output_ne[4];
    size_t output_stride[4];
    // Number of QK8_0 groups in one input row.
    int64_t group_size_in_row;
    // First row handled by this block, and how many rows it handles.
    int64_t ir;
    int64_t dr;
    TPipe pipe;
    GlobalTensor<float> input_gm;
    GlobalTensor<half> scale_gm;
    GlobalTensor<int8_t> output_gm;
    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
    TQue<QuePosition::VECIN, 1> work_queue;
    TQue<QuePosition::VECOUT, 1> max_queue;
    TQue<QuePosition::VECIN, 1> abs_queue;
    TQue<QuePosition::VECIN, 1> cast_queue;
    TQue<QuePosition::VECOUT, 1> scale_queue;
 };
 // Copy `size` bytes, one byte at a time, from the buffer at `gm` into the
 // memory pointed to by `ub`.
 //   gm   - source address (reinterpreted as a __gm__ byte pointer).
 //   ub   - destination; must have room for at least `size` bytes.
 //   size - number of bytes to copy.
 template <typename T>
 __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
    auto gm_ptr = (__gm__ uint8_t *)gm;
    auto ub_ptr = (uint8_t *)(ub);
    // The counter has the same type as `size`, which avoids the
    // signed/unsigned comparison (and the wrap for sizes above INT32_MAX).
    for (size_t i = 0; i < size; ++i) {
        ub_ptr[i] = gm_ptr[i];
    }
 }
 // Kernel entry point for the f32 -> Q8_0 quantizer.
 // Each *_ne_gm / *_nb_gm argument is copied as a 32-byte block into a
 // 4-element local array, then QUANTIZE_F32_Q8_0 is initialised and run.
 extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
    // Number of bytes copied for each 4-entry descriptor array.
    const size_t desc_bytes = 32;
    int64_t src_ne[4];
    copy_to_ub(input_ne_gm, src_ne, desc_bytes);
    size_t src_nb[4];
    copy_to_ub(input_nb_gm, src_nb, desc_bytes);
    int64_t dst_ne[4];
    copy_to_ub(output_ne_gm, dst_ne, desc_bytes);
    QUANTIZE_F32_Q8_0 kernel_op;
    kernel_op.init(input_gm, output_gm, src_ne, src_nb, dst_ne);
    kernel_op.calculate();
 }
--- a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp
+++ b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp
@ -1,278 +0,0 @@
 #include "kernel_operator.h"
 using namespace AscendC;
 #define BUFFER_NUM 2
 #define Group_Size 32
 // Quantizes a tensor of SRC_T (overloads exist for float and half) to Q4_0.
 // Each group of Group_Size input values becomes Group_Size 4-bit quants
 // (Group_Size / 2 bytes) plus one half scale. All quants are written first;
 // the scales are written to the same output buffer starting right after the
 // last quant byte. Input rows are split across blocks.
 template <typename SRC_T>
 class QUANTIZE_FLOAT_TO_Q4_0 {
   public:
    __aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {}
    // Record shapes and strides, pick the rows handled by this block and set
    // up the global tensors and queue buffers.
    //   input        - SRC_T input buffer.
    //   output       - output buffer (packed 4-bit quants, then half scales).
    //   input_ne_ub  - 4 extents of the input tensor.
    //   input_nb_ub  - 4 byte strides of the input tensor.
    //   output_ne_ub - 4 extents of the output tensor.
    __aicore__ inline void init(GM_ADDR input, GM_ADDR output,
                                int64_t *input_ne_ub, size_t *input_nb_ub,
                                int64_t *output_ne_ub) {
        // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4],
        //                         permute=[0,0,0,0]):
        // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL
        int64_t op_block_num = GetBlockNum();
        int64_t op_block_idx = GetBlockIdx();
        // input stride of data elements
        for (int i = 0; i < 4; i++) {
            input_ne[i] = input_ne_ub[i];
            input_stride[i] = input_nb_ub[i] / input_nb_ub[0];
            output_ne[i] = output_ne_ub[i];
        }
        // output stride of data elements
        output_stride[0] = 1;
        for (int i = 1; i < 4; i++) {
            output_stride[i] = output_stride[i - 1] * output_ne[i - 1];
        }
        // Scales are stored one after another behind the data:
        // [group1_scale, group2_scale, ...]
        // scale_ne aliases input_ne; only dimensions 1..3 are read below.
        scale_ne = input_ne;
        scale_stride[0] = 1;
        scale_stride[1] = input_ne[0] / Group_Size;
        for (int i = 2; i < 4; i++) {
            scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1];
        }
        // split input tensor by rows.
        uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3];
        dr = nr / op_block_num;
        uint64_t tails = nr % op_block_num;
        if (op_block_idx < tails) {
            // The first `tails` blocks take one extra row each.
            dr += 1;
            ir = dr * op_block_idx;
        } else {
            ir = dr * op_block_idx + tails;
        }
        group_size_in_row = scale_stride[1];
        // Two 4-bit quants per byte, hence the division by 2.
        int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] *
                               output_ne[3] * sizeof(uint8_t) / 2;
        input_gm.SetGlobalBuffer((__gm__ SRC_T *)input);
        output_gm.SetGlobalBuffer((__gm__ int8_t *)output);
        // scale_gm starts at this block's first scale: past all quants, then
        // past the scales of the rows that precede row `ir`.
        scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir *
                                                 group_size_in_row *
                                                 sizeof(half)));
        pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T));
        pipe.InitBuffer(output_queue, BUFFER_NUM,
                        Group_Size * sizeof(int8_t) / 2);
        pipe.InitBuffer(cast_queue, 1, Group_Size * sizeof(float));
        pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float));
        pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float));
        pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float));
        pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half));
        pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half));
    }
    // Load one group (Group_Size SRC_T values) starting at element `offset`
    // of the input and enqueue it on input_queue.
    // The offset is 64-bit so that the int64_t offsets computed by
    // calculate_group are not truncated for large tensors.
    __aicore__ inline void copy_in(uint64_t offset) {
        LocalTensor<SRC_T> input_local = input_queue.AllocTensor<SRC_T>();
        DataCopy(input_local, input_gm[offset], Group_Size);
        input_queue.EnQue(input_local);
    }
    // Dequeue one quantized group and store its Group_Size / 2 bytes at byte
    // `offset` of the output.
    __aicore__ inline void copy_out(uint64_t offset) {
        // Reinterpret Group_Size (32) int4b_t values as Group_Size / 2 int8_t
        // values and copy them with DataCopyPad.
        // NOTE(review): the original comment says this is "to avoid 32 bits
        // align"; presumably it refers to DataCopy's alignment requirement -
        // confirm against the Ascend C documentation.
        LocalTensor<int4b_t> output_local = output_queue.DeQue<int4b_t>();
        LocalTensor<int8_t> output_int8_local =
                                    output_local.ReinterpretCast<int8_t>();
        DataCopyExtParams dataCopyParams;
        dataCopyParams.blockCount = 1;
        dataCopyParams.blockLen = Group_Size / 2 * sizeof(int8_t);
        DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams);
        output_queue.FreeTensor(output_local);
    }
    // Float input: plain copy into the float working buffer.
    __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
                                         LocalTensor<float> input_local) {
        DataCopy(cast_local, input_local, Group_Size);
    }
    // Half input: widen to float into the working buffer.
    __aicore__ inline void input_to_cast(LocalTensor<float> cast_local,
                                         LocalTensor<half> input_local) {
        Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size);
    }
    // Quantize one group and write its packed quants to the output.
    //   row   - flat row index over input dimensions 1..3.
    //   group - group number inside the row.
    // Returns the group's scale d converted to half.
    __aicore__ inline half calculate_group(int64_t row, int64_t group) {
        // Split the flat row index into (i1, i2, i3) coordinates.
        const int64_t i3 = row / (input_ne[1] * input_ne[2]);
        const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1];
        const int64_t i1 =
            row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1];
        const int64_t input_offset = i1 * input_stride[1] +
                                     i2 * input_stride[2] +
                                     i3 * input_stride[3] + Group_Size * group;
        // output_offset indexes output_gm, whose element type is int8_t, so
        // the element offset is divided by 2 for the packed int4b_t values.
        const int64_t output_offset = (i1 * output_stride[1] +
                                       i2 * output_stride[2] +
                                       i3 * output_stride[3] +
                                       Group_Size * group) / 2;
        copy_in(input_offset);
        LocalTensor<SRC_T> input_local = input_queue.DeQue<SRC_T>();
        LocalTensor<int4b_t> output_local = output_queue.AllocTensor<int4b_t>();
        LocalTensor<float> cast_local = cast_queue.AllocTensor<float>();
        LocalTensor<float> work_local = work_queue.AllocTensor<float>();
        LocalTensor<float> max_local = max_queue.AllocTensor<float>();
        LocalTensor<float> min_local = min_queue.AllocTensor<float>();
        LocalTensor<half> half_local = half_queue.AllocTensor<half>();
        input_to_cast(cast_local, input_local);
        ReduceMax(max_local, cast_local, work_local, Group_Size);
        ReduceMin(min_local, cast_local, work_local, Group_Size);
        // Barrier before the reduction results are read back as scalars.
        pipe_barrier(PIPE_ALL);
        const float max_value = max_local.GetValue(0);
        const float min_value = min_local.GetValue(0);
        // d is the value with the largest magnitude (sign kept) divided by
        // -8, so that value maps to -8 after scaling by 1 / d.
        float d = max_value;
        if (min_value < 0 && (-1 * min_value) > max_value) {
            d = min_value;
        }
        d = d / (-8);
        // An all-zero group keeps d == 0 and is left unscaled.
        if (d != 0) {
            Muls(cast_local, cast_local, 1.0f / d, Group_Size);
        }
        // range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7]
        float scalar = 8.5f;
        Adds(cast_local, cast_local, scalar, Group_Size);
        Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size);
        scalar = 15.0f;
        Mins(cast_local, cast_local, scalar, Group_Size);
        scalar = -8.0f;
        Adds(cast_local, cast_local, scalar, Group_Size);
        // float->half->int4b
        Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size);
        Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size);
        output_queue.EnQue(output_local);
        copy_out(output_offset);
        input_queue.FreeTensor(input_local);
        work_queue.FreeTensor(work_local);
        max_queue.FreeTensor(max_local);
        min_queue.FreeTensor(min_local);
        half_queue.FreeTensor(half_local);
        cast_queue.FreeTensor(cast_local);
        return (half)d;
    }
    // Quantize every row assigned to this block. Scales are staged in a
    // local buffer of Group_Size / 2 entries and flushed to scale_gm whenever
    // it fills up; a final partial flush uses DataCopyPad.
    __aicore__ inline void calculate() {
        LocalTensor<half> scale_local = scale_queue.AllocTensor<half>();
        uint32_t scale_local_offset = 0;
        // 64-bit so that the running scale position cannot wrap.
        uint64_t scale_global_offset = 0;
        for (int64_t i = ir; i < ir + dr; i++) {
            for (int64_t j = 0; j < group_size_in_row; j++) {
                half scale = calculate_group(i, j);
                scale_local.SetValue(scale_local_offset++, scale);
                // Copy Group_Size/2 length data each time.
                if (scale_local_offset == Group_Size / 2) {
                    scale_local_offset = 0;
                    // TODO: OPTIMIZE ME
                    pipe_barrier(PIPE_ALL);
                    DataCopy(scale_gm[scale_global_offset], scale_local,
                             Group_Size / 2);
                    pipe_barrier(PIPE_ALL);
                    scale_global_offset += Group_Size / 2;
                }
            }
        }
        if (scale_local_offset != 0) {
            // Flush the remaining scales.
            pipe_barrier(PIPE_ALL);
            DataCopyExtParams dataCopyParams;
            dataCopyParams.blockCount = 1;
            dataCopyParams.blockLen = scale_local_offset * sizeof(half);
            DataCopyPad(scale_gm[scale_global_offset], scale_local,
                        dataCopyParams);
            pipe_barrier(PIPE_ALL);
        }
        scale_queue.FreeTensor(scale_local);
    }
   private:
    int64_t input_ne[4];
    size_t input_stride[4];
    // Points at input_ne (set in init).
    int64_t *scale_ne;
    size_t scale_stride[4];
    int64_t output_ne[4];
    size_t output_stride[4];
    // Number of Group_Size groups in one input row.
    int64_t group_size_in_row;
    // First row handled by this block, and how many rows it handles.
    int64_t ir;
    int64_t dr;
    TPipe pipe;
    GlobalTensor<SRC_T> input_gm;
    GlobalTensor<half> scale_gm;
    GlobalTensor<int8_t> output_gm;
    TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
    TQue<QuePosition::VECIN, BUFFER_NUM> work_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> max_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> min_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> scale_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> cast_queue;
    TQue<QuePosition::VECOUT, BUFFER_NUM> half_queue;
 };
 // Copy `size` bytes, one byte at a time, from the buffer at `gm` into the
 // memory pointed to by `ub`.
 //   gm   - source address (reinterpreted as a __gm__ byte pointer).
 //   ub   - destination; must have room for at least `size` bytes.
 //   size - number of bytes to copy.
 template <typename T>
 __aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) {
    auto gm_ptr = (__gm__ uint8_t *)gm;
    auto ub_ptr = (uint8_t *)(ub);
    // The counter has the same type as `size`, which avoids the
    // signed/unsigned comparison (and the wrap for sizes above INT32_MAX).
    for (size_t i = 0; i < size; ++i) {
        ub_ptr[i] = gm_ptr[i];
    }
 }
 // Kernel entry point for the f16 -> Q4_0 quantizer.
 // Each *_ne_gm / *_nb_gm argument is copied as a 32-byte block into a
 // 4-element local array, then QUANTIZE_FLOAT_TO_Q4_0<half> is initialised
 // and run.
 extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
    // Number of bytes copied for each 4-entry descriptor array.
    const size_t desc_bytes = 32;
    int64_t src_ne[4];
    copy_to_ub(input_ne_gm, src_ne, desc_bytes);
    size_t src_nb[4];
    copy_to_ub(input_nb_gm, src_nb, desc_bytes);
    int64_t dst_ne[4];
    copy_to_ub(output_ne_gm, dst_ne, desc_bytes);
    QUANTIZE_FLOAT_TO_Q4_0<half> kernel_op;
    kernel_op.init(input_gm, output_gm, src_ne, src_nb, dst_ne);
    kernel_op.calculate();
 }
 // Kernel entry point for the f32 -> Q4_0 quantizer.
 // Each *_ne_gm / *_nb_gm argument is copied as a 32-byte block into a
 // 4-element local array, then QUANTIZE_FLOAT_TO_Q4_0<float> is initialised
 // and run.
 extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
    GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
    GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
    // Number of bytes copied for each 4-entry descriptor array.
    const size_t desc_bytes = 32;
    int64_t src_ne[4];
    copy_to_ub(input_ne_gm, src_ne, desc_bytes);
    size_t src_nb[4];
    copy_to_ub(input_nb_gm, src_nb, desc_bytes);
    int64_t dst_ne[4];
    copy_to_ub(output_ne_gm, dst_ne, desc_bytes);
    QUANTIZE_FLOAT_TO_Q4_0<float> kernel_op;
    kernel_op.init(input_gm, output_gm, src_ne, src_nb, dst_ne);
    kernel_op.calculate();
 }
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@ -19,11 +19,7 @@ typedef half2 ggml_half2;
 #define GGML_COMMON_DECL
 #elif defined(GGML_COMMON_DECL_CUDA)
 #if defined(GGML_COMMON_DECL_MUSA)
 #include <musa_fp16.h>
 #else
 #include <cuda_fp16.h>
 #endif
 #include <cstdint>
 typedef half  ggml_half;
@ -110,19 +106,19 @@ typedef sycl::half2 ggml_half2;
 #define QR6_K 2
 #define QI2_XXS (QK_K / (4*QR2_XXS))
-#define QR2_XXS 4
+#define QR2_XXS 8
 #define QI2_XS (QK_K / (4*QR2_XS))
-#define QR2_XS 4
+#define QR2_XS 8
 #define QI2_S (QK_K / (4*QR2_S))
-#define QR2_S 4
+#define QR2_S 8
 #define QI3_XXS (QK_K / (4*QR3_XXS))
-#define QR3_XXS 4
+#define QR3_XXS 8
 #define QI3_XS (QK_K / (4*QR3_XS))
-#define QR3_XS 4
+#define QR3_XS 8
 #define QI1_S (QK_K / (4*QR1_S))
 #define QR1_S 8
@ -134,10 +130,10 @@ typedef sycl::half2 ggml_half2;
 #define QR4_NL 2
 #define QI4_XS (QK_K / (4*QR4_XS))
-#define QR4_XS 2
+#define QR4_XS 8
 #define QI3_S (QK_K / (4*QR3_S))
-#define QR3_S 4
+#define QR3_S 8
 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
@ -203,49 +199,6 @@ typedef struct {
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
 // Container holding the data of 4 q4_0 blocks: all deltas first, then all quants.
 // The static_assert after each struct pins sizeof() to the sum of the member
 // sizes, i.e. it fails the build if the compiler inserts padding.
 typedef struct {
    ggml_half d[4];        // deltas for 4 q4_0 blocks
    uint8_t qs[QK4_0 * 2]; // nibbles / quants for 4 q4_0 blocks
 } block_q4_0x4;
 static_assert(sizeof(block_q4_0x4) == 4 * sizeof(ggml_half) + QK4_0 * 2, "wrong q4_0x4 block size/padding");
 // Same layout for 8 q4_0 blocks.
 typedef struct {
    ggml_half d[8];        // deltas for 8 q4_0 blocks
    uint8_t qs[QK4_0 * 4]; // nibbles / quants for 8 q4_0 blocks
 } block_q4_0x8;
 static_assert(sizeof(block_q4_0x8) == 8 * sizeof(ggml_half) + QK4_0 * 4, "wrong q4_0x8 block size/padding");
 // Same layout for 4 q8_0 blocks (signed 8-bit quants).
 typedef struct {
    ggml_half d[4];        // deltas for 4 q8_0 blocks
    int8_t qs[QK8_0 * 4];  // quants for 4 q8_0 blocks
 } block_q8_0x4;
 static_assert(sizeof(block_q8_0x4) == 4 * sizeof(ggml_half) + QK8_0 * 4, "wrong q8_0x4 block size/padding");
 // Same layout for 8 q8_0 blocks (signed 8-bit quants).
 typedef struct {
    ggml_half d[8];        // deltas for 8 q8_0 blocks
    int8_t qs[QK8_0 * 8];  // quants for 8 q8_0 blocks
 } block_q8_0x8;
 static_assert(sizeof(block_q8_0x8) == 8 * sizeof(ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
 //
 // Ternary quantization
 //
 // 1.6875 bpw
 // Ternary block: `qs` packs 5 elements per byte, `qh` packs 4 elements per
 // byte, followed by one half-precision value `d`.
 // NOTE(review): `d` is presumably the block scale - confirm against the (de)quantization code.
 typedef struct {
    uint8_t qs[(QK_K - 4 * QK_K / 64) / 5]; // 5 elements per byte (3^5 = 243 < 256)
    uint8_t qh[QK_K/64]; // 4 elements per byte
    ggml_half d;
 } block_tq1_0;
 // Fails the build if the struct size differs from the sum of its members (padding).
 static_assert(sizeof(block_tq1_0) == sizeof(ggml_half) + QK_K / 64 + (QK_K - 4 * QK_K / 64) / 5, "wrong tq1_0 block size/padding");
 // 2.0625 bpw
 // Ternary block storing each element in 2 bits (QK_K/4 bytes), followed by one
 // half-precision value `d`.
 // NOTE(review): `d` is presumably the block scale - confirm against the (de)quantization code.
 typedef struct {
    uint8_t qs[QK_K/4]; // 2 bits per element
    ggml_half d;
 } block_tq2_0;
 // Fails the build if the struct size differs from the sum of its members (padding).
 static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0 block size/padding");
 //
 // Super-block quantization structures
 //
@ -380,7 +333,6 @@ typedef struct {
 } block_iq3_s;
 static_assert(sizeof(block_iq3_s) == sizeof(ggml_half) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
 // 1.5625 bpw
 typedef struct {
    ggml_half d;
    uint8_t  qs[QK_K/8];
@ -439,7 +391,7 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_
 #define GGML_TABLE_END() };
 #define GGML_COMMON_IMPL
-#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP) || defined(GGML_COMMON_IMPL_MUSA)
+#elif defined(GGML_COMMON_IMPL_CUDA) || defined(GGML_COMMON_IMPL_HIP)
 #include <cstdint>
 #define GGML_TABLE_BEGIN(type, name, size) static const __device__ type name[size] = {
--- a/ggml/src/ggml-cpu-impl.h
+++ b/ggml/src/ggml-cpu-impl.h
@ -1,614 +0,0 @@
 #pragma once
 // GGML CPU internal header
 #include "ggml.h"
 #include "ggml-impl.h"
 #include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
 //#include <stddef.h>
 #include <stdbool.h>
 #include <string.h> // memcpy
 #include <math.h>   // fabsf
 #ifdef __cplusplus
 extern "C" {
 #endif
 #if defined(_MSC_VER)
 #define m512bh(p) p
 #define m512i(p) p
 #else
 #define m512bh(p) (__m512bh)(p)
 #define m512i(p) (__m512i)(p)
 #endif
 /**
 * Converts brain16 to float32.
 *
 * The bfloat16 floating point format has the following structure:
 *
 *       ┌sign
 *       │
 *       │   ┌exponent
 *       │   │
 *       │   │      ┌mantissa
 *       │   │      │
 *       │┌──┴───┐┌─┴───┐
 *     0b0000000000000000 brain16
 *
 * Since bf16 has the same number of exponent bits as a 32bit float,
 * encoding and decoding numbers becomes relatively straightforward.
 *
 *       ┌sign
 *       │
 *       │   ┌exponent
 *       │   │
 *       │   │      ┌mantissa
 *       │   │      │
 *       │┌──┴───┐┌─┴───────────────────┐
 *     0b00000000000000000000000000000000 IEEE binary32
 *
 * For comparison, the standard fp16 format has fewer exponent bits.
 *
 *       ┌sign
 *       │
 *       │  ┌exponent
 *       │  │
 *       │  │    ┌mantissa
 *       │  │    │
 *       │┌─┴─┐┌─┴──────┐
 *     0b0000000000000000 IEEE binary16
 *
 * @see IEEE 754-2008
 */
 static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
    // The 16 stored bits become the upper half of the 32-bit pattern; the
    // lower 16 bits are zero.
    const uint32_t widened = (uint32_t)h.bits << 16;
    float out;
    memcpy(&out, &widened, sizeof(out));
    return out;
 }
 /**
 * Converts float32 to brain16.
 *
 * This is binary identical with Google Brain float conversion.
 * Floats shall round to nearest even, and NANs shall be quiet.
 * Subnormals aren't flushed to zero, except perhaps when used.
 * This code should vectorize nicely if using modern compilers.
 */
 static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
    uint32_t raw;
    memcpy(&raw, &s, sizeof(raw));
    ggml_bf16_t out;
    if ((raw & 0x7fffffff) > 0x7f800000) {
        // NaN: keep the upper 16 bits and set bit 6 to force a quiet NaN.
        out.bits = (raw >> 16) | 64;
    } else {
        // Round to nearest even: add 0x7fff plus the lowest kept bit, then truncate.
        const uint32_t rounding = 0x7fff + ((raw >> 16) & 1);
        out.bits = (raw + rounding) >> 16;
    }
    return out;
 }
 #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
 #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
 #if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
 #ifndef __FMA__
 #define __FMA__
 #endif
 #ifndef __F16C__
 #define __F16C__
 #endif
 #endif
 // __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
 #if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
 #ifndef __SSE3__
 #define __SSE3__
 #endif
 #ifndef __SSSE3__
 #define __SSSE3__
 #endif
 #endif
 #if defined(__ARM_FEATURE_SVE)
 #include <arm_sve.h>
 #include <sys/prctl.h>
 #endif
 // 16-bit float
 // on Arm, we use __fp16
 // on x86, we use uint16_t
 #if defined(__ARM_NEON)
 // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
 //
 //   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
 //
 #include <arm_neon.h>
 #ifdef _MSC_VER
 typedef uint16_t ggml_fp16_internal_t;
 #define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
 #else
 typedef __fp16 ggml_fp16_internal_t;
 #define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
 #endif // _MSC_VER
 #if !defined(__aarch64__)
 // 32-bit ARM compatibility
 // vaddlvq_s16
 // vpaddq_s16
 // vpaddq_s32
 // vaddvq_s32
 // vaddvq_f32
 // vmaxvq_f32
 // vcvtnq_s32_f32
 // vzip1_u8
 // vzip2_u8
 // 32-bit ARM fallback: sums the 8 int16 lanes of `v` via two pairwise widening adds.
 inline static int32_t vaddlvq_s16(int16x8_t v) {
    const int32x4_t sums32 = vpaddlq_s16(v);
    const int64x2_t sums64 = vpaddlq_s32(sums32);
    // Lanes 0 and 2 of the reinterpreted vector are added to form the result.
    const int32x4_t words = vreinterpretq_s32_s64(sums64);
    return vgetq_lane_s32(words, 0) + vgetq_lane_s32(words, 2);
 }
 // 32-bit ARM fallback: pairwise add within `a` fills the low half of the
 // result, pairwise add within `b` fills the high half.
 inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
    return vcombine_s16(
        vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
        vpadd_s16(vget_low_s16(b), vget_high_s16(b)));
 }
 // 32-bit ARM fallback: pairwise add within `a` fills the low half of the
 // result, pairwise add within `b` fills the high half.
 inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
    return vcombine_s32(
        vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
        vpadd_s32(vget_low_s32(b), vget_high_s32(b)));
 }
 // 32-bit ARM fallback: horizontal sum of the 4 int32 lanes, added in lane order.
 inline static int32_t vaddvq_s32(int32x4_t v) {
    const int32_t lane0 = vgetq_lane_s32(v, 0);
    const int32_t lane1 = vgetq_lane_s32(v, 1);
    const int32_t lane2 = vgetq_lane_s32(v, 2);
    const int32_t lane3 = vgetq_lane_s32(v, 3);
    return lane0 + lane1 + lane2 + lane3;
 }
 // 32-bit ARM fallback: horizontal sum of the 4 float lanes. The additions are
 // performed strictly left to right (lane 0 through lane 3).
 inline static float vaddvq_f32(float32x4_t v) {
    const float lane0 = vgetq_lane_f32(v, 0);
    const float lane1 = vgetq_lane_f32(v, 1);
    const float lane2 = vgetq_lane_f32(v, 2);
    const float lane3 = vgetq_lane_f32(v, 3);
    return lane0 + lane1 + lane2 + lane3;
 }
 // 32-bit ARM fallback: maximum of the 4 float lanes, reduced as
 // MAX(MAX(lane0, lane1), MAX(lane2, lane3)).
 inline static float vmaxvq_f32(float32x4_t v) {
    const float max01 = MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1));
    const float max23 = MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3));
    return MAX(max01, max23);
 }
 // 32-bit ARM fallback: converts each of the 4 float lanes to int32 by calling
 // roundf() and assigning the result to the matching output lane.
 // NOTE(review): roundf() rounds halfway cases away from zero; the native
 // instruction presumably rounds ties to even, so results may differ for exact
 // .5 inputs - confirm whether callers depend on this.
 inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
    int32x4_t res;
    res[0] = roundf(vgetq_lane_f32(v, 0));
    res[1] = roundf(vgetq_lane_f32(v, 1));
    res[2] = roundf(vgetq_lane_f32(v, 2));
    res[3] = roundf(vgetq_lane_f32(v, 3));
    return res;
 }
 // 32-bit ARM fallback: interleaves lanes 0..3 of `a` and `b`
 // (a0, b0, a1, b1, a2, b2, a3, b3).
 inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
    uint8x8_t res;
    for (int i = 0; i < 4; ++i) {
        res[2*i + 0] = a[i];
        res[2*i + 1] = b[i];
    }
    return res;
 }
 // 32-bit ARM fallback: interleaves lanes 4..7 of `a` and `b`
 // (a4, b4, a5, b5, a6, b6, a7, b7).
 inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
    uint8x8_t res;
    for (int i = 0; i < 4; ++i) {
        res[2*i + 0] = a[4 + i];
        res[2*i + 1] = b[4 + i];
    }
    return res;
 }
 // vld1q_s16_x2
 // vld1q_u8_x2
 // vld1q_u8_x4
 // vld1q_s8_x2
 // vld1q_s8_x4
 // TODO: double-check these work correctly
 // Pair of int16x8 vectors, used by the 32-bit ARM fallback loader below.
 typedef struct ggml_int16x8x2_t {
    int16x8_t val[2];
 } ggml_int16x8x2_t;
 // Loads 16 consecutive int16 values starting at `ptr` as two 8-lane vectors.
 inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
    ggml_int16x8x2_t pair = { { vld1q_s16(ptr + 0), vld1q_s16(ptr + 8) } };
    return pair;
 }
 // Pair of uint8x16 vectors, used by the 32-bit ARM fallback loader below.
 typedef struct ggml_uint8x16x2_t {
    uint8x16_t val[2];
 } ggml_uint8x16x2_t;
 // Loads 32 consecutive bytes starting at `ptr` as two 16-lane vectors.
 inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
    ggml_uint8x16x2_t pair = { { vld1q_u8(ptr + 0), vld1q_u8(ptr + 16) } };
    return pair;
 }
 // Group of four uint8x16 vectors, used by the 32-bit ARM fallback loader below.
 typedef struct ggml_uint8x16x4_t {
    uint8x16_t val[4];
 } ggml_uint8x16x4_t;
 // Loads 64 consecutive bytes starting at `ptr` as four 16-lane vectors.
 inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
    ggml_uint8x16x4_t quad = { {
        vld1q_u8(ptr + 0),
        vld1q_u8(ptr + 16),
        vld1q_u8(ptr + 32),
        vld1q_u8(ptr + 48),
    } };
    return quad;
 }
 // Pair of int8x16 vectors, used by the 32-bit ARM fallback loader below.
 typedef struct ggml_int8x16x2_t {
    int8x16_t val[2];
 } ggml_int8x16x2_t;
 // Loads 32 consecutive signed bytes starting at `ptr` as two 16-lane vectors.
 inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
    ggml_int8x16x2_t pair = { { vld1q_s8(ptr + 0), vld1q_s8(ptr + 16) } };
    return pair;
 }
 // Group of four int8x16 vectors, used by the 32-bit ARM fallback loader below.
 typedef struct ggml_int8x16x4_t {
    int8x16_t val[4];
 } ggml_int8x16x4_t;
 // Loads 64 consecutive signed bytes starting at `ptr` as four 16-lane vectors.
 inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
    ggml_int8x16x4_t quad = { {
        vld1q_s8(ptr + 0),
        vld1q_s8(ptr + 16),
        vld1q_s8(ptr + 32),
        vld1q_s8(ptr + 48),
    } };
    return quad;
 }
 // NOTE: not tested
 // 32-bit ARM fallback for the table lookup: res[i] = a[b[i]] for each of the
 // 16 lanes. An index outside 0..15 yields 0 (the behaviour of the table-lookup
 // instruction being emulated) instead of reading past the 16-lane table.
 // NOTE: not tested
 inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
    int8x16_t res;
    for (int i = 0; i < 16; ++i) {
        const uint8_t idx = b[i];
        res[i] = idx < 16 ? a[idx] : 0;
    }
    return res;
 }
 // NOTE: not tested
 // 32-bit ARM fallback for the table lookup: res[i] = a[b[i]] for each of the
 // 16 lanes. An index outside 0..15 yields 0 (the behaviour of the table-lookup
 // instruction being emulated) instead of reading past the 16-lane table.
 // NOTE: not tested
 inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
    uint8x16_t res;
    for (int i = 0; i < 16; ++i) {
        const uint8_t idx = b[i];
        res[i] = idx < 16 ? a[idx] : 0;
    }
    return res;
 }
 #else
 #define ggml_int16x8x2_t  int16x8x2_t
 #define ggml_uint8x16x2_t uint8x16x2_t
 #define ggml_uint8x16x4_t uint8x16x4_t
 #define ggml_int8x16x2_t  int8x16x2_t
 #define ggml_int8x16x4_t  int8x16x4_t
 #define ggml_vld1q_s16_x2 vld1q_s16_x2
 #define ggml_vld1q_u8_x2  vld1q_u8_x2
 #define ggml_vld1q_u8_x4  vld1q_u8_x4
 #define ggml_vld1q_s8_x2  vld1q_s8_x2
 #define ggml_vld1q_s8_x4  vld1q_s8_x4
 #define ggml_vqtbl1q_s8   vqtbl1q_s8
 #define ggml_vqtbl1q_u8   vqtbl1q_u8
 #endif // !defined(__aarch64__)
 #if !defined(__ARM_FEATURE_DOTPROD)
 // Fallback used when __ARM_FEATURE_DOTPROD is not defined: multiplies the int8
 // lanes of `a` and `b`, pairwise-widens the products to int32 and adds them to `acc`.
 // NOTE(review): each result lane mixes products from the low and high halves,
 // so individual lanes need not match the native instruction; callers
 // presumably rely only on the horizontal sum - confirm.
 inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
    const int16x8_t prod_lo = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
    const int16x8_t prod_hi = vmull_s8(vget_high_s8(a), vget_high_s8(b));
    const int32x4_t sum_lo = vpaddlq_s16(prod_lo);
    const int32x4_t sum_hi = vpaddlq_s16(prod_hi);
    return vaddq_s32(acc, vaddq_s32(sum_lo, sum_hi));
 }
 #else
 #define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
 #endif // !defined(__ARM_FEATURE_DOTPROD)
 #endif // defined(__ARM_NEON)
 #if defined(__ARM_NEON) && !defined(_MSC_VER)
 #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 // NEON (non-MSVC) path: reinterprets the stored bits as the native half type
 // and lets the compiler widen it to float.
 static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    ggml_fp16_internal_t native;
    memcpy(&native, &h, sizeof(h));
    const float widened = (float)native;
    return widened;
 }
 static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
    ggml_fp16_t res;
    ggml_fp16_internal_t tmp = f;
    memcpy(&res, &tmp, sizeof(ggml_fp16_t));
    return res;
 }
 #else
 #ifdef __wasm_simd128__
 #include <wasm_simd128.h>
 #else
 #ifdef __POWER9_VECTOR__
 #include <altivec.h>
 #undef bool
 #define bool _Bool
 #else
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #else
 #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__)
 #if !defined(__riscv)
 #include <immintrin.h>
 #endif
 #endif
 #endif
 #endif
 #endif
 #ifdef __riscv_v_intrinsic
 #include <riscv_vector.h>
 #endif
 #if defined(__loongarch64)
 #if defined(__loongarch_asx)
 #include <lasxintrin.h>
 #endif
 #if defined(__loongarch_sx)
 #include <lsxintrin.h>
 #endif
 #endif
 #if defined(__loongarch_asx)
 // Overlays a 32-bit integer and a float so the helpers below can pass a
 // float's bit pattern to integer-register replicate intrinsics.
 typedef union {
    int32_t i;
    float f;
 } ft_union;
 /* float type data load instructions */
 static __m128 __lsx_vreplfr2vr_s(float val) {
    ft_union fi_tmpval = {.f = val};
    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
 }
 static __m256 __lasx_xvreplfr2vr_s(float val) {
    ft_union fi_tmpval = {.f = val};
    return (__m256)__lasx_xvreplgr2vr_w(fi_tmpval.i);
 }
 #endif
 #ifdef __F16C__
 #ifdef _MSC_VER
 #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
 #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
 #else
 #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
 #endif
 #elif defined(__POWER9_VECTOR__)
 #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 /* the inline asm below is about 12% faster than the lookup method */
 #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
 #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 // POWER9 path: converts a half-precision bit pattern to float with inline asm.
 // Operands: %0 = scratch double `d`, %1 = float result `f`, %2 = input `h`.
 // NOTE(review): the three instructions presumably move `h` into a
 // floating-point register, convert half -> double, then round to single -
 // confirm against the Power ISA.
 static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    register float f;
    register double d;
    __asm__(
        "mtfprd %0,%2\n"
        "xscvhpdp %0,%0\n"
        "frsp %1,%0\n" :
        /* temp */ "=d"(d),
        /* out */  "=f"(f):
        /* in */   "r"(h));
    return f;
 }
 // POWER9 path: converts a float to a half-precision bit pattern with inline asm.
 // Operands: %0 = scratch double `d`, %1 = result `r`, %2 = input `f`.
 // NOTE(review): the two instructions presumably convert to half inside a
 // floating-point register and then move the bits to a general register -
 // confirm against the Power ISA.
 static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
    register double d;
    register ggml_fp16_t r;
    __asm__( /* xscvdphp can work on double or single precision */
        "xscvdphp %0,%2\n"
        "mffprd %1,%0\n" :
        /* temp */ "=d"(d),
        /* out */  "=r"(r):
        /* in */   "f"(f));
    return r;
 }
 #else
 // FP16 <-> FP32
 // ref: https://github.com/Maratyszcza/FP16
 // Reinterprets the 32-bit pattern `w` as a float (no numeric conversion).
 static inline float fp32_from_bits(uint32_t w) {
    float value;
    memcpy(&value, &w, sizeof(value));
    return value;
 }
 // Returns the raw 32-bit pattern of the float `f` (no numeric conversion).
 static inline uint32_t fp32_to_bits(float f) {
    uint32_t bits;
    memcpy(&bits, &f, sizeof(bits));
    return bits;
 }
 // Portable half -> float conversion using integer bit manipulation only
 // (see the FP16 reference linked above). Two candidate results are computed,
 // one for inputs with a non-zero exponent field and one for inputs with a
 // zero exponent field; the final select picks between them.
 static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    // Place the 16 input bits in the upper half of a 32-bit word.
    const uint32_t w = (uint32_t) h << 16;
    const uint32_t sign = w & UINT32_C(0x80000000);
    // Doubling shifts the sign bit out, leaving exponent and mantissa at the top.
    const uint32_t two_w = w + w;
    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
    const float exp_scale = 0x1.0p-112f;
 #else
    // Same constant built from its bit pattern when hex float literals are unavailable.
    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
 #endif
    // Candidate for inputs with a non-zero exponent field.
    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
    const uint32_t magic_mask = UINT32_C(126) << 23;
    const float magic_bias = 0.5f;
    // Candidate for inputs with a zero exponent field (zero and subnormals).
    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
    // two_w < 1 << 27 means the five exponent bits of the input are all zero.
    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
    const uint32_t result = sign |
        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
    return fp32_from_bits(result);
 }
 // Portable float -> half conversion using float arithmetic plus integer bit
 // manipulation (see the FP16 reference linked above). Returns the 16-bit
 // pattern: sign bit, then either 0x7E00 for NaN inputs or the computed
 // exponent/mantissa bits.
 static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
 #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
    const float scale_to_inf = 0x1.0p+112f;
    const float scale_to_zero = 0x1.0p-110f;
 #else
    // Same constants built from bit patterns when hex float literals are unavailable.
    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
 #endif
    // Scale |f| up, then back down, in two separate multiplications.
    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
    const uint32_t w = fp32_to_bits(f);
    // Doubling shifts the sign bit out, leaving the exponent in the top byte.
    const uint32_t shl1_w = w + w;
    const uint32_t sign = w & UINT32_C(0x80000000);
    // Exponent byte of the input, clamped from below to 0x71000000.
    uint32_t bias = shl1_w & UINT32_C(0xFF000000);
    if (bias < UINT32_C(0x71000000)) {
        bias = UINT32_C(0x71000000);
    }
    // Adding the bias-derived float to base positions the half-precision bits
    // inside the sum, where they are extracted below.
    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
    const uint32_t bits = fp32_to_bits(base);
    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
    const uint32_t nonsign = exp_bits + mantissa_bits;
    // shl1_w > 0xFF000000: exponent all ones with a non-zero mantissa, i.e. NaN input.
    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
 }
 #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 #endif // __F16C__
 #endif // defined(__ARM_NEON) && !defined(_MSC_VER)
 #ifdef __ARM_FEATURE_SVE
 #include <arm_sve.h>
 #endif // __ARM_FEATURE_SVE
 // precomputed f32 table for f16 (256 KB)
 // defined in ggml.c, initialized in ggml_init()
 extern float ggml_table_f32_f16[1 << 16];
 // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
 // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
 // This is also true for POWER9.
 #if !defined(GGML_FP16_TO_FP32)
 // Converts half -> float by using the 16 stored bits as an index into the
 // precomputed table ggml_table_f32_f16.
 inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
    uint16_t index;
    memcpy(&index, &f, sizeof(index));
    return ggml_table_f32_f16[index];
 }
 #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
 #endif
 #if !defined(GGML_FP32_TO_FP16)
 #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 #endif
 #ifdef __cplusplus
 }
 #endif
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
--- a/ggml/src/ggml-cuda/argmax.cu
+++ b/ggml/src/ggml-cuda/argmax.cu
@ -1,79 +0,0 @@
 #include "common.cuh"
 #include "argmax.cuh"
 #include "sum.cuh"
 #include <cstdint>
 // Per-row argmax of a row-major float matrix with `ncols` columns and `nrows` rows.
 //
 // Launch layout (as used by ggml_cuda_argmax): 1D grid, WARP_SIZE threads per
 // block. Block b processes rows [b*WARP_SIZE, (b+1)*WARP_SIZE) one after the
 // other with all of its threads; thread t then writes the result for row
 // b*WARP_SIZE + t. The shuffle reduction starts at offset 16, i.e. it assumes
 // 32 participating lanes.
 //
 // A row in which no element compares greater than -FLT_MAX yields -1.
 static __global__ void argmax_f32(
    const float * x, int32_t * dst, const int64_t ncols, const int64_t nrows) {
    int argmax_thread = 0;
    const int64_t row0 = (int64_t)blockIdx.x*WARP_SIZE;
 #pragma unroll
    for (int64_t row1 = 0; row1 < WARP_SIZE; ++row1) {
        const int64_t row = row0 + row1;
        // `row` is identical for every thread of the block, so all lanes leave
        // the loop together and the shuffles below stay convergent.
        if (row >= nrows) {
            break;
        }
        float maxval = -FLT_MAX;
        int   argmax = -1;
        for (int32_t col = threadIdx.x; col < ncols; col += WARP_SIZE) {
            const float val = x[row*ncols + col];
            // Use a real select rather than `maxval*not_bigger + val*bigger`:
            // multiplying an infinite value by 0 gives NaN, so a single -inf
            // or +inf entry would otherwise poison the running maximum.
            if (val > maxval) {
                maxval = val;
                argmax = col;
            }
        }
 #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            const float val = __shfl_xor_sync(0xFFFFFFFF, maxval, mask, WARP_SIZE);
            const int   col = __shfl_xor_sync(0xFFFFFFFF, argmax, mask, WARP_SIZE);
            if (val > maxval) {
                maxval = val;
                argmax = col;
            }
        }
        // Only the thread whose index equals row1 keeps this row's result.
        const int store = row1 == threadIdx.x;
        argmax_thread += store*argmax;
    }
    const int row = row0 + threadIdx.x;
    if (row >= nrows) {
        return;
    }
    dst[row] = argmax_thread;
 }
 // Host-side launcher for argmax_f32. Requires a contiguous F32 source and an
 // I32 destination; launches one block of WARP_SIZE threads per WARP_SIZE rows
 // on the context's stream.
 void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_I32);
    GGML_ASSERT(ggml_is_contiguous(src0));
    const int64_t ncols = src0->ne[0];
    const int64_t nrows = ggml_nrows(src0);
    const float * src_data = (const float *) src0->data;
    int32_t     * dst_data = (int32_t     *) dst->data;
    cudaStream_t stream = ctx.stream();
    // Ceiling division: each block handles WARP_SIZE rows.
    const int64_t num_blocks = (nrows + WARP_SIZE - 1) / WARP_SIZE;
    const dim3 grid(num_blocks, 1, 1);
    const dim3 block(WARP_SIZE, 1, 1);
    argmax_f32<<<grid, block, 0, stream>>>(src_data, dst_data, ncols, nrows);
 }
--- a/ggml/src/ggml-cuda/argmax.cuh
+++ b/ggml/src/ggml-cuda/argmax.cuh
@ -1,3 +0,0 @@
 #include "common.cuh"
 void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-cuda/argsort.cu
+++ b/ggml/src/ggml-cuda/argsort.cu
@ -81,7 +81,7 @@ static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, co
    } else if (order == GGML_SORT_ORDER_DESC) {
        k_argsort_f32_i32<GGML_SORT_ORDER_DESC><<<block_nums, block_dims, shared_mem, stream>>>(x, dst, ncols, ncols_pad);
    } else {
-        GGML_ABORT("fatal error");
+        GGML_ASSERT(false);
    }
 }
--- a/ggml/src/ggml-cuda/binbcast.cu
+++ b/ggml/src/ggml-cuda/binbcast.cu
@ -1,5 +1,4 @@
 #include "binbcast.cuh"
 #include <cstdint>
 static __device__ __forceinline__ float op_repeat(const float a, const float b) {
    return b;
@ -10,10 +9,6 @@ static __device__ __forceinline__ float op_add(const float a, const float b) {
    return a + b;
 }
 // Binary element op: returns a - b.
 static __device__ __forceinline__ float op_sub(const float a, const float b) {
    const float difference = a - b;
    return difference;
 }
 // Binary element op: returns a * b.
 static __device__ __forceinline__ float op_mul(const float a, const float b) {
    const float product = a * b;
    return product;
 }
@ -91,30 +86,6 @@ static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * s
    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
 }
 // Sums a (ne02, ne01, ne00) source into a (ne2, ne1, ne0) destination: each
 // destination element accumulates every source element whose index along each
 // dimension equals its own index plus a multiple of that destination
 // dimension's size. One thread per destination element; threads with
 // x index >= ne0 exit early.
 template <typename T>
 static __global__ void k_repeat_back(
    const T * __restrict__ src, T * __restrict__ dst, const int64_t ne00, const int64_t ne01, const int64_t ne02,
    const int64_t ne0, const int64_t ne1, const int64_t ne2) {
    const int64_t out0 = (int64_t) blockIdx.x*blockDim.x + threadIdx.x;
    const int64_t out1 = (int64_t) blockIdx.y*blockDim.y + threadIdx.y;
    const int64_t out2 = (int64_t) blockIdx.z*blockDim.z + threadIdx.z;
    if (out0 >= ne0) {
        return;
    }
    T acc = 0;
    // Outer-to-inner order (dim 2, dim 1, dim 0) fixes the summation order.
    for (int64_t s2 = out2; s2 < ne02; s2 += ne2) {
        for (int64_t s1 = out1; s1 < ne01; s1 += ne1) {
            for (int64_t s0 = out0; s0 < ne00; s0 += ne0) {
                acc += src[s2*ne01*ne00 + s1*ne00 + s0];
            }
        }
    }
    dst[out2*ne1*ne0 + out1*ne0 + out0] = acc;
 }
 template<float (*bin_op)(const float, const float)>
 struct bin_bcast_cuda {
    template<typename src0_t, typename src1_t, typename dst_t>
@ -272,16 +243,6 @@ struct bin_bcast_cuda {
    }
 };
 // Launches k_repeat_back<T> on `stream` with WARP_SIZE threads per block along
 // x and a grid of (ceil(ne0 / WARP_SIZE), ne1, ne2) blocks.
 template <typename T>
 static void repeat_back_cuda(
    const T * src, T * dst, const int64_t ne00, const int64_t ne01, const int64_t ne02,
    const int64_t ne0, const int64_t ne1, const int64_t ne2, cudaStream_t stream) {
    const int64_t blocks_x = (ne0 + WARP_SIZE - 1) / WARP_SIZE;
    const dim3 grid(blocks_x, ne1, ne2);
    const dim3 block(WARP_SIZE, 1, 1);
    k_repeat_back<T><<<grid, block, 0, stream>>>(src, dst, ne00, ne01, ne02, ne0, ne1, ne2);
 }
 template<class op>
 static void ggml_cuda_op_bin_bcast(
    const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
@ -298,7 +259,7 @@ static void ggml_cuda_op_bin_bcast(
    } else {
        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ABORT("fatal error");
+        GGML_ASSERT(false);
    }
 }
@ -310,10 +271,6 @@ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
 }
 // dst = src[0] - src[1], dispatched through the generic broadcast helper on the context's stream.
 void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * lhs = dst->src[0];
    const ggml_tensor * rhs = dst->src[1];
    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_sub>>(lhs, rhs, dst, lhs->data, rhs->data, dst->data, ctx.stream());
 }
 // dst = src[0] * src[1], dispatched through the generic broadcast helper on the context's stream.
 void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * lhs = dst->src[0];
    const ggml_tensor * rhs = dst->src[1];
    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(lhs, rhs, dst, lhs->data, rhs->data, dst->data, ctx.stream());
 }
@ -321,35 +278,3 @@ void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 // Applies op_div to src[0] and src[1] through the generic broadcast helper on the context's stream.
 void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * lhs = dst->src[0];
    const ggml_tensor * rhs = dst->src[1];
    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(lhs, rhs, dst, lhs->data, rhs->data, dst->data, ctx.stream());
 }
 // Host entry point for repeat_back. Requires contiguous tensors of the same
 // type with ne[3] == 1 on both sides and ggml_can_repeat(dst, src0); only
 // F32 is implemented, any other type aborts.
 void ggml_cuda_op_repeat_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    GGML_ASSERT(src0->type == dst->type);
    GGML_ASSERT(ggml_is_contiguous(src0));
    GGML_ASSERT(ggml_is_contiguous(dst));
    GGML_ASSERT(ggml_can_repeat(dst, src0));
    cudaStream_t stream = ctx.stream();
    GGML_ASSERT(src0->ne[3] == 1);
    GGML_ASSERT(dst->ne[3] == 1);
    const int64_t src_ne0 = src0->ne[0];
    const int64_t src_ne1 = src0->ne[1];
    const int64_t src_ne2 = src0->ne[2];
    const int64_t dst_ne0 = dst->ne[0];
    const int64_t dst_ne1 = dst->ne[1];
    const int64_t dst_ne2 = dst->ne[2];
    if (dst->type == GGML_TYPE_F32) {
        const float * src_data = (const float *) src0->data;
        float       * dst_data = (float       *) dst->data;
        repeat_back_cuda<float>(src_data, dst_data, src_ne0, src_ne1, src_ne2, dst_ne0, dst_ne1, dst_ne2, stream);
    } else {
        GGML_ASSERT(false);
    }
 }
--- a/Show More
+++ b/Show More
		`@ -1,3 +0,0 @@`
			`#include "common.cuh"`

			`void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst);`