Compare commits


1 commit

SHA1        Message                                        Date
8cbc363561  coreml : attempt to fix ANE-optimized models   2023-07-11 23:03:53 +03:00

34 changed files with 463 additions and 26407 deletions

.github/workflows/build.yml

@@ -1,41 +1,31 @@
 name: CI
 on: [push, pull_request]

-env:
-  ubuntu_image: "ubuntu:22.04"
-
 jobs:
   ubuntu-latest:
     runs-on: ubuntu-latest

-    strategy:
-      fail-fast: false
-      matrix:
-        arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
-
     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v1

-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+          sudo apt-get install libsdl2-dev

-      - name: Build ${{ matrix.arch }}
+      - name: Build
         run: |
-          docker run --platform ${{ matrix.arch }} --rm \
-            -v ${{ github.workspace }}:/workspace \
-            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
-            apt update
-            apt install -y build-essential libsdl2-dev
-            make
-            make stream'
+          make
+          make stream

   macOS-latest:
     runs-on: macOS-latest

     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v1

       - name: Dependencies
         run: |
@@ -47,104 +37,82 @@ jobs:
           make
           make stream

-  freeBSD-latest:
-    runs-on: macos-12
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v3
-
-      - name: Build
-        uses: cross-platform-actions/action@v0.15.0
-        with:
-          operating_system: freebsd
-          version: '13.2'
-          run: |
-            sudo pkg update
-            sudo pkg install -y gmake sdl2
-            gmake
-            gmake stream
-
   ubuntu-latest-gcc:
     runs-on: ubuntu-latest

     strategy:
-      fail-fast: false
       matrix:
         build: [Debug, Release]
-        arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]

     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v1

-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+          sudo apt-get install cmake
+          sudo apt-get install libsdl2-dev

-      - name: Build ${{ matrix.arch }}
+      - name: Configure
+        run: cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
+
+      - name: Build
         run: |
-          docker run --platform ${{ matrix.arch }} --rm \
-            -v ${{ github.workspace }}:/workspace \
-            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
-            apt update
-            apt install -y build-essential cmake libsdl2-dev
-            cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-            make
-            ctest -L gh --output-on-failure'
+          make
+          ctest -L gh --output-on-failure

   ubuntu-latest-clang:
     runs-on: ubuntu-latest

     strategy:
-      fail-fast: false
       matrix:
         build: [Debug, Release]
-        arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]

     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v1

-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+          sudo apt-get install cmake
+          sudo apt-get install libsdl2-dev

-      - name: Build ${{ matrix.arch }}
+      - name: Configure
+        run: cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
+
+      - name: Build
         run: |
-          docker run --platform ${{ matrix.arch }} --rm \
-            -v ${{ github.workspace }}:/workspace \
-            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
-            apt update
-            apt install -y build-essential cmake libsdl2-dev
-            cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
-            make
-            ctest -L gh --output-on-failure'
+          make
+          ctest -L gh --output-on-failure

   ubuntu-latest-gcc-sanitized:
     runs-on: ubuntu-latest

     strategy:
-      fail-fast: false
       matrix:
         sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        arch: [linux/amd64]

     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v1

-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+          sudo apt-get install cmake

-      - name: Build ${{ matrix.arch }}
+      - name: Configure
+        run: cmake . -DCMAKE_BUILD_TYPE=Debug -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON
+
+      - name: Build
         run: |
-          docker run --platform ${{ matrix.arch }} --rm \
-            -v ${{ github.workspace }}:/workspace \
-            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
-            apt update
-            apt install -y build-essential cmake
-            cmake . -DCMAKE_BUILD_TYPE=Debug -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON
-            make
-            ctest -L gh --output-on-failure'
+          make
+          ctest -L gh --output-on-failure

   windows:
     runs-on: windows-latest
@@ -166,7 +134,7 @@ jobs:
     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v1

       - name: Add msbuild to PATH
         uses: microsoft/setup-msbuild@v1
@@ -227,7 +195,7 @@ jobs:
     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v1

       - name: Add msbuild to PATH
         uses: microsoft/setup-msbuild@v1
@@ -293,7 +261,7 @@ jobs:
     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v1

       - name: Add msbuild to PATH
         uses: microsoft/setup-msbuild@v1
@@ -340,16 +308,24 @@ jobs:
     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v1

-      - name: Setup emsdk
-        uses: mymindstorm/setup-emsdk@v12
+      - name: Dependencies
+        run: |
+          wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz
+          tar -xvf master.tar.gz
+          emsdk-master/emsdk update
+          emsdk-master/emsdk install latest
+          emsdk-master/emsdk activate latest

-      - name: Verify
-        run: emcc -v
+      - name: Configure
+        run: echo "tmp"

       - name: Build
         run: |
+          pushd emsdk-master
+          source ./emsdk_env.sh
+          popd
           emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
           make
@@ -362,7 +338,7 @@ jobs:
     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v1

       - name: Configure
         run: |
@@ -380,7 +356,7 @@ jobs:
     steps:
      - name: Clone
-       uses: actions/checkout@v3
+       uses: actions/checkout@v1

     - name: Install Java
       uses: actions/setup-java@v3
@@ -400,7 +376,7 @@ jobs:
     needs: [ 'windows' ]
     runs-on: windows-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v1
      - name: Install Java
        uses: actions/setup-java@v1
@@ -426,24 +402,11 @@ jobs:
           name: whispercpp.jar
           path: bindings/java/build/libs/whispercpp-*.jar

-      - name: Publish package
-        if: ${{ github.ref == 'refs/heads/master' }}
-        uses: gradle/gradle-build-action@v2
-        with:
-          arguments: publish
-        env:
-          MAVEN_USERNAME: ${{ secrets.OSSRH_USERNAME }}
-          MAVEN_PASSWORD: ${{ secrets.OSSRH_TOKEN }}
-
-  quantize:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v3
-
-      - name: Test quantize
-        run: |
-          ./models/download-ggml-model.sh tiny.en
-          make quantize
-          ./quantize models/ggml-tiny.en.bin models/ggml-tiny.en-q4_0.bin q4_0
+#      - name: Publish package
+#        if: ${{ github.ref == 'refs/heads/master' }}
+#        uses: gradle/gradle-build-action@v2
+#        with:
+#          arguments: publish
+#        env:
+#          MAVEN_USERNAME: ${{ secrets.OSSRH_USERNAME }}
+#          MAVEN_PASSWORD: ${{ secrets.OSSRH_TOKEN }}

.gitignore

@@ -24,7 +24,6 @@ build-sanitize-thread/
 /talk-llama
 /bench
 /quantize
-/lsp

 arm_neon.h
 sync.sh

CMakeLists.txt

@@ -65,7 +65,6 @@ else()
     option(WHISPER_BLAS_VENDOR "whisper: BLAS library vendor" Generic)
     option(WHISPER_OPENBLAS    "whisper: prefer OpenBLAS" OFF)
     option(WHISPER_CUBLAS      "whisper: support for cuBLAS" OFF)
-    option(WHISPER_HIPBLAS     "whisper: support for hipBLAS" OFF)
     option(WHISPER_CLBLAST     "whisper: use CLBlast" OFF)
 endif()
@@ -137,17 +136,6 @@ if (WHISPER_OPENBLAS)
 endif()

 if (WHISPER_BLAS)
-    if (WIN32)
-        if(DEFINED ENV{OPENBLAS_PATH})
-            set(BLAS_LIBRARIES $ENV{OPENBLAS_PATH}/lib/libopenblas.dll.a)
-            message(STATUS "Libraries ${BLAS_LIBRARIES}")
-            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
-            include_directories($ENV{OPENBLAS_PATH}/include)
-            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
-        else ()
-            message(WARNING "BLAS library was not found. Environment variable OPENBLAS_PATH not defined.")
-        endif ()
-    else ()
     set(BLA_STATIC 1)
     set(BLA_VENDOR ${WHISPER_BLAS_VENDOR})
     # set(BLA_PREFER_PKGCONFIG 1)
@@ -157,15 +145,14 @@ if (WHISPER_BLAS)
     if(BLAS_FOUND)
         message(STATUS "BLAS compatible library found")
         message(STATUS "Libraries ${BLAS_LIBRARIES}")
-        find_path(BLAS_INCLUDE_DIRS cblas.h /usr/include/openblas /usr/local/include/openblas $ENV{BLAS_HOME}/include)
         set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
         include_directories(${BLAS_INCLUDE_DIRS})
         set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
     else()
         message(WARNING "BLAS library was not found")
     endif()
     endif ()
-    endif ()

 if (WHISPER_CUBLAS)
     cmake_minimum_required(VERSION 3.17)
@@ -192,37 +179,6 @@ if (WHISPER_CUBLAS)
     endif()
 endif()

-if (WHISPER_HIPBLAS)
-    list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
-
-    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
-        message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
-    endif()
-    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
-        message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
-    endif()
-
-    find_package(hip)
-    find_package(hipblas)
-    find_package(rocblas)
-
-    if (${hipblas_FOUND} AND ${hip_FOUND})
-        message(STATUS "HIP and hipBLAS found")
-        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
-        add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
-        set_property(TARGET ggml-rocm PROPERTY POSITION_INDEPENDENT_CODE ON)
-        set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
-        target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
-
-        if (WHISPER_STATIC)
-            message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
-        endif()
-
-        set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ggml-rocm)
-    else()
-        message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
-    endif()
-endif()
-
 if (WHISPER_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)
@@ -281,14 +237,9 @@ message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
 if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
     message(STATUS "ARM detected")
-elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
-    message(STATUS "PowerPC detected")
 else()
     message(STATUS "x86 detected")
     if (MSVC)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /utf-8")
-        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /utf-8")
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /utf-8")
         if(NOT WHISPER_NO_AVX2)
             set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
             set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")

Makefile

@@ -12,12 +12,6 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif

-ifndef NVCC_VERSION
-	ifeq ($(call,$(shell which nvcc))$(.SHELLSTATUS),0)
-		NVCC_VERSION := $(shell nvcc --version | egrep -o "V[0-9]+.[0-9]+.[0-9]+" | cut -c2-)
-	endif
-endif
-
 CCV := $(shell $(CC) --version | head -n 1)
 CXXV := $(shell $(CXX) --version | head -n 1)
@@ -57,7 +51,19 @@ endif
 # OS specific
 # TODO: support Windows
-ifeq ($(filter $(UNAME_S),Linux Darwin DragonFly FreeBSD NetBSD OpenBSD Haiku),$(UNAME_S))
+ifeq ($(UNAME_S),Linux)
+	CFLAGS   += -pthread
+	CXXFLAGS += -pthread
+endif
+ifeq ($(UNAME_S),Darwin)
+	CFLAGS   += -pthread
+	CXXFLAGS += -pthread
+endif
+ifeq ($(UNAME_S),FreeBSD)
+	CFLAGS   += -pthread
+	CXXFLAGS += -pthread
+endif
+ifeq ($(UNAME_S),Haiku)
 	CFLAGS   += -pthread
 	CXXFLAGS += -pthread
 endif
@@ -67,50 +73,60 @@ endif
 # feel free to update the Makefile for your architecture and send a pull request or issue
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 	ifeq ($(UNAME_S),Darwin)
-		CPUINFO_CMD := sysctl machdep.cpu.features
-	else ifeq ($(UNAME_S),Linux)
-		CPUINFO_CMD := cat /proc/cpuinfo
-	else ifneq (,$(filter MINGW32_NT% MINGW64_NT%,$(UNAME_S)))
-		CPUINFO_CMD := cat /proc/cpuinfo
-	else ifeq ($(UNAME_S),Haiku)
-		CPUINFO_CMD := sysinfo -cpu
-	endif
-
-	ifdef CPUINFO_CMD
-		AVX_M := $(shell $(CPUINFO_CMD) | grep -m 1 "avx ")
-		ifneq (,$(findstring avx,$(AVX_M)))
+		CFLAGS += -mf16c
+		AVX1_M := $(shell sysctl machdep.cpu.features)
+		ifneq (,$(findstring FMA,$(AVX1_M)))
+			CFLAGS += -mfma
+		endif
+		ifneq (,$(findstring AVX1.0,$(AVX1_M)))
 			CFLAGS += -mavx
 		endif
-
-		AVX2_M := $(shell $(CPUINFO_CMD) | grep -m 1 "avx2 ")
+		AVX2_M := $(shell sysctl machdep.cpu.leaf7_features)
+		ifneq (,$(findstring AVX2,$(AVX2_M)))
+			CFLAGS += -mavx2
+		endif
+	else ifeq ($(UNAME_S),Linux)
+		AVX2_M := $(shell grep "avx2 " /proc/cpuinfo)
 		ifneq (,$(findstring avx2,$(AVX2_M)))
 			CFLAGS += -mavx2
 		endif
-
-		FMA_M := $(shell $(CPUINFO_CMD) | grep -m 1 "fma ")
+		FMA_M := $(shell grep "fma " /proc/cpuinfo)
 		ifneq (,$(findstring fma,$(FMA_M)))
 			CFLAGS += -mfma
 		endif
-
-		F16C_M := $(shell $(CPUINFO_CMD) | grep -m 1 "f16c ")
+		F16C_M := $(shell grep "f16c " /proc/cpuinfo)
 		ifneq (,$(findstring f16c,$(F16C_M)))
 			CFLAGS += -mf16c
-
-			AVX1_M := $(shell $(CPUINFO_CMD) | grep -m 1 "avx ")
+			AVX1_M := $(shell grep "avx " /proc/cpuinfo)
 			ifneq (,$(findstring avx,$(AVX1_M)))
 				CFLAGS += -mavx
 			endif
 		endif
-
-		SSE3_M := $(shell $(CPUINFO_CMD) | grep -m 1 "sse3 ")
+		SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
 		ifneq (,$(findstring sse3,$(SSE3_M)))
 			CFLAGS += -msse3
 		endif
-
-		SSSE3_M := $(shell $(CPUINFO_CMD) | grep -m 1 "ssse3 ")
-		ifneq (,$(findstring ssse3,$(SSSE3_M)))
-			CFLAGS += -mssse3
+	else ifeq ($(UNAME_S),Haiku)
+		AVX2_M := $(shell sysinfo -cpu | grep "AVX2 ")
+		ifneq (,$(findstring avx2,$(AVX2_M)))
+			CFLAGS += -mavx2
 		endif
+		FMA_M := $(shell sysinfo -cpu | grep "FMA ")
+		ifneq (,$(findstring fma,$(FMA_M)))
+			CFLAGS += -mfma
+		endif
+		F16C_M := $(shell sysinfo -cpu | grep "F16C ")
+		ifneq (,$(findstring f16c,$(F16C_M)))
+			CFLAGS += -mf16c
+			AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
+			ifneq (,$(findstring avx,$(AVX1_M)))
+				CFLAGS += -mavx
+			endif
+		endif
+	else
+		CFLAGS += -mfma -mf16c -mavx -mavx2
 	endif
 endif

 ifeq ($(UNAME_M),amd64)
@@ -146,56 +162,29 @@ endif
 endif

 ifdef WHISPER_OPENBLAS
-	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
+	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
 	LDFLAGS += -lopenblas
 endif

 ifdef WHISPER_CUBLAS
-	ifeq ($(shell expr $(NVCC_VERSION) \>= 11.6), 1)
-		CUDA_ARCH_FLAG=native
-	else
-		CUDA_ARCH_FLAG=all
-	endif
-
 	CFLAGS      += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	CXXFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	LDFLAGS     += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib
 	WHISPER_OBJ += ggml-cuda.o
 	NVCC        = nvcc
-	NVCCFLAGS   = --forward-unknown-to-host-compiler -arch=$(CUDA_ARCH_FLAG)
+	NVCCFLAGS   = --forward-unknown-to-host-compiler -arch=any

 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif

-ifdef WHISPER_HIPBLAS
-	ROCM_PATH   ?= /opt/rocm
-	HIPCC       ?= $(ROCM_PATH)/bin/hipcc
-	GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
-	CFLAGS      += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
-	CXXFLAGS    += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
-	LDFLAGS     += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
-	LDFLAGS     += -lhipblas -lamdhip64 -lrocblas
-	HIPFLAGS    += $(addprefix --offload-arch=,$(GPU_TARGETS))
-	WHISPER_OBJ += ggml-cuda.o
-
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
-endif
-
 ifdef WHISPER_CLBLAST
-	CFLAGS   += -DGGML_USE_CLBLAST
-	CXXFLAGS += -DGGML_USE_CLBLAST
-	LDFLAGS  += -lclblast
-	ifeq ($(UNAME_S),Darwin)
-		LDFLAGS += -framework OpenCL
-	else
-		LDFLAGS += -lOpenCL
-	endif
+	CFLAGS  += -DGGML_USE_CLBLAST
+	LDFLAGS += -lclblast -lOpenCL
 	WHISPER_OBJ += ggml-opencl.o

 ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+	$(CC) $(CFLAGS) -c $< -o $@
 endif

 ifdef WHISPER_GPROF
@@ -273,7 +262,7 @@ libwhisper.so: ggml.o $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)

 clean:
-	rm -f *.o main stream command talk talk-llama bench quantize lsp libwhisper.a libwhisper.so
+	rm -f *.o main stream command talk talk-llama bench quantize libwhisper.a libwhisper.so

 #
 # Examples
@@ -300,9 +289,6 @@ stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHIS
 command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)

-lsp: examples/lsp/lsp.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/lsp/lsp.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o lsp $(CC_SDL) $(LDFLAGS)
-
 talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)
@@ -323,7 +309,6 @@ samples:
 	@wget --quiet --show-progress -O samples/hp0.ogg https://upload.wikimedia.org/wikipedia/en/d/d4/En.henryfphillips.ogg
 	@wget --quiet --show-progress -O samples/mm1.wav https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav
 	@wget --quiet --show-progress -O samples/a13.mp3 https://upload.wikimedia.org/wikipedia/commons/transcoded/6/6f/Apollo13-wehaveaproblem.ogg/Apollo13-wehaveaproblem.ogg.mp3
-	@wget --quiet --show-progress -O samples/diffusion2023-07-03.flac https://archive.org/download/diffusion2023-07-03/diffusion2023-07-03.flac
 	@echo "Converting to 16-bit WAV ..."
 	@ffmpeg -loglevel -0 -y -i samples/gb0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb0.wav
 	@ffmpeg -loglevel -0 -y -i samples/gb1.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb1.wav
@@ -333,8 +318,6 @@ samples:
 	@rm samples/mm1.wav
 	@ffmpeg -loglevel -0 -y -i samples/a13.mp3 -ar 16000 -ac 1 -c:a pcm_s16le -ss 00:00:00 -to 00:00:30 samples/a13.wav
 	@rm samples/a13.mp3
-	@ffmpeg -loglevel -0 -y -i samples/diffusion2023-07-03.flac -ar 16000 -ac 1 -c:a pcm_s16le samples/diffusion2023-07-03.wav
-	@rm samples/diffusion2023-07-03.flac

 #
 # Models
@@ -376,4 +359,4 @@ tiny.en tiny base.en base small.en small medium.en medium large-v1 large: main
 .PHONY: tests
 tests:
-	bash ./tests/run-tests.sh $(word 2, $(MAKECMDGOALS))
+	bash ./tests/run-tests.sh

README.md

@@ -22,7 +22,6 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
 - [Partial GPU support for NVIDIA via cuBLAS](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
 - [Partial OpenCL GPU support via CLBlast](https://github.com/ggerganov/whisper.cpp#opencl-gpu-support-via-clblast)
 - [BLAS CPU support via OpenBLAS](https://github.com/ggerganov/whisper.cpp#blas-cpu-support-via-openblas)
-- [OpenVINO Support](https://github.com/ggerganov/whisper.cpp#openvino-support)
 - [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)

 Supported platforms:
@@ -61,7 +60,7 @@ Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)
 - Various other examples are available in the [examples](examples) folder

 The tensor operators are optimized heavily for Apple silicon CPUs. Depending on the computation size, Arm Neon SIMD
-intrinsics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
+instrisics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
 the Accelerate framework utilizes the special-purpose AMX coprocessor available in modern Apple products.

 ## Quick start
@@ -312,85 +311,6 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
 For more information about the Core ML implementation please refer to PR [#566](https://github.com/ggerganov/whisper.cpp/pull/566).

-## OpenVINO support
-
-On platforms that support [OpenVINO](https://github.com/openvinotoolkit/openvino), the Encoder inference can be executed
-on OpenVINO-supported devices including x86 CPUs and Intel GPUs (integrated & discrete).
-
-This can result in significant speedup in encoder performance. Here are the instructions for generating the OpenVINO model and using it with `whisper.cpp`:
-
-- First, setup python virtual env. and install python dependencies. Python 3.10 is recommended.
-
-  Windows:
-  ```
-  cd models
-  python -m venv openvino_conv_env
-  openvino_conv_env\Scripts\activate
-  python -m pip install --upgrade pip
-  pip install -r openvino-conversion-requirements.txt
-  ```
-
-  Linux and macOS:
-  ```
-  cd models
-  python3 -m venv openvino_conv_env
-  source openvino_conv_env/bin/activate
-  python -m pip install --upgrade pip
-  pip install -r openvino-conversion-requirements.txt
-  ```
-
-- Generate an OpenVINO encoder model. For example, to generate a `base.en` model, use:
-
-  ```
-  python convert-whisper-to-openvino.py --model base.en
-  ```
-
-  This will produce ggml-base.en-encoder-openvino.xml/.bin IR model files. It's recommended to relocate these to the same folder as ggml models, as that
-  is the default location that the OpenVINO extension will search at runtime.
-
-- Build `whisper.cpp` with OpenVINO support:
-
-  Download OpenVINO package from [release page](https://github.com/openvinotoolkit/openvino/releases). The recommended version to use is [2023.0.0](https://github.com/openvinotoolkit/openvino/releases/tag/2023.0.0).
-
-  After downloading & extracting package onto your development system, set up required environment by sourcing setupvars script. For example:
-
-  Linux:
-  ```bash
-  source /path/to/l_openvino_toolkit_ubuntu22_2023.0.0.10926.b4452d56304_x86_64/setupvars.sh
-  ```
-
-  Windows (cmd):
-  ```
-  C:\Path\To\w_openvino_toolkit_windows_2023.0.0.10926.b4452d56304_x86_64\setupvars.bat
-  ```
-
-  And then build the project using cmake:
-  ```bash
-  cd build
-  cmake -DWHISPER_OPENVINO=1 ..
-  ```
-
-- Run the examples as usual. For example:
-  ```bash
-  ./main -m models/ggml-base.en.bin -f samples/jfk.wav
-
-  ...
-
-  whisper_ctx_init_openvino_encoder: loading OpenVINO model from 'models/ggml-base.en-encoder-openvino.xml'
-  whisper_ctx_init_openvino_encoder: first run on a device may take a while ...
-  whisper_openvino_init: path_model = models/ggml-base.en-encoder-openvino.xml, device = GPU, cache_dir = models/ggml-base.en-encoder-openvino-cache
-  whisper_ctx_init_openvino_encoder: OpenVINO model loaded
-
-  system_info: n_threads = 4 / 8 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | COREML = 0 | OPENVINO = 1 |
-
-  ...
-  ```
-
-  The first time run on an OpenVINO device is slow, since the OpenVINO framework will compile the IR (Intermediate Representation) model to a device-specific 'blob'. This device-specific blob will get
-  cached for the next run.
-
-For more information about the Core ML implementation please refer to PR [#1037](https://github.com/ggerganov/whisper.cpp/pull/1037).

 ## NVIDIA GPU support via cuBLAS

 With NVIDIA cards the Encoder processing can to a large extent be offloaded to the GPU through cuBLAS.

bindings/go/params.go

@@ -19,10 +19,6 @@ func (p *Params) SetTranslate(v bool) {
 	p.translate = toBool(v)
 }

-func (p *Params) SetSplitOnWord(v bool) {
-	p.split_on_word = toBool(v)
-}
-
 func (p *Params) SetNoContext(v bool) {
 	p.no_context = toBool(v)
 }

bindings/go/pkg/whisper/context.go

@@ -81,10 +81,6 @@ func (context *context) SetSpeedup(v bool) {
 	context.params.SetSpeedup(v)
 }

-func (context *context) SetSplitOnWord(v bool) {
-	context.params.SetSplitOnWord(v)
-}
-
 // Set number of threads to use
 func (context *context) SetThreads(v uint) {
 	context.params.SetThreads(int(v))

bindings/go/pkg/whisper/interface.go

@@ -42,7 +42,6 @@ type Context interface {
 	SetDuration(time.Duration)    // Set duration
 	SetThreads(uint)              // Set number of threads to use
 	SetSpeedup(bool)              // Set speedup flag
-	SetSplitOnWord(bool)          // Set split on word flag
 	SetTokenThreshold(float32)    // Set timestamp token probability threshold
 	SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
 	SetMaxSegmentLength(uint)     // Set max segment length in characters

coreml/whisper-decoder-impl.h

@@ -31,10 +31,10 @@ API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((v
 API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
 @interface whisper_decoder_implOutput : NSObject<MLFeatureProvider>

-/// var_1346 as multidimensional array of floats
-@property (readwrite, nonatomic, strong) MLMultiArray * var_1346;
+/// var_1195 as multidimensional array of floats
+@property (readwrite, nonatomic, strong) MLMultiArray * var_1195;
 - (instancetype)init NS_UNAVAILABLE;
-- (instancetype)initWithVar_1346:(MLMultiArray *)var_1346 NS_DESIGNATED_INITIALIZER;
+- (instancetype)initWithVar_1195:(MLMultiArray *)var_1195 NS_DESIGNATED_INITIALIZER;

 @end

coreml/whisper-decoder-impl.m

@@ -39,21 +39,21 @@
 @implementation whisper_decoder_implOutput

-- (instancetype)initWithVar_1346:(MLMultiArray *)var_1346 {
+- (instancetype)initWithVar_1195:(MLMultiArray *)var_1195 {
     self = [super init];
     if (self) {
-        _var_1346 = var_1346;
+        _var_1195 = var_1195;
     }
     return self;
 }

 - (NSSet<NSString *> *)featureNames {
-    return [NSSet setWithArray:@[@"var_1346"]];
+    return [NSSet setWithArray:@[@"var_1195"]];
 }

 - (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
-    if ([featureName isEqualToString:@"var_1346"]) {
-        return [MLFeatureValue featureValueWithMultiArray:self.var_1346];
+    if ([featureName isEqualToString:@"var_1195"]) {
+        return [MLFeatureValue featureValueWithMultiArray:self.var_1195];
     }
     return nil;
 }
@@ -177,7 +177,7 @@
 - (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
     id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
     if (!outFeatures) { return nil; }
-    return [[whisper_decoder_implOutput alloc] initWithVar_1346:(MLMultiArray *)[outFeatures featureValueForName:@"var_1346"].multiArrayValue];
+    return [[whisper_decoder_implOutput alloc] initWithVar_1195:(MLMultiArray *)[outFeatures featureValueForName:@"var_1195"].multiArrayValue];
 }

 - (nullable whisper_decoder_implOutput *)predictionFromToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data error:(NSError * _Nullable __autoreleasing * _Nullable)error {
@@ -192,7 +192,7 @@
     NSMutableArray<whisper_decoder_implOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
     for (NSInteger i = 0; i < outBatch.count; i++) {
         id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
-        whisper_decoder_implOutput * result = [[whisper_decoder_implOutput alloc] initWithVar_1346:(MLMultiArray *)[resultProvider featureValueForName:@"var_1346"].multiArrayValue];
+        whisper_decoder_implOutput * result = [[whisper_decoder_implOutput alloc] initWithVar_1195:(MLMultiArray *)[resultProvider featureValueForName:@"var_1195"].multiArrayValue];
         [results addObject:result];
     }
     return results;

coreml/whisper-encoder.mm

@@ -53,12 +53,10 @@ void whisper_coreml_encode(
                     error: nil
     ];

-    @autoreleasepool {
-        whisper_encoder_implOutput * outCoreML = [(__bridge id) ctx->data predictionFromLogmel_data:inMultiArray error:nil];
+    whisper_encoder_implOutput * outCoreML = [(__bridge id) ctx->data predictionFromLogmel_data:inMultiArray error:nil];

-        memcpy(out, outCoreML.output.dataPointer, outCoreML.output.count * sizeof(float));
-    }
+    memcpy(out, outCoreML.output.dataPointer, outCoreML.output.count * sizeof(float));
 }

 #if __cplusplus
 }

examples/CMakeLists.txt

@@ -69,5 +69,4 @@ else()
     add_subdirectory(quantize)
     add_subdirectory(talk)
     add_subdirectory(talk-llama)
-    add_subdirectory(lsp)
 endif()

examples/lsp/CMakeLists.txt

@@ -1,9 +0,0 @@
if (WHISPER_SDL2)
    # stream
    set(TARGET lsp)
    add_executable(${TARGET} lsp.cpp)

    include(DefaultTargetOptions)

    target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
endif ()

examples/lsp/README.md

@@ -1,104 +0,0 @@
# Language Server
This example consists of a simple language server that exposes both unguided
and guided (command) transcription by sending JSON messages over stdout/stdin,
as well as a rather robust vim plugin that makes use of the language server.
## Vim plugin quick start
Compile the language server with
```bash
make lsp
```
Install the plugin itself by copying or symlinking whisper.vim into ~/.vim/autoload/
In your vimrc, set the path of your whisper.cpp directory and optionally add some keybinds.
```vim
let g:whisper_dir = "~/whisper.cpp"
" Start listening for commands when Ctrl - g is pressed in normal mode
nnoremap <C-G> call whisper#requestCommands()<CR>
" Start unguided transcription when Ctrl - g is pressed in insert mode
inoremap <C-G> <Cmd>call whisper#doTranscription()<CR>
```
## Vim plugin usage
The vim plugin was designed to closely follow the mnemonics of vim
`s:spoken_dict` is used to translate keys to their spoken form.
Keys corresponding to a string use that spoken value normally and when a motion is expected, but use the key itself when a character is expected.
Keys corresponding to a dict, like `i`, can have manual definitions given for each possible commandset.
0 is normal (insert), 1 is motion (inside), 2 is its usage as a single key ([till] i), and 3 is its usage in an area selection (s -> [around] sentence).
Some punctuation items, like `-`, are explicitly given pronunciations to prevent them from being picked up as punctuation instead of an actual command word.
Not all commands tokenize to a single token, and this can interfere with interpretation. "yank", for example, takes multiple tokens and correspondingly gives more accurate detection when only the first "ya" is used. While it could be changed to something else that is a single token (copy), value was placed on maintaining vim mnemonics.
Commands that would normally move the editor into insert mode (insert, append, open, change) will begin unguided transcription.
Unguided transcription will end when a speech segment ends in exit.
Presence of punctuation can be designated by whether or not you add a pause between the previous speech segment and exit.
Exiting only occurs if exit is the last word, so "Take the first exit on your right" would not cause transcription to end.
After a command is evaluated, the plugin will continue listening for the next command.
While in command mode, "Exit" will end listening.
A best-effort approach is taken to keep track of audio recorded while a previous chunk is still processing, and to interpret it immediately afterwards; the current voice detection still needs a fairly sizable gap to determine when a command has been spoken.
Log information is sent to a special `whisper_log` buffer and can be accessed with
```vim
:e whisper_log
```
## Vim plugin configuration
`g:whisper_dir`
A full path to the whisper.cpp repo. It can be expanded in the definition like so:
```vim
let g:whisper_dir = expand("~/whisper.cpp/")
```
(The WHISPER_CPP_HOME environment variable is also checked for users of the existing whisper.nvim script)
`g:whisper_lsp_path`
Can be used to manually set the path to the language server.
If not defined, it will be inferred from the above whisper_dir
`g:whisper_model_path`
A full path to the model to load. If not defined, it will default to ggml-base.en.bin
`g:whisper_user_commands`
A dictionary of spoken commands that correspond to either strings or funcrefs.
This can be used to create connections with other user plugins, for example
```vim
let g:whisper_user_commands = {"gen": "llama#doLlamaGen"}
```
will trigger the llama.cpp plugin to begin generation when "gen" is spoken
## Language server methods
`registerCommandset`
`params` is a list of strings that should be checked for with this commandset. The server prepends a space to these strings before tokenizing.
Responds with
`result.index` an integer index for the commandset registered, which should be included when initiating a guided transcription to select this commandset.
Will return an error if any of the commands in the commandset have duplicate tokenizations
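
As a concrete sketch of the framing (the id, words, and resulting index here are illustrative, not normative), a `registerCommandset` exchange looks like:

```
Content-Length: 89

{"jsonrpc": "2.0", "id": 1, "method": "registerCommandset", "params": ["yank", "delete"]}
```

with a reply of the form `{"jsonrpc": "2.0", "id": 1, "result": {"index": 0}}`. Every message in both directions is length-prefixed this way, matching how the server reads stdin and writes stdout in `process_loop` below.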
`guided`
`params.commandset_index` An index returned by a corresponding commandset registration. If not set, the most recently registered commandset is used.
`params.timestamp` A positive unsigned integer which designates a point in time from which audio should begin processing. If left blank, the start point of audio processing will be the moment the message is received. This should be left blank unless you have a timestamp from a previous response.
Responds with
`result.command_index` The numerical index (starting from 0) of the detected command in the selected commandset
`result.command_text` A string containing the command as provided in the commandset
`result.timestamp` A positive unsigned integer that designates the point in time which audio stopped being processed at. Pass this timestamp back in a subsequent message to mask the latency of transcription.
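
For example, a guided request against the commandset registered above might be (illustrative values again):

```
Content-Length: 82

{"jsonrpc": "2.0", "id": 2, "method": "guided", "params": {"commandset_index": 0}}
```

answered with something like `{"jsonrpc": "2.0", "id": 2, "result": {"command_index": 1, "command_text": "delete", "timestamp": 1688000000000}}`.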
`unguided`
`params.no_context` Sets the corresponding whisper `no_context` param. Defaults to true. Might provide more accurate results for consecutive unguided transcriptions if those after the first are set to false.
`params.prompt` If provided, sets the initial prompt used during transcription.
`params.timestamp` A positive unsigned integer which designates a point in time from which audio should begin processing. If left blank, the start point of audio processing will be the moment the message is received. This should be left blank unless you have a timestamp from a previous response.
Responds with
`result.transcription` A string containing the transcribed text. N.B. This will almost always start with a space due to how text is tokenized.
`result.timestamp` A positive unsigned integer that designates the point in time which audio stopped being processed at. Pass this timestamp back in a subsequent message to mask the latency of transcription.
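
An unguided exchange follows the same shape (values illustrative):

```
Content-Length: 81

{"jsonrpc": "2.0", "id": 3, "method": "unguided", "params": {"no_context": true}}
```

returning something like `{"jsonrpc": "2.0", "id": 3, "result": {"transcription": " hello world", "timestamp": 1688000000000}}`.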

File diff suppressed because it is too large.

examples/lsp/lsp.cpp

@@ -1,458 +0,0 @@
#include "common.h"
#include "common-sdl.h"
#include "whisper.h"
#include "json.hpp"
#include <iostream>
#include <cassert>
#include <cstdio>
#include <string>
#include <thread>
#include <vector>
#include <deque>
#include <set>
using json = nlohmann::json;
// command-line parameters
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t prompt_ms = 5000;
int32_t command_ms = 8000;
int32_t capture_id = -1;
int32_t max_tokens = 32;
int32_t audio_ctx = 0;
float vad_thold = 0.6f;
float freq_thold = 100.0f;
bool speed_up = false;
bool translate = false;
bool print_special = false;
bool print_energy = false;
std::string language = "en";
std::string model = "models/ggml-base.en.bin";
};
struct command {
std::vector<whisper_token> tokens;
std::string plaintext;
};
struct commandset {
std::vector<struct command> commands;
std::vector<whisper_token> prompt_tokens;
// TODO: Store longest command?
// Multi-token commands should have probabilities of subsequent logits
// given that the prior logit is correct.
// In this case, all commands must be iterated.
// This however, is likely highly involved as different tokens
// almost certainly have different spoken lengths
// It would also have performance implications equivalent to a beam search
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params);
exit(0);
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-pms" || arg == "--prompt-ms") { params.prompt_ms = std::stoi(argv[++i]); }
else if (arg == "-cms" || arg == "--command-ms") { params.command_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -pms N, --prompt-ms N [%-7d] prompt duration in milliseconds\n", params.prompt_ms);
fprintf(stderr, " -cms N, --command-ms N [%-7d] command duration in milliseconds\n", params.command_ms);
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, "\n");
}
uint64_t wait_for_vad(audio_async & audio, json jparams, const whisper_params & params, uint64_t maxlength_ms, std::vector<float> & pcmf32) {
using namespace std::chrono;
uint64_t time_now = time_point_cast<milliseconds>(system_clock::now()).time_since_epoch().count();
uint64_t start_time = time_now;
if (jparams.contains("timestamp")) {
start_time = jparams.at("timestamp");
}
if(time_now - start_time < 500) {
//wait for a backlog of audio
std::this_thread::sleep_for(milliseconds(500 - (time_now - start_time)));
time_now = time_point_cast<milliseconds>(system_clock::now()).time_since_epoch().count();
} else if (time_now - start_time > 1000) {
audio.get(time_now-start_time, pcmf32);
size_t max_offset = pcmf32.size() - WHISPER_SAMPLE_RATE;
for(size_t offset=0;offset < max_offset;offset+=WHISPER_SAMPLE_RATE/10) {
std::vector<float> audio_chunk(&pcmf32[offset], &pcmf32[offset+WHISPER_SAMPLE_RATE]);
if(::vad_simple(audio_chunk, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
pcmf32.resize(offset+WHISPER_SAMPLE_RATE);
if (offset*1000/WHISPER_SAMPLE_RATE+1000 > maxlength_ms) {
//remove samples from the beginning
pcmf32.erase(pcmf32.begin(),pcmf32.end()-(maxlength_ms*WHISPER_SAMPLE_RATE/1000));
fprintf(stderr, "Shortened samples");
}
return start_time + offset*1000/WHISPER_SAMPLE_RATE+1000;
}
}
}
size_t window_duration = std::max((uint64_t)1000, time_now-start_time);
audio.get(window_duration, pcmf32);
while (!::vad_simple(pcmf32, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
std::this_thread::sleep_for(milliseconds(100));
time_now = time_point_cast<milliseconds>(system_clock::now()).time_since_epoch().count();
window_duration = std::max((uint64_t)1000,time_now-start_time);
audio.get(window_duration, pcmf32);
}
if (time_now - start_time > maxlength_ms) {
audio.get(maxlength_ms, pcmf32);
} else {
audio.get(time_now - start_time, pcmf32);
}
return time_now;
}
json unguided_transcription(struct whisper_context * ctx, audio_async &audio, json jparams, const whisper_params &params) {
std::vector<whisper_token> prompt_tokens;
std::vector<float> pcmf32;
uint64_t unprocessed_audio_timestamp = wait_for_vad(audio, jparams, params, 10000U, pcmf32);
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
if (jparams.contains("prompt")) {
// unlikely to see much use. Under normal circumstances, no_context would be set to false
std::string prompt = jparams.at("prompt");
prompt_tokens.resize(1024);
int n = whisper_tokenize(ctx, prompt.c_str(), prompt_tokens.data(), 1024);
prompt_tokens.resize(n);
wparams.prompt_tokens = prompt_tokens.data();
wparams.prompt_n_tokens = prompt_tokens.size();
}
wparams.print_progress = false;
wparams.print_special = params.print_special;
wparams.print_realtime = false;
wparams.print_timestamps = false;
wparams.translate = params.translate;
wparams.no_context = jparams.value("no_context", true);
wparams.single_segment = true;
wparams.max_tokens = params.max_tokens;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.suppress_non_speech_tokens = true;
// run the transformer and a single decoding pass
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
fprintf(stderr, "%s: ERROR: whisper_full() failed\n", __func__);
throw json{
{"code", -32803},
{"message", "ERROR: whisper_full() failed"}
};
}
std::string result = whisper_full_get_segment_text(ctx,0);
return json {
{"transcription", result},
{"timestamp", unprocessed_audio_timestamp}
};
}
// command-list mode
// guide the transcription to match the most likely command from a provided list
json guided_transcription(struct whisper_context * ctx, audio_async &audio, const whisper_params &params, json jparams, std::vector<struct commandset> commandset_list) {
struct commandset cs = commandset_list[jparams.value("commandset_index", commandset_list.size()-1)];
std::vector<float> pcmf32;
uint64_t unprocessed_audio_timestamp = wait_for_vad(audio, jparams, params, 2000U, pcmf32);
fprintf(stderr, "%s: Speech detected! Processing ...\n", __func__);
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wparams.print_progress = false;
wparams.print_special = params.print_special;
wparams.print_realtime = false;
wparams.print_timestamps = false;
wparams.translate = params.translate;
wparams.no_context = true;
wparams.single_segment = true;
wparams.max_tokens = 1;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
// TODO: Do some time testing. Does an overly long prompt slow down processing?
// Set up command sets/precompute prompts
wparams.prompt_tokens = cs.prompt_tokens.data();
wparams.prompt_n_tokens = cs.prompt_tokens.size();
// TODO: properly expose as option
wparams.suppress_non_speech_tokens = true;
// run the transformer and a single decoding pass
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
fprintf(stderr, "%s: ERROR: whisper_full() failed\n", __func__);
throw json{
{"code", -32803},
{"message", "ERROR: whisper_full() failed"}//TODO: format string (sprintf?)
};
}
// estimate command probability
// NOTE: not optimal
{
const auto * logits = whisper_get_logits(ctx);
std::vector<float> probs(whisper_n_vocab(ctx), 0.0f);
// compute probs from logits via softmax
{
float max = -1e9;
for (int i = 0; i < (int) probs.size(); ++i) {
max = std::max(max, logits[i]);
}
float sum = 0.0f;
for (int i = 0; i < (int) probs.size(); ++i) {
probs[i] = expf(logits[i] - max);
sum += probs[i];
}
for (int i = 0; i < (int) probs.size(); ++i) {
probs[i] /= sum;
}
}
std::vector<std::pair<float, int>> probs_id;
// In my testing, the most verbose token is always the desired.
// TODO: Trim commandset struct once efficacy has been verified
for (int i = 0; i < (int) cs.commands.size(); ++i) {
probs_id.emplace_back(probs[cs.commands[i].tokens[0]], i);
}
// sort descending
{
using pair_type = decltype(probs_id)::value_type;
std::sort(probs_id.begin(), probs_id.end(), [](const pair_type & a, const pair_type & b) {
return a.first > b.first;
});
}
int id = probs_id[0].second;
return json{
{"command_index", id},
{"command_text", cs.commands[id].plaintext},
{"timestamp", unprocessed_audio_timestamp},
};
}
}
json register_commandset(struct whisper_context * ctx, json jparams, std::vector<struct commandset> &commandset_list) {
// TODO: check for token collision
struct commandset cs;
std::string k_prompt = " select one from the available words: ";
std::set<whisper_token> token_set;
whisper_token tokens[32];
for (std::string s : jparams) {
std::vector<whisper_token> token_vec;
// The existing command implementation uses a nested for loop to tokenize single characters
// I fail to see the purpose of this when ' a' has a wholly different pronunciation than the start of ' apple'
const int n = whisper_tokenize(ctx, (" " + s).c_str(), tokens, 32);
if (n < 0) {
fprintf(stderr, "%s: error: failed to tokenize command '%s'\n", __func__, s.c_str());
return 3;
}
token_vec.push_back(tokens[0]);
if (!token_set.insert(tokens[0]).second) {
fprintf(stderr, "%s: warning: %s is a duplicate of an existing token\n", __func__, s.c_str());
throw json{
{"code",-31000},
{"message", "Duplicate token in token set: " + s}
};
}
if (n > 1) {// empty string if n=0? Should never occur
fprintf(stderr, "%s: error: command is more than a single token: %s\n", __func__, s.c_str());
}
struct command command = {token_vec, s};
cs.commands.push_back(command);
k_prompt += s;
}
k_prompt = k_prompt.substr(0,k_prompt.length()-2) + ". Selected word:";
cs.prompt_tokens.resize(1024);
int n = whisper_tokenize(ctx, k_prompt.c_str(), cs.prompt_tokens.data(), 1024);
cs.prompt_tokens.resize(n);
// prepare response
int index = commandset_list.size();
commandset_list.push_back(cs);
return json{{"index",index}};
}
json seek(struct whisper_context * ctx, audio_async &audio, json params) {
// whisper_state has the pertinent offsets, but there also seem to be a large
// number of scratch buffers that would prevent rewinding context in a manner similar to llama
// I'll give this a another pass once everything else is implemented,
// but for now, it's unsupported
throw json{
{"code", -32601},
{"message", "Seeking is not yet supported."}
};
}
json parse_job(const json &body, struct whisper_context * ctx, audio_async &audio, const whisper_params &params, std::vector<struct commandset> &commandset_list) {
// See: https://www.jsonrpc.org/specification
json id = body.at("id");
try {
std::string version = body.at("jsonrpc");
if (version != "2.0") {
// unsupported version
throw json{
{"code", -3260},
{"message", "invalid jsonrpc version"}
};
}
std::string method = body.at("method");
json jparams = json{{"dummy", "dummy"}};
if (body.contains("params"))
jparams = body.at("params");
json res;
// TODO: be consistent about argument order
fprintf(stderr, "Dispatching a job\n");
if (method == "unguided") { res = unguided_transcription(ctx, audio, jparams, params); }
else if (method == "guided") { res = guided_transcription(ctx, audio, params, jparams, commandset_list); }
else if (method == "seek") { res = seek(ctx, audio, jparams); }
else if (method == "registerCommandset") { res = register_commandset(ctx, jparams, commandset_list); }
else if (method == "echo") { res = jparams; }
return json{
{"jsonrpc", "2.0"},
{"result", res},
{"id", id}
};
} catch(json ex) {
return json {
{"jsonrpc", "2.0"},
{"error", ex},
{"id", id}
};
}
}
void process_loop(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
std::deque<json> jobqueue;
std::vector<struct commandset> commandset_list;
while (true) {
// For eventual cancellation support, shouldn't block if job exists
if (std::cin.rdbuf()->in_avail() > 22 || jobqueue.size() == 0) {
int content_length;
if (scanf("Content-Length: %d", &content_length) != 1) {
fprintf(stderr, "Could not read input: %d", std::cin.peek());
return;
}
// scanf leaves the new lines intact
std::cin.ignore(2);
if (std::cin.peek() != 13) {
// Content-Type. jsonrpc necessitates utf8.
std::cin.ignore(200,10);
}
std::cin.ignore(2);
// A message is being sent and blocking is acceptable
std::string content(content_length,'\0');
std::cin.read(&content[0], content_length);
json job = json::parse(content);
// TODO: Some messages(cancellation) should skip queue here
if (job.is_array()) {
// response must also be batched. Will implement later
// for (subjob : job.begin())
// TODO: At the very least respond with an unsupported error.
} else {
jobqueue.push_back(job);
}
}
assert(jobqueue.size() > 0);
json job = jobqueue.front();
json resp = parse_job(job, ctx, audio, params, commandset_list);
if (resp != "unfinished") {
jobqueue.pop_front();
// send response
std::string data = resp.dump(-1, ' ', false, json::error_handler_t::replace);
// +1 accounts for the trailing newline appended to the body below
fprintf(stdout, "Content-Length: %d\r\n\r\n%s\n", (int) data.length() + 1, data.c_str());
std::cout.flush();
}
}
}
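// A client must frame each request on this example's stdin as
// "Content-Length: <n>\r\n\r\n<body>", matching what process_loop() parses above.
// A sketch of the corresponding client-side writer (send_framed is a hypothetical
// helper, not used by this example; shown for illustration only):
static void send_framed(FILE * out, const std::string & body) {
fprintf(out, "Content-Length: %d\r\n\r\n%s", (int) body.size(), body.c_str());
fflush(out);
// e.g. send_framed(to_server, "{\"jsonrpc\": \"2.0\", \"id\": 1, \"method\": \"echo\", \"params\": {}}");
}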
int main(int argc, char ** argv) {
whisper_params params;
if (whisper_params_parse(argc, argv, params) == false) {
return 1;
}
if (whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params);
exit(1);
}
// whisper init
struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
if (ctx == nullptr) {
fprintf(stderr, "error: failed to initialize whisper context from '%s'\n", params.model.c_str());
return 1;
}
// init audio
audio_async audio(30*1000);
if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
return 1;
}
audio.resume();
// TODO: Investigate why this is required. An extra second of startup latency is not great
// wait for 1 second to avoid any buffered noise
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
audio.clear();
// TODO: consider some sort of indicator to designate loading has finished?
// Potentially better for the client to just start with a non-blocking message (register commands)
process_loop(ctx, audio, params);
audio.pause();
whisper_print_timings(ctx);
whisper_free(ctx);
return 0;
}

View File

@ -1,362 +0,0 @@
if !exists("g:whisper_dir")
let g:whisper_dir = expand($WHISPER_CPP_HOME)
if g:whisper_dir == ""
echoerr "Please provide a path to the whisper.cpp repo in either the $WHISPER_CPP_HOME environment variable, or g:whisper_dir"
endif
endif
if !exists("g:whisper_lsp_path")
let g:whisper_lsp_path = g:whisper_dir .. "lsp"
if !filereadable(g:whisper_lsp_path)
echoerr "Was not able to locate a lsp executable at: " .. g:whisper_lsp_path
throw "Executable not found"
endif
endif
if !exists("g:whisper_model_path")
" TODO: allow custom paths relative to the repo dir
let g:whisper_model_path = g:whisper_dir .. "models/ggml-base.en.bin"
if !filereadable(g:whisper_model_path)
echoerr "Could not find model at: " .. g:whisper_model_path
throw "Model not found"
endif
endif
let s:output_buffer = bufnr("whisper_log", v:true)
call setbufvar(s:output_buffer,"&buftype","nofile")
let s:lsp_command = [g:whisper_lsp_path,"-m",g:whisper_model_path]
" For faster execution. TODO: server load multiple models/run multiple servers?
" let s:lsp_command = [g:whisper_lsp_path, "-m", g:whisper_dir .. "models/ggml-tiny.en.bin", "-ac", "128"]
" requestCommands([params_dict])
func whisper#requestCommands(...)
let l:req = {"method": "guided", "params": {"commandset_index": 0}}
if a:0 > 0
call extend(l:req.params, a:1)
endif
let resp = ch_sendexpr(g:lsp_job, l:req, {"callback": function("s:commandCallback", [l:req.params, 0])})
endfunction
" doTranscription([params_dict])
func whisper#doTranscription(...)
let l:req = {"method": "unguided", "params": {}}
if a:0 > 0
call extend(l:req.params, a:1)
endif
let resp = ch_sendexpr(g:lsp_job, l:req, {"callback": function("s:transcriptionCallback", [function("s:insertText"),function("s:endTranscription")])})
endfunction
" For testing
func whisper#uppertest(cha)
echo tr(a:cha, s:c_lowerkeys, s:c_upperkeys)
endfunction
" (upper, exit, count, motion, command, insert/append, save run) "base"
" (upper, exit, count, motion, command, inside/around) "motion/visual"
" (upper, exit, count, motion, line, inside/around) "command already entered"
" (upper, exit, key, ) "from/till"
" upper and lower keys is used to translate between cases with tr
" Must be sunchronized
let s:c_lowerkeys = "1234567890-=qwertyuiop[]\\asdfghjkl;'zxcvbnm,./\""
let s:c_upperkeys = "!@#$%^&*()_+QWERTYUIOP{}|ASDFGHJKL:\"ZXCVBNM<>?'"
let s:c_count = split("1234567890\"",'\zs')
let s:c_command = split("ryuogpdxcv.iam", '\zs')
let s:c_motion = split("wetf'hjklnb$^)",'\zs')
" object words: Word, Sentence, Paragraph, [, (, <, Tag, {. ", '
let s:c_area = split("wsp])>t}\"'",'\zs')
"Special commands.
let s:c_special_always = ["exit", "upper"]
let s:c_special_normal = ["save", "run", "space"]
" If not in dict, key is spoken word,
" If key resolves to string, value is used for normal/motion, but key for chars
" If key resolves to dict, {0: "normal",1: "motion",2:"single char",3: "area"}
" Missing entries fall back as follows {0: "required", 1: 0, 2: "key", 3: 0}
let s:spoken_dict = {"w": "word", "e": "end", "r": "replace", "t": {0: "till", 3: "tag"}, "y": "yank", "u": "undo", "i": {0: "insert", 1: "inside"}, "o": "open", "p": {0: "paste", 3: "paragraph"}, "a": {0: "append", 1: "around"}, "s": {0: "substitute", 3: "sentence"}, "d": "delete", "f": "from", "g": "go", "h": "left", "j": "down", "k": "up", "l": "right", "c": "change", "v": "visual", "b": "back", "n": "next", "m": "mark", ".": {0: "repeat", 2: "period"}, "]": {0: "bracket", 2: "bracket"}, "'": {0: "jump", 2: "apostrophe", 3: "apostrophe"}, '"': {0: 'register', 2: "quotation", 3: "quotation"}, "-": {0: "minus", 2: "minus"}, "$": {0: "dollar", 2: "dollar"}, "^": {0: "carrot", 2: "carrot"}, ")": {0: "sentence", 2: "parenthesis", 3: "parenthesis"}, "}": {0: "paragraph", 2: "brace", 3: "brace"}, ">": {0: "indent", 2: "angle", 3: "angle"}}
" Give this another pass. This seems overly hacky even if it's functional
let s:sub_tran_msg = ""
func s:subTranProg(msg)
if s:sub_tran_msg != ""
let s:sub_tran_msg = s:sub_tran_msg .. a:msg
if mode() !=? 'v'
exe "normal" "u" .. s:sub_tran_msg
endif
else
if s:command_backlog == ""
" this should not occur
call s:logCallback(0, "Warning: Encountered sub transcription without prior command")
let s:command_backlog = "a"
endif
if a:msg[0] == ' '
let s:sub_tran_msg = s:command_backlog .. a:msg[1:-1]
else
let s:sub_tran_msg = s:command_backlog .. a:msg
endif
if mode() !=? 'v'
exe "normal" s:sub_tran_msg
endif
endif
call appendbufline(s:output_buffer, "$", s:sub_tran_msg .. ":" .. string(a:msg ))
endfunction
func s:subTranFinish(params, timestamp)
let s:repeat_command = s:sub_tran_msg
" Visual selection is lot if used with streaming, so streaming of partial
" transcriptions is disabled in visual mode
if mode() ==? 'v'
exe "normal" s:sub_tran_msg
endif
let s:sub_tran_msg = ""
let s:command_backlog = ""
exe "normal a\<C-G>u"
let l:params = a:params
let l:params.timestamp = a:timestamp
if exists("l:params.commandset_index")
unlet l:params.commandset_index
endif
call whisper#requestCommands(a:params)
endfunction
func s:logCallback(channel, msg)
call appendbufline(s:output_buffer,"$",a:msg)
endfunction
func s:transcriptionCallback(progressCallback, finishedCallback, channel, msg)
let l:tr = a:msg.result.transcription
let l:ex_ind = match(tolower(l:tr),"exit", len(l:tr)-6)
" The worst case I've observed so far is " Exit.", which is 6 characters
if l:ex_ind != -1
call a:progressCallback(strpart(l:tr,0,l:ex_ind-1))
call a:finishedCallback(a:msg.result.timestamp)
else
call a:progressCallback(l:tr)
let req = {"method": "unguided", "params": {"timestamp": a:msg.result.timestamp, "no_context": v:true}}
let resp = ch_sendexpr(g:lsp_job, req, {"callback": function("s:transcriptionCallback", [a:progressCallback, a:finishedCallback])})
endif
endfunc
func s:insertText(msg)
exe "normal a" .. a:msg
endfunction
func s:endTranscription(timestamp)
call appendbufline(s:output_buffer, "$", "Ending unguided transcription")
endfunction
" If a command does not include a whole actionable step, attempting to execute
" it discards the remainder of things. There is likely a simpler solution,
" but it can be made functional now by storing a backbuffer until actionable
let s:command_backlog = ""
let s:repeat_command = ""
let s:preceeding_upper = v:false
func s:commandCallback(params, commandset_index, channel, msg)
let l:command_index = a:msg.result.command_index
let l:do_execute = v:false
let l:next_mode = a:commandset_index
let l:command = s:commandset_list[a:commandset_index][l:command_index]
call s:logCallback(0, string(a:msg) .. " " .. a:commandset_index .. " " .. l:command)
if l:command_index == 0
"exit
"if s:command_backlog == ""
call s:logCallback(0,"Stopping command mode")
echo "No longer listening"
let s:command_backlog = ""
return
"else
" Legacy code to clear an existing buffer with exit.
" Was found to be rarely desired and is better introduced as a
" standalone command (clear?)
" call s:logCallback(0,"Clearing command_backlog" .. s:command_backlog)
" let s:command_backlog = ""
" let s:preceeding_upper = v:false
" endif
elseif l:command_index == 1
" upper
let s:preceeding_upper = !s:preceeding_upper
elseif l:command == "save"
" save and run can only happen in commandset 0,
exe "w"
elseif l:command == "run"
exe "make run"
elseif l:command == "space"
exe "normal i \<ESC>l"
elseif has_key(s:c_user, l:command)
let Userfunc = s:c_user[l:command]
if type(Userfunc) == v:t_string
let Userfunc = function(Userfunc)
endif
call Userfunc()
else
if s:preceeding_upper
" Upper should keep commandset
let s:preceeding_upper = v:false
let l:visual_command = tr(l:command, s:c_lowerkeys, s:c_upperkeys)
else
let l:visual_command = l:command
endif
echo s:command_backlog .. " - " .. l:visual_command
let s:command_backlog = s:command_backlog .. l:visual_command
if a:commandset_index == 2 || a:commandset_index == 3
" single key, either completes motion, replace, or register
" Should move to execute unless part of a register
" Change will be caught at execute
if s:command_backlog[-2:-2] !=# '"'
call s:logCallback(0,"not register")
let l:do_execute = v:true
end
let l:next_mode = 0
" commandset index only matters for a/i
elseif (l:command == "a" || l:command == "i") && a:commandset_index == 1
" inside/around. Is commandset 3
let l:next_mode = 3
elseif l:command ==# '"'
let l:next_mode = 2
elseif index(s:c_count, l:command) != -1
let l:next_mode = a:commandset_index
elseif index(s:c_motion, l:command) != -1
if l:command == 't' || l:command == 'f' || l:command == "'"
" prompt single key
let l:next_mode = 2
else
let l:do_execute = v:true
let l:next_mode = 0
endif
elseif index(s:c_command, l:command) != -1
if index(["y","g","d","c"], s:command_backlog[-1:-1]) != -1 && s:command_backlog[-1:-1] != s:command_backlog[-2:-2] && mode() !=? 'v'
" need motion or repeated command
" Potential for bad state here if disparaging command keys are
" entered (i.e. yd), but vim can handle checks for this at exe
" And checking for cases like y123d would complicate things
let l:next_mode = 1
elseif index(["i","a","c", "o", "s"], l:command) != -1 || s:command_backlog[-1:-1] ==# 'R'
"'Insert' mode, do general transcription
let l:req = {"method": "unguided", "params": a:params}
let l:req.params.timestamp = a:msg.result.timestamp
let l:req.params.no_context = v:true
let resp = ch_sendexpr(g:lsp_job, req, {"callback": function("s:transcriptionCallback", [function("s:subTranProg"), function("s:subTranFinish", [a:params])])})
return
elseif l:command == 'r' || l:command == 'm'
let l:next_mode = 2
elseif l:command == '.'
let l:next_mode = 0
let l:do_execute = v:true
let s:command_backlog = s:command_backlog[0:-2] .. s:repeat_command
else
if l:command ==? 'v'
let l:next_mode = 1
else
let l:next_mode = 0
endif
let l:do_execute = v:true
endif
else
throw "Invalid command state: " .. l:command .. " " .. a:commandset_index .. " " .. s:command_backlog
endif
endif
if l:do_execute
if mode() ==?'v' && l:next_mode == 0
let l:next_mode = 1
elseif match(s:command_backlog, 'c') != -1
let l:req = {"method": "unguided", "params": a:params}
let l:req.params.timestamp = a:msg.result.timestamp
let l:req.params.no_context = v:true
let resp = ch_sendexpr(g:lsp_job, req, {"callback": function("s:transcriptionCallback", [function("s:subTranProg"), function("s:subTranFinish", [a:params])])})
return
endif
exe "normal" s:command_backlog
if index(s:c_motion + ["u"],l:command) == -1
exe "normal a\<C-G>u"
let s:repeat_command = s:command_backlog
call s:logCallback(0, s:command_backlog)
endif
let s:command_backlog = ""
endif
let l:req = {"method": "guided", "params": a:params}
let l:req.params.timestamp = a:msg.result.timestamp
let l:req.params.commandset_index = l:next_mode
let resp = ch_sendexpr(g:lsp_job, l:req, {"callback": function("s:commandCallback",[a:params, l:next_mode])})
endfunction
func s:loadedCallback(channel, msg)
echo "Loading complete"
call s:logCallback(a:channel, a:msg)
endfunction
func s:registerCommandset(commandlist, is_final)
let req = {"method": "registerCommandset"}
let req.params = a:commandlist
call s:logCallback(0, join(a:commandlist))
call add(g:whisper_commandlist_spoken, a:commandlist)
if a:is_final
let resp = ch_sendexpr(g:lsp_job, req, {"callback": "s:loadedCallback"})
else
let resp = ch_sendexpr(g:lsp_job, req, {"callback": "s:logCallback"})
endif
endfunction
func s:registerAllCommands()
let l:normal = s:c_special_always + s:c_special_normal + s:c_count + s:c_command + s:c_motion + keys(s:c_user)
let l:visual = s:c_special_always + s:c_count + s:c_command + s:c_motion
" Currently the same as visual.
" let l:post_command = s:c_special_always + s:c_count + s:c_command + s:c_motion
let l:single_key = s:c_special_always + split(s:c_lowerkeys, '\zs')
let l:area = s:c_special_always + s:c_area
" Used only for compatibility with the testing script
let g:whisper_commandlist_spoken = []
let s:commandset_list = [l:normal, l:visual, l:single_key, l:area]
call s:registerCommandset(s:commandsetToSpoken(l:normal, 0), v:false)
call s:registerCommandset(s:commandsetToSpoken(l:visual, 1), v:false)
call s:registerCommandset(s:commandsetToSpoken(l:single_key, 2), v:false)
call s:registerCommandset(s:commandsetToSpoken(l:area, 3), v:true)
endfunction
func s:commandsetToSpoken(commandset, spoken_index)
let l:spoken_list = []
for l:command in a:commandset
if has_key(s:spoken_dict, l:command)
let l:spoken_value = s:spoken_dict[l:command]
if type(l:spoken_value) == v:t_dict
if has_key(l:spoken_value, a:spoken_index)
let l:spoken_value = l:spoken_value[a:spoken_index]
else
if a:spoken_index == 2
let l:spoken_value = l:command
else
let l:spoken_value = l:spoken_value[0]
endif
endif
else
if a:spoken_index == 2
let l:spoken_value = l:command
endif
endif
else
let l:spoken_value = l:command
endif
call add(l:spoken_list, l:spoken_value)
endfor
return l:spoken_list
endfunction
" TODO: Check lifetime. If the script is resourced, is the existing
" s:lsp_job dropped and therefore killed?
" This seems to not be the case and I've had to deal with zombie processes
" that survive exiting vim, even though said behavior conflicts with my
" understanding of the provided documentation
let s:lsp_opts = {"in_mode": "lsp", "out_mode": "lsp", "err_mode": "nl", "err_io": "buffer", "err_buf": s:output_buffer}
if !exists("g:lsp_job")
if exists("g:whisper_user_commands")
let s:c_user = g:whisper_user_commands
else
let s:c_user = {}
endif
let g:lsp_job = job_start(s:lsp_command, s:lsp_opts)
if job_status(g:lsp_job) == "fail"
echoerr "Failed to start whisper job"
endif
call s:registerAllCommands()
endif
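" Usage: once g:lsp_job is running, dictation can be started with
" :call whisper#doTranscription() and guided command mode with
" :call whisper#requestCommands(); both accept an optional parameter dict
" that is passed through to the server.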

View File

@ -59,7 +59,6 @@ struct whisper_params {
int32_t offset_t_ms = 0; int32_t offset_t_ms = 0;
int32_t offset_n = 0; int32_t offset_n = 0;
int32_t duration_ms = 0; int32_t duration_ms = 0;
int32_t progress_step = 5;
int32_t max_context = -1; int32_t max_context = -1;
int32_t max_len = 0; int32_t max_len = 0;
int32_t best_of = 2; int32_t best_of = 2;
@ -70,7 +69,6 @@ struct whisper_params {
float logprob_thold = -1.00f; float logprob_thold = -1.00f;
bool speed_up = false; bool speed_up = false;
bool debug_mode = false;
bool translate = false; bool translate = false;
bool detect_language = false; bool detect_language = false;
bool diarize = false; bool diarize = false;
@ -88,7 +86,6 @@ struct whisper_params {
bool print_colors = false; bool print_colors = false;
bool print_progress = false; bool print_progress = false;
bool no_timestamps = false; bool no_timestamps = false;
bool log_score = false;
std::string language = "en"; std::string language = "en";
std::string prompt; std::string prompt;
@ -136,8 +133,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); } else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); } else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); } else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
// else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; } else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; } else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; } else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; } else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; }
@ -162,7 +158,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); } else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; } else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; }
else { else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params); whisper_print_usage(argc, argv, params);
@ -192,8 +187,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold); fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold); fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold); fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
// fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false"); fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false"); fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false"); fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false");
@ -217,7 +211,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str()); fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", ""); fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str()); fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
fprintf(stderr, " -ls, --log-score [%-7s] log best decoder scores of tokens\n", params.log_score?"true":"false");
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
@ -225,7 +218,6 @@ struct whisper_print_user_data {
const whisper_params * params; const whisper_params * params;
const std::vector<std::vector<float>> * pcmf32s; const std::vector<std::vector<float>> * pcmf32s;
int progress_prev;
}; };
std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) { std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
@ -260,14 +252,6 @@ std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s
return speaker; return speaker;
} }
void whisper_print_progress_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int progress, void * user_data) {
int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
int * progress_prev = &(((whisper_print_user_data *) user_data)->progress_prev);
if (progress >= *progress_prev + progress_step) {
*progress_prev += progress_step;
fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress);
}
}
void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) { void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
const auto & params = *((whisper_print_user_data *) user_data)->params; const auto & params = *((whisper_print_user_data *) user_data)->params;
@ -492,25 +476,6 @@ bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_
return true; return true;
} }
bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
std::ofstream fout(fname);
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
const int n_segments = whisper_full_n_segments(ctx);
// fprintf(stderr,"segments: %d\n",n_segments);
for (int i = 0; i < n_segments; ++i) {
const int n_tokens = whisper_full_n_tokens(ctx, i);
// fprintf(stderr,"tokens: %d\n",n_tokens);
for (int j = 0; j < n_tokens; j++) {
auto token = whisper_full_get_token_text(ctx, i, j);
auto probability = whisper_full_get_token_p(ctx, i, j);
fout << token << '\t' << probability << std::endl;
// fprintf(stderr,"token: %s %f\n",token,probability);
}
}
return true;
}
bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) { bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
std::ofstream fout(fname); std::ofstream fout(fname);
int indent = 0; int indent = 0;
@ -918,7 +883,6 @@ int main(int argc, char ** argv) {
wparams.split_on_word = params.split_on_word; wparams.split_on_word = params.split_on_word;
wparams.speed_up = params.speed_up; wparams.speed_up = params.speed_up;
wparams.debug_mode = params.debug_mode;
wparams.tdrz_enable = params.tinydiarize; // [TDRZ] wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
@ -931,7 +895,7 @@ int main(int argc, char ** argv) {
wparams.entropy_thold = params.entropy_thold; wparams.entropy_thold = params.entropy_thold;
wparams.logprob_thold = params.logprob_thold; wparams.logprob_thold = params.logprob_thold;
whisper_print_user_data user_data = { &params, &pcmf32s, 0 }; whisper_print_user_data user_data = { &params, &pcmf32s };
// this callback is called on each new segment // this callback is called on each new segment
if (!wparams.print_realtime) { if (!wparams.print_realtime) {
@ -939,11 +903,6 @@ int main(int argc, char ** argv) {
wparams.new_segment_callback_user_data = &user_data; wparams.new_segment_callback_user_data = &user_data;
} }
if (wparams.print_progress) {
wparams.progress_callback = whisper_print_progress_callback;
wparams.progress_callback_user_data = &user_data;
}
// example for abort mechanism // example for abort mechanism
// in this example, we do not abort the processing, but we could if the flag is set to true // in this example, we do not abort the processing, but we could if the flag is set to true
// the callback is called before every encoder run - if it returns false, the processing is aborted // the callback is called before every encoder run - if it returns false, the processing is aborted
@ -1008,12 +967,6 @@ int main(int argc, char ** argv) {
const auto fname_lrc = fname_out + ".lrc"; const auto fname_lrc = fname_out + ".lrc";
output_lrc(ctx, fname_lrc.c_str(), params, pcmf32s); output_lrc(ctx, fname_lrc.c_str(), params, pcmf32s);
} }
// output to score file
if (params.log_score) {
const auto fname_score = fname_out + ".score.txt";
output_score(ctx, fname_score.c_str(), params, pcmf32s);
}
} }
} }

View File

@ -138,7 +138,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
// return false; // return false;
//} //}
char word[129]; char word[128];
for (int i = 0; i < n_vocab; i++) { for (int i = 0; i < n_vocab; i++) {
uint32_t len; uint32_t len;

View File

@ -47,7 +47,6 @@ struct whisper_params {
bool print_special = false; bool print_special = false;
bool no_context = true; bool no_context = true;
bool no_timestamps = false; bool no_timestamps = false;
bool tinydiarize = false;
std::string language = "en"; std::string language = "en";
std::string model = "models/ggml-base.en.bin"; std::string model = "models/ggml-base.en.bin";
@ -81,8 +80,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; } else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; } else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; }
else { else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params); whisper_print_usage(argc, argv, params);
@ -116,7 +113,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str()); fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str()); fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str()); fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false");
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
@ -303,8 +299,6 @@ int main(int argc, char ** argv) {
wparams.audio_ctx = params.audio_ctx; wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up; wparams.speed_up = params.speed_up;
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
// disable temperature fallback // disable temperature fallback
//wparams.temperature_inc = -1.0f; //wparams.temperature_inc = -1.0f;
wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc; wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc;
@ -350,19 +344,10 @@ int main(int argc, char ** argv) {
const int64_t t0 = whisper_full_get_segment_t0(ctx, i); const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i); const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
std::string output = "[" + to_timestamp(t0) + " --> " + to_timestamp(t1) + "] " + text; printf ("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
if (whisper_full_get_segment_speaker_turn_next(ctx, i)) {
output += " [SPEAKER_TURN]";
}
output += "\n";
printf("%s", output.c_str());
fflush(stdout);
if (params.fname_out.length() > 0) { if (params.fname_out.length() > 0) {
fout << output; fout << "[" << to_timestamp(t0) << " --> " << to_timestamp(t1) << "] " << text << std::endl;
} }
} }
} }

View File

@ -18,9 +18,6 @@ android {
vectorDrawables { vectorDrawables {
useSupportLibrary true useSupportLibrary true
} }
ndk {
abiFilters 'arm64-v8a', 'armeabi-v7a', 'x86', 'x86_64'
}
} }
buildTypes { buildTypes {
@ -45,8 +42,8 @@ android {
} }
ndkVersion "25.1.8937393" ndkVersion "25.1.8937393"
externalNativeBuild { externalNativeBuild {
cmake { ndkBuild {
path = file("src/main/jni/whisper/CMakeLists.txt") path 'src/main/jni/whisper/Android.mk'
} }
} }
packagingOptions { packagingOptions {

View File

@ -0,0 +1,26 @@
LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)
LOCAL_MODULE := libwhisper
include $(LOCAL_PATH)/Whisper.mk
include $(BUILD_SHARED_LIBRARY)
ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
include $(CLEAR_VARS)
LOCAL_MODULE := libwhisper_vfpv4
include $(LOCAL_PATH)/Whisper.mk
# Allow building NEON FMA code.
# https://android.googlesource.com/platform/ndk/+/master/sources/android/cpufeatures/cpu-features.h
LOCAL_CFLAGS += -mfpu=neon-vfpv4
include $(BUILD_SHARED_LIBRARY)
endif
ifeq ($(TARGET_ARCH_ABI),arm64-v8a)
include $(CLEAR_VARS)
LOCAL_MODULE := libwhisper_v8fp16_va
include $(LOCAL_PATH)/Whisper.mk
# Allow building NEON FMA code.
# https://android.googlesource.com/platform/ndk/+/master/sources/android/cpufeatures/cpu-features.h
LOCAL_CFLAGS += -march=armv8.2-a+fp16
include $(BUILD_SHARED_LIBRARY)
endif

View File

@ -0,0 +1 @@
APP_STL := c++_static

View File

@ -1,53 +0,0 @@
cmake_minimum_required(VERSION 3.10)
project(whisper.cpp)
set(CMAKE_CXX_STANDARD 11)
set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../../)
set(
SOURCE_FILES
${WHISPER_LIB_DIR}/ggml.c
${WHISPER_LIB_DIR}/whisper.cpp
${CMAKE_SOURCE_DIR}/jni.c
)
find_library(LOG_LIB log)
function(build_library target_name)
add_library(
${target_name}
SHARED
${SOURCE_FILES}
)
target_link_libraries(${target_name} ${LOG_LIB} android)
if (${target_name} STREQUAL "whisper_v8fp16_va")
target_compile_options(${target_name} PRIVATE -march=armv8.2-a+fp16)
elseif (${target_name} STREQUAL "whisper_vfpv4")
target_compile_options(${target_name} PRIVATE -mfpu=neon-vfpv4)
endif ()
if (NOT ${CMAKE_BUILD_TYPE} STREQUAL "Debug")
target_compile_options(${target_name} PRIVATE -O3)
target_compile_options(${target_name} PRIVATE -fvisibility=hidden -fvisibility-inlines-hidden)
target_compile_options(${target_name} PRIVATE -ffunction-sections -fdata-sections)
target_link_options(${target_name} PRIVATE -Wl,--gc-sections)
target_link_options(${target_name} PRIVATE -Wl,--exclude-libs,ALL)
target_link_options(${target_name} PRIVATE -flto)
endif ()
endfunction()
build_library("whisper") # Default target
if (${ANDROID_ABI} STREQUAL "arm64-v8a")
build_library("whisper_v8fp16_va")
elseif (${ANDROID_ABI} STREQUAL "armeabi-v7a")
build_library("whisper_vfpv4")
endif ()
include_directories(${WHISPER_LIB_DIR})

View File

@ -0,0 +1,18 @@
WHISPER_LIB_DIR := $(LOCAL_PATH)/../../../../../../../
LOCAL_LDLIBS := -landroid -llog
# Make the final output library smaller by only keeping the symbols referenced from the app.
ifneq ($(APP_OPTIM),debug)
LOCAL_CFLAGS += -O3
LOCAL_CFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden
LOCAL_CFLAGS += -ffunction-sections -fdata-sections
LOCAL_LDFLAGS += -Wl,--gc-sections
LOCAL_LDFLAGS += -Wl,--exclude-libs,ALL
LOCAL_LDFLAGS += -flto
endif
LOCAL_CFLAGS += -DSTDC_HEADERS -std=c11 -I $(WHISPER_LIB_DIR)
LOCAL_CPPFLAGS += -std=c++11
LOCAL_SRC_FILES := $(WHISPER_LIB_DIR)/ggml.c \
$(WHISPER_LIB_DIR)/whisper.cpp \
$(LOCAL_PATH)/jni.c

View File

@ -6,60 +6,9 @@
#include <atomic> #include <atomic>
#include <assert.h> #include <assert.h>
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#include <rocblas/rocblas.h>
#define CUBLAS_OP_N HIPBLAS_OP_N
#define CUBLAS_OP_T HIPBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH 0
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasCreate hipblasCreate
#define cublasGetStatusString rocblas_status_to_string
#define cublasHandle_t hipblasHandle_t
#define cublasLoggerConfigure(logIsOn, logToStdOut, logToStdErr, logFileName) CUBLAS_STATUS_SUCCESS
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
#define cublasSetStream hipblasSetStream
#define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDestroy hipEventDestroy
#define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventRecord hipEventRecord
#define cudaEvent_t hipEvent_t
#define cudaFree hipFree
#define cudaFreeHost hipHostFree
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaMalloc hipMalloc
#define cudaMallocHost hipHostMalloc
#define cudaMemcpy hipMemcpy
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyKind hipMemcpyKind
#define cudaMemset hipMemset
#define cudaSetDevice hipSetDevice
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#else
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <cublas_v2.h> #include <cublas_v2.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
#endif
#include "ggml-cuda.h" #include "ggml-cuda.h"
#include "ggml.h" #include "ggml.h"

View File

@ -653,13 +653,13 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const int in = tid - step*im; // 0...15 or 0...7 const int in = tid - step*im; // 0...15 or 0...7
\n#if K_QUANTS_PER_ITERATION == 1\n #if K_QUANTS_PER_ITERATION == 1
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
const int is = 0; const int is = 0;
\n#else\n #else
const int l0 = 4 * in; // 0, 4, 8, ..., 28 const int l0 = 4 * in; // 0, 4, 8, ..., 28
const int is = in / 4; const int is = in / 4;
\n#endif\n #endif
const int ql_offset = 64*im + l0; const int ql_offset = 64*im + l0;
const int qh_offset = 32*im + l0; const int qh_offset = 32*im + l0;
const int s_offset = 8*im + is; const int s_offset = 8*im + is;
@ -676,7 +676,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
const float d = vload_half(0, &x[i].d); const float d = vload_half(0, &x[i].d);
\n#if K_QUANTS_PER_ITERATION == 1\n #if K_QUANTS_PER_ITERATION == 1
float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
+ y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
+ y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
@ -686,7 +686,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
+ y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
+y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
tmp[16 * ix + tid] += sum; tmp[16 * ix + tid] += sum;
\n#else\n #else
float sum = 0; float sum = 0;
for (int l = 0; l < 4; ++l) { for (int l = 0; l < 4; ++l) {
sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
@ -695,7 +695,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
+ y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
} }
tmp[16 * ix + tid] += sum; tmp[16 * ix + tid] += sum;
\n#endif\n #endif
} }

23
ggml.c
View File

@ -292,7 +292,7 @@ typedef double ggml_float;
#if defined(_MSC_VER) || defined(__MINGW32__) #if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h> #include <intrin.h>
#else #else
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) #if !defined(__riscv)
#include <immintrin.h> #include <immintrin.h>
#endif #endif
#endif #endif
@ -663,7 +663,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
} }
static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
#ifdef __AVXVNNI__ #if __AVXVNNI__
const __m256i zero = _mm256_setzero_si256(); const __m256i zero = _mm256_setzero_si256();
const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
return _mm256_cvtepi32_ps(summed_pairs); return _mm256_cvtepi32_ps(summed_pairs);
@ -676,7 +676,7 @@ static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy)
// multiply int8_t, add results pairwise twice and return as float vector // multiply int8_t, add results pairwise twice and return as float vector
static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
#ifdef __AVXVNNIINT8__ #if __AVXVNNIINT8__
const __m256i zero = _mm256_setzero_si256(); const __m256i zero = _mm256_setzero_si256();
const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y); const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y);
return _mm256_cvtepi32_ps(summed_pairs); return _mm256_cvtepi32_ps(summed_pairs);
@ -692,7 +692,7 @@ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
static inline __m128i packNibbles( __m256i bytes ) static inline __m128i packNibbles( __m256i bytes )
{ {
// Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
#ifdef __AVX512F__ #if __AVX512F__
const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000 const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000
bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh
return _mm256_cvtepi16_epi8(bytes); // abcd_efgh return _mm256_cvtepi16_epi8(bytes); // abcd_efgh
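An aside on the #ifdef vs #if hunks in this file (__AVXVNNI__, __AVXVNNIINT8__, __AVX512F__): both forms compile, but #if evaluates an undefined macro as 0 (and trips -Wundef), while #ifdef or #if defined(...) tests only definedness. A standalone illustration, not part of the diff:

#include <cstdio>
int main() {
#if defined(__AVX512F__)
// only reached when the compiler advertises AVX-512F (e.g. -mavx512f)
std::printf("AVX-512F path\n");
#else
std::printf("portable path\n");
#endif
return 0;
}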
@ -4949,13 +4949,6 @@ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * nam
return tensor; return tensor;
} }
#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((gnu_format(printf, 2, 3)))
#else
__attribute__((format(printf, 2, 3)))
#endif
#endif
struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) { struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
va_list args; va_list args;
va_start(args, fmt); va_start(args, fmt);
@ -18728,14 +18721,6 @@ int ggml_cpu_has_sse3(void) {
#endif #endif
} }
int ggml_cpu_has_ssse3(void) {
#if defined(__SSSE3__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_vsx(void) { int ggml_cpu_has_vsx(void) {
#if defined(__POWER9_VECTOR__) #if defined(__POWER9_VECTOR__)
return 1; return 1;

1
ggml.h
View File

@ -1508,7 +1508,6 @@ extern "C" {
GGML_API int ggml_cpu_has_clblast (void); GGML_API int ggml_cpu_has_clblast (void);
GGML_API int ggml_cpu_has_gpublas (void); GGML_API int ggml_cpu_has_gpublas (void);
GGML_API int ggml_cpu_has_sse3 (void); GGML_API int ggml_cpu_has_sse3 (void);
GGML_API int ggml_cpu_has_ssse3 (void);
GGML_API int ggml_cpu_has_vsx (void); GGML_API int ggml_cpu_has_vsx (void);
// //

View File

@ -7,7 +7,6 @@ from torch import Tensor
from torch import nn from torch import nn
from typing import Dict from typing import Dict
from typing import Optional from typing import Optional
from ane_transformers.reference.layer_norm import LayerNormANE as LayerNormANEBase
from coremltools.models.neural_network.quantization_utils import quantize_weights from coremltools.models.neural_network.quantization_utils import quantize_weights
from whisper.model import Whisper, AudioEncoder, TextDecoder, ResidualAttentionBlock, MultiHeadAttention, ModelDimensions from whisper.model import Whisper, AudioEncoder, TextDecoder, ResidualAttentionBlock, MultiHeadAttention, ModelDimensions
from whisper import load_model from whisper import load_model
@ -32,12 +31,12 @@ def correct_for_bias_scale_order_inversion(state_dict, prefix, local_metadata,
state_dict[prefix + 'bias'] = state_dict[prefix + 'bias'] / state_dict[prefix + 'weight'] state_dict[prefix + 'bias'] = state_dict[prefix + 'bias'] / state_dict[prefix + 'weight']
return state_dict return state_dict
class LayerNormANE(LayerNormANEBase): class LayerNorm(nn.LayerNorm):
def forward(self, x: Tensor) -> Tensor:
def __init__(self, *args, **kwargs): x = x.transpose(1,3)
super().__init__(*args, **kwargs) x = super().forward(x)
self._register_load_state_dict_pre_hook( x = x.transpose(1,3)
correct_for_bias_scale_order_inversion) return x
class MultiHeadAttentionANE(MultiHeadAttention): class MultiHeadAttentionANE(MultiHeadAttention):
def __init__(self, n_state: int, n_head: int): def __init__(self, n_state: int, n_head: int):
@ -104,9 +103,9 @@ class ResidualAttentionBlockANE(ResidualAttentionBlock):
def __init__(self, n_state: int, n_head: int, cross_attention: bool = False): def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
super().__init__(n_state, n_head, cross_attention) super().__init__(n_state, n_head, cross_attention)
self.attn = MultiHeadAttentionANE(n_state, n_head) self.attn = MultiHeadAttentionANE(n_state, n_head)
self.attn_ln = LayerNormANE(n_state) self.attn_ln = LayerNorm(n_state)
self.cross_attn = MultiHeadAttentionANE(n_state, n_head) if cross_attention else None self.cross_attn = MultiHeadAttentionANE(n_state, n_head) if cross_attention else None
self.cross_attn_ln = LayerNormANE(n_state) if cross_attention else None self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None
n_mlp = n_state * 4 n_mlp = n_state * 4
self.mlp = nn.Sequential( self.mlp = nn.Sequential(
@ -114,7 +113,7 @@ class ResidualAttentionBlockANE(ResidualAttentionBlock):
nn.GELU(), nn.GELU(),
nn.Conv2d(n_mlp, n_state, kernel_size=1) nn.Conv2d(n_mlp, n_state, kernel_size=1)
) )
self.mlp_ln = LayerNormANE(n_state) self.mlp_ln = LayerNorm(n_state)
class AudioEncoderANE(AudioEncoder): class AudioEncoderANE(AudioEncoder):
@ -124,7 +123,7 @@ class AudioEncoderANE(AudioEncoder):
self.blocks = nn.ModuleList( self.blocks = nn.ModuleList(
[ResidualAttentionBlockANE(n_state, n_head) for _ in range(n_layer)] [ResidualAttentionBlockANE(n_state, n_head) for _ in range(n_layer)]
) )
self.ln_post = LayerNormANE(n_state) self.ln_post = LayerNorm(n_state)
def forward(self, x: Tensor): def forward(self, x: Tensor):
""" """
@ -168,7 +167,7 @@ class TextDecoderANE(TextDecoder):
self.blocks= nn.ModuleList( self.blocks= nn.ModuleList(
[ResidualAttentionBlockANE(n_state, n_head, cross_attention=True) for _ in range(n_layer)] [ResidualAttentionBlockANE(n_state, n_head, cross_attention=True) for _ in range(n_layer)]
) )
self.ln= LayerNormANE(n_state) self.ln= LayerNorm(n_state)
def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None): def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
""" """

View File

@ -8,7 +8,7 @@
wd=$(dirname "$0") wd=$(dirname "$0")
cd "$wd/../" cd "$wd/../"
python3 models/convert-whisper-to-coreml.py --model tiny.en python3 models/convert-whisper-to-coreml.py --model tiny.en --optimize-ane True
mv -v models/coreml-encoder-tiny.en.mlpackage models/whisper-encoder-impl.mlpackage mv -v models/coreml-encoder-tiny.en.mlpackage models/whisper-encoder-impl.mlpackage
xcrun coremlc generate models/whisper-encoder-impl.mlpackage coreml/ xcrun coremlc generate models/whisper-encoder-impl.mlpackage coreml/

View File

@ -13,7 +13,7 @@ mname="$1"
wd=$(dirname "$0") wd=$(dirname "$0")
cd "$wd/../" cd "$wd/../"
python3 models/convert-whisper-to-coreml.py --model $mname --encoder-only True python3 models/convert-whisper-to-coreml.py --model $mname --encoder-only True --optimize-ane True
xcrun coremlc compile models/coreml-encoder-${mname}.mlpackage models/ xcrun coremlc compile models/coreml-encoder-${mname}.mlpackage models/
rm -rf models/ggml-${mname}-encoder.mlmodelc rm -rf models/ggml-${mname}-encoder.mlmodelc

File diff suppressed because it is too large

View File

@ -67,7 +67,6 @@ extern "C" {
struct whisper_context; struct whisper_context;
struct whisper_state; struct whisper_state;
struct whisper_full_params;
typedef int whisper_token; typedef int whisper_token;
@ -346,7 +345,7 @@ extern "C" {
void * user_data); void * user_data);
// Parameters for the whisper_full() function // Parameters for the whisper_full() function
// If you change the order or add new parameters, make sure to update the default values in whisper.cpp: // If you chnage the order or add new parameters, make sure to update the default values in whisper.cpp:
// whisper_full_default_params() // whisper_full_default_params()
struct whisper_full_params { struct whisper_full_params {
enum whisper_sampling_strategy strategy; enum whisper_sampling_strategy strategy;
@ -375,7 +374,6 @@ extern "C" {
// [EXPERIMENTAL] speed-up techniques // [EXPERIMENTAL] speed-up techniques
// note: these can significantly reduce the quality of the output // note: these can significantly reduce the quality of the output
bool speed_up; // speed-up the audio by 2x using Phase Vocoder bool speed_up; // speed-up the audio by 2x using Phase Vocoder
bool debug_mode; // enable debug_mode provides extra info (eg. Dump log_mel)
int audio_ctx; // overwrite the audio context size (0 = use default) int audio_ctx; // overwrite the audio context size (0 = use default)
// [EXPERIMENTAL] [TDRZ] tinydiarize // [EXPERIMENTAL] [TDRZ] tinydiarize
@ -519,11 +517,6 @@ extern "C" {
WHISPER_API int whisper_bench_ggml_mul_mat (int n_threads); WHISPER_API int whisper_bench_ggml_mul_mat (int n_threads);
WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads); WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
// Control logging output; default behavior is to print to stderr
typedef void (*whisper_log_callback)(const char * line);
WHISPER_API void whisper_set_log_callback(whisper_log_callback callback);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif