Compare commits


401 Commits

Author SHA1 Message Date
ceb77363cd ggml : disable CUDA graphs for non-llama.cpp projects 2024-06-26 20:14:22 +03:00
dc8cc2dd6f whisper : disable CUDA mel + fix FFMPEG 2024-06-26 20:11:38 +03:00
3efedb9511 sync : ggml 2024-06-26 19:40:23 +03:00
e30c679928 whisper : reorganize source code + improve CMake (#2256)
* scripts : update sync [no ci]

* files : reorganize [no ci]

* sync : llama.cpp

* cmake : link math library

* cmake : build normal ggml library

* files : move headers to include

* objc : fix path to ggml-metal.h

* ci : fix WHISPER_CUDA -> GGML_CUDA

* scripts : sync LICENSE [no ci]
2024-06-26 19:34:09 +03:00
bf4cb4abad whisper : optimize fft() function (#2242)
Co-authored-by: Mike Fan <60965742+mike-fzy@users.noreply.github.com>
2024-06-18 18:10:33 +03:00
e293f17d34 talk-llama : sync llama.cpp 2024-06-18 09:45:37 +03:00
5d950c4b8d whisper : use ggml_backend_sched (#2239)
* whisper : use ggml_backend_sched (wip)

* use sched in whisper_allocr

* whisper : single backend in whisper_context

* whisper : remove whisper_state->backends_used

* whisper : remove whisper_context->backend

* whisper : reset scheduler after init

* whisper : fix external encoder (e.g. CoreML)

* whisper : cleanup

* whisper : handle null GPU buffer types + fix sycl

---------

Co-authored-by: slaren <slarengh@gmail.com>
2024-06-18 09:39:40 +03:00
820446e230 fix : remove extra files 2024-06-18 09:39:40 +03:00
54d5823ebe scripts : sync ggml-blas 2024-06-18 09:39:40 +03:00
5181494e9f build : update make / cmake 2024-06-18 09:39:40 +03:00
4a6e6e8b30 sync : ggml 2024-06-18 09:39:40 +03:00
de29b193f6 move BLAS to a separate backend (cont) (llama/6210)
ggml-ci
2024-06-18 09:39:40 +03:00
922971041b Vulkan Shader Refactor, Memory Debugging Option (llama/7947)
* Refactor shaders, extract GLSL code from ggml_vk_generate_shaders.py into vulkan-shaders directory

* Improve debug log code

* Add memory debug output option

* Fix flake8

* Fix unnecessary high llama-3 VRAM use
2024-06-18 09:39:40 +03:00
63a767a134 scripts : stop sync whisper example from ggml 2024-06-18 09:39:40 +03:00
30841fa786 cmake : fix sycl build (#0) 2024-06-16 18:19:48 +03:00
3b1ac03828 ggml : remove OpenCL (#0) 2024-06-16 18:19:48 +03:00
990de617b5 sycl : sync (#0) 2024-06-16 18:19:48 +03:00
6975600b4b cuda : enable CUDA graphs (#0) 2024-06-16 18:19:48 +03:00
061eeb9f61 talk-llama : sync llama.cpp 2024-06-16 18:19:48 +03:00
4942b1b428 cmake : fix CUDA build (#0) 2024-06-16 18:19:48 +03:00
3c7cc5c437 sync : ggml
ggml-ci
2024-06-16 18:19:48 +03:00
5cd42ee2cc ggml : fix and optimize ppc64le (ggml/849)
* fix compile issues introduced by loongarch_asx

* restore quant changes to merge

* fix compile issues introduced by loongarch_asx

* further optimize by using vec_msum & vec_sum4s on ppc64le
2024-06-16 18:19:48 +03:00
ee718f3da6 ggml : remove duplicate include of ggml-common.h (ggml/853)
Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
2024-06-16 18:19:48 +03:00
63eac1f608 remove global variables (llama/7710)
* separate DPCT helpers outside

* replace global variables with context

* remove useless extra

* update mul_mat condition

* remove duplicate buft initialization

* remove duplicate extra and global work group size

* remove useless backend check

* remove duplicated extras

* use macro for group_size and remove cuda-related
2024-06-16 18:19:48 +03:00
b17ba2815b CUDA: faster q2_K, q3_K MMQ + int8 tensor cores (llama/7921)
* CUDA: faster q2_K, q3_K MMQ + int8 tensor cores

* try CI fix

* try CI fix

* try CI fix

* fix data race

* revert q2_K precision-related changes
2024-06-16 18:19:48 +03:00
7a489af2f3 metal : utilize max shared memory for mul_mat_id (llama/7935) 2024-06-16 18:19:48 +03:00
4a4ea13d6d rpc : fix ggml_backend_rpc_supports_buft() (llama/7918) 2024-06-16 18:19:48 +03:00
174a461fc6 move BLAS to a separate backend (llama/6210)
* move BLAS to a separate backend

* rename GGML_USE_OPENBLAS to GGML_USE_BLAS

* alloc : reuse same buffer when the same buffer type is used multiple times

* set number of threads automatically for openblas and blis

* sched : print assignments when GGML_SCHED_DEBUG env variable is set

* sched : allow ops with weights on an incompatible buffer type

This will cause the weight to be copied to a backend that supports the
op, which is very costly. The weight should have been stored in a buffer
of a backend that can run the op, but llama.cpp cannot do this
automatically at the moment.

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-06-16 18:19:48 +03:00
d8b7a24bc9 CUDA: fix broken oob check for FA vec f32 kernel (llama/7904) 2024-06-16 18:19:48 +03:00
acf3832c9c tests : add non-cont unary tests (llama/7857)
* tests : add non-cont unary tests

* ggml : update unary asserts and "supports_op"

ggml-ci
2024-06-16 18:19:48 +03:00
d29ac44303 ggml : improve ggml_is_contiguous logic (llama/7856)
* ggml : improve ggml_is_contiguous logic

ggml-ci

* ggml : support more contiguous cases

ggml-ci
2024-06-16 18:19:48 +03:00
12638dfef0 vulkan: select only one device for single gpu with multiple drivers (llama/7582) 2024-06-16 18:19:48 +03:00
f100b3b523 Update Vulkan RoPE implementation (llama/7818)
* Update Vulkan RoPE implementation

* Return nullptr on alloc_buffer when allocation fails, instead of throwing an exception

Minor fixes

* Fix segfault when running out of VRAM

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
2024-06-16 18:19:48 +03:00
a99e213a82 CUDA: int8 tensor cores for MMQ (q4_K, q5_K, q6_K) (llama/7860) 2024-06-16 18:19:48 +03:00
7483d2b61c CUDA: use tensor cores for MMQ (llama/7676)
* CUDA: int8 tensor cores for MMQ (legacy quants)

* fix out-of-bounds writes

* __builtin_assume -> GGML_CUDA_ASSUME

* fix writeback returning too early
2024-06-16 18:19:48 +03:00
1fe5948227 use the correct SYCL context for host USM allocations (llama/7777)
Signed-off-by: Ben Ashbaugh <ben.ashbaugh@intel.com>
2024-06-16 18:19:48 +03:00
760497e1ab CUDA: revise q8_1 data layout for mul_mat_q (llama/7824) 2024-06-16 18:19:48 +03:00
b172e7714c vulkan : reuse parent extra for views (llama/7806)
* vulkan : reuse parent extra for views

* Fix validation error when multiple compute contexts are used in a graph

---------

Co-authored-by: 0cc4m <picard12@live.de>
2024-06-16 18:19:48 +03:00
dc01aadb18 fix softmax r2r result wrong issue (llama/7811) 2024-06-16 18:19:48 +03:00
e08c62149b CUDA: refactor mmq, dmmv, mmvq (llama/7716)
* CUDA: refactor mmq, dmmv, mmvq

* fix out-of-bounds write

* struct for qk, qr, qi

* fix cmake build

* mmq_type_traits
2024-06-16 18:19:48 +03:00
abab4500fa ggml : refactor rope norm/neox (llama/7634)
* ggml : unify rope norm/neox (CPU)

* ggml : fix compile warning

* ggml : remove GLM rope mode

ggml-ci

* metal : better rope implementation

ggml-ci

* cuda : better rope implementation

ggml-ci

* naming : n_orig_ctx -> n_ctx_orig

ggml-ci

* dev : add reminders to update backends

ggml-ci

* vulkan : fix ggml_rope_ext() usage

* cuda : fix array size + indents

ggml-ci
2024-06-16 18:19:48 +03:00
e666315fa8 Allow number of nodes in CUDA graph to change (llama/7738)
Previously the code would have failed to cope in the case that the
number of nodes changes in an existing CUDA graph. This fixes the
issue by removing an unnecessary conditional.
2024-06-16 18:19:48 +03:00
3f869af14c ggml : remove OpenCL (llama/7735)
ggml-ci
2024-06-16 18:19:48 +03:00
cbacb7634c ggml : prevent builds with -ffinite-math-only (llama/7726)
This enforces a check that -fno-finite-math-only was set and that the operating
compiling mode is not in finite maths mode. This is because during rewriting of
silu and softmax for cpu #7154 there emerged an issue where the result that was
observed when >1 slot was nondeterministic as found by @JohannesGaessler.

@LostRuins narrowed the problem down to -ffinite-math-only which was theorised
to be due to SiLU, instead of flushing small values to 0, returns NaN or some
other garbage. @jart proposed a fix that @ggerganov then implemented in this fix

ref https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2145661825
2024-06-16 18:19:48 +03:00
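For reference, a minimal sketch (not the actual ggml source) of how such a build-time guard can work: GCC and Clang define `__FINITE_MATH_ONLY__` to 1 when `-ffinite-math-only` (or `-ffast-math`) is in effect, so the check can live entirely in the preprocessor.

```cpp
// Illustrative guard only, not the actual ggml check: refuse to build when the
// compiler is in finite-math-only mode, since SiLU/softmax rely on INF/NaN semantics.
#if defined(__FINITE_MATH_ONLY__) && __FINITE_MATH_ONLY__
#error "compile with -fno-finite-math-only (do not use -ffast-math)"
#endif
```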
6cc3b022ee llama : offload to RPC in addition to other backends (llama/7640)
* llama : offload to RPC in addition to other backends

* - fix copy_tensor being called on the src buffer instead of the dst buffer

- always initialize views in the view_src buffer

- add RPC backend to Makefile build

- add endpoint to all RPC object names

* add rpc-server to Makefile

* Update llama.cpp

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
2024-06-16 18:19:48 +03:00
e5e38d4920 ggml : use OpenMP as a thread pool (llama/7606)
* ggml: Added OpenMP for multi-threads processing

* ggml : Limit the number of threads used to avoid deadlock

* update shared state n_threads in parallel region

* clear numa affinity for main thread even with openmp

* enable openmp by default

* fix msvc build

* disable openmp on macos

* ci : disable openmp with thread sanitizer

* Update ggml.c

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-06-16 18:19:48 +03:00
2a6bab5655 Vulkan Mixture of Experts (MoE) support (llama/7628)
* Finish Vulkan mul_mat_id implementation

* Add Vulkan sum_rows and div ops

* Fix MUL_MAT_ID matrix matrix shader

* Fix MUL_MAT_ID matrix vector shader dispatch size

* Fix MUL_MAT_ID matrix vector shader and dispatch code

* Update Vulkan CPU offload for MUL_MAT_ID

* Fix crash when using split mode none and setting a main GPU
2024-06-16 18:19:48 +03:00
8c01c9b85c kompute : implement op_getrows_f32 (llama/6403)
op_getrows_f32 is required since https://github.com/ggerganov/llama.cpp/pull/6122
for the Vulkan w/ Kompute backend to be functional.

As such, implement this op to make this backend functional again.
2024-06-16 18:19:48 +03:00
d1123d795e fix bug introduced in using calloc (llama/7701)
compilade pointed this out on the previous MR
2024-06-16 18:19:48 +03:00
9b3d784020 Fix FlashAttention debug test, FP32 assert (llama/7684) 2024-06-16 18:19:48 +03:00
a16137d13d CUDA: fix Pascal FA, deq. KV to FP16 for batch > 8 (llama/7681) 2024-06-16 18:19:48 +03:00
5582039d0a CUDA: quantized KV support for FA vec (llama/7527)
* CUDA: quantized KV support for FA vec

* try CI fix

* fix commented-out kernel variants

* add q8_0 q4_0 tests

* fix nwarps > batch size

* split fattn compile via extern templates

* fix flake8

* fix metal tests

* fix cmake

* make generate_cu_files.py executable

* add autogenerated .cu files

* fix AMD

* error if type_v != FP16 and not flash_attn

* remove obsolete code
2024-06-16 18:19:48 +03:00
9a16c643e2 ggml : fix loongson compile warnings (llama/7537)
* ggml : fix loongson compile warnings

ggml-ci

* Fix loongarch quantize test fail.

Fix unexpected error introduced during rebase code.

* tests : disable json test due to lack of python on the CI node

ggml-ci

---------

Co-authored-by: junchao-loongson <zhaojunchao@loongson.cn>
2024-06-16 18:19:48 +03:00
10a8a23100 faster avx512 exp implementation (llama/7551)
* faster avx512 exp implementation

* x->r

* improve accuracy, handle special cases

* remove `e`
2024-06-16 18:19:48 +03:00
29cfeef77f ggml : fix loongarch build (O2 issue) (llama/7636) 2024-06-16 18:19:48 +03:00
e66e9ea25b metal : remove invalid asserts (llama/7617) 2024-06-16 18:19:48 +03:00
276779a849 metal : add missing asserts (llama/7617) 2024-06-16 18:19:48 +03:00
1f35ce61c1 ggml : fix YARN + add tests + add asserts (llama/7617)
* tests : add rope tests

ggml-ci

* ggml : fixes (hopefully)

ggml-ci

* tests : add non-cont tests

ggml-ci

* cuda : add asserts for rope/norm + fix DS2

ggml-ci

* ggml : assert contiguousness

* tests : reduce RoPE tests

ggml-ci
2024-06-16 18:19:48 +03:00
4b19cc3ed4 cuda : non-cont concat support (llama/7610)
* tests : add non-cont concat tests

* cuda : non-cont concat support

ggml-ci
2024-06-16 18:19:48 +03:00
a535d348dd llama-bench : add support for the RPC backend (llama/7435) 2024-06-16 18:19:48 +03:00
8f5dc729d9 ggml : use atomic_flag for critical section (llama/7598)
* ggml : use atomic_flag for critical section

* add windows shims
2024-06-16 18:19:48 +03:00
02fc147a0b examples : adapt to new ggml_concat (ggml/0) 2024-06-16 18:19:48 +03:00
109148ac84 ggml : fix typo in ggml.c (llama/7603) 2024-06-16 18:19:48 +03:00
3563473d2c Align GEMM dispatch (llama/7566)
* align GEMM dispatch
2024-06-16 18:19:48 +03:00
046834198d sycl : fix assert (llama/7563) 2024-06-16 18:19:48 +03:00
0a2ad9de06 vulkan: properly initialize vulkan devices for LLAMA_SPLIT_MODE_NONE (llama/7552) 2024-06-16 18:19:48 +03:00
39b0640b09 rpc : resource management rework (llama/7562)
* rpc : resource management rework

* address review comments
2024-06-16 18:19:48 +03:00
8dca71de64 fix ggml_sycl_mul_mat_id() to match the change of api (llama/7436)
* fix mul_mat_id to match the change of api

* rm comment

* rm unused or duplicated code, rename as review comment
2024-06-16 18:19:48 +03:00
812787cbc5 ggml : generalize GGML_OP_CONCAT (llama/7563)
* ggml : generalize GGML_OP_CONCAT (WIP)

ggml-ci

* tests : add dim != 2 tests

* metal : generalize concat kernel

* tests : naming

* cuda : generalize concat kernel

ggml-ci

* sycl : add warning and assert

* ggml : fix op params handling

* metal : bugfix kernel

ggml-ci

* ggml : reimplement CPU and Metal

* cuda : add asserts

ggml-ci

* ggml : fix ptrs

ggml-ci
2024-06-16 18:19:48 +03:00
68ef10805e update HIP_UMA #7399 (llama/7414)
* update HIP_UMA #7399

add use of hipMemAdviseSetCoarseGrain when LLAMA_HIP_UMA is enabled.
- get x2 on prompt eval and x1.5 on token gen with rocm6.0 on ryzen 7940HX iGPU (780M/gfx1103)

* simplify code, more consistent style

---------

Co-authored-by: slaren <slarengh@gmail.com>
2024-06-16 18:19:48 +03:00
96fdb90f5f Allow multiple copy function pointers for CUDA graph kernel param updates (llama/7565)
CUDA graphs require parameter updates to kernels associated with
GGML_OP_CPY nodes. Previously the implementation only checked for a
single CUDA kernel in such nodes, but this caused a bug in cases where
2 such kernels exist. This fixes the issue by using a vector to allow
multiple function pointers to be stored and checked against.

Fixes #7942
2024-06-16 18:19:48 +03:00
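A hedged sketch of the mechanism described above; the struct and helper names are illustrative, not the real ggml-cuda code. The idea is to record the destination pointer of every GGML_OP_CPY node and check captured kernel parameters against the whole set instead of a single saved pointer.

```cpp
#include <algorithm>
#include <vector>

// Illustrative names only: keep every copy-node destination pointer instead of one,
// then test captured kernel parameters against the whole set.
struct cuda_graph_state {
    std::vector<char *> cpy_dest_ptrs; // one entry per GGML_OP_CPY node in the graph
};

static bool param_is_cpy_dest(const cuda_graph_state & st, const char * p) {
    return std::find(st.cpy_dest_ptrs.begin(), st.cpy_dest_ptrs.end(), p) != st.cpy_dest_ptrs.end();
}
```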
e98f9ac554 Fix q_xxs using mul_mat_q (llama/7459) 2024-06-16 18:19:48 +03:00
02d481595b Add freq factors (llama/7495) 2024-06-16 18:19:48 +03:00
7091c7ab5a metal : add GGML_OP_REPEAT kernels (llama/7557)
ggml-ci
2024-06-16 18:19:48 +03:00
d70ccb75f5 metal : disable FA kernel for HS=256 (llama/7556)
ggml-ci
2024-06-16 18:19:48 +03:00
5ee048eb67 ggml : restore ggml_rope_xpos_inplace (ggml/0)
ggml-ci
2024-06-16 18:19:48 +03:00
37ed71c964 ggml: aarch64: SVE kernels for q8_0_q8_0, q4_0_q8_0 vector dot (llama/7433)
* Add SVE support for q4_0_q8_0 q8_0_q8_0

* remove ifdef
2024-06-16 18:19:48 +03:00
8cd7a3df37 ggml : silence UB sanitizer error during iq2_xxs quantization (llama/0) 2024-06-16 18:19:48 +03:00
04a3279320 ggml : remove ggml_flash_attn and ggml_flash_ff (llama/7463)
ggml-ci
2024-06-16 18:19:48 +03:00
45ddda8e0c ggml : drop support for QK_K=64 (llama/7473)
* ggml : drop support for QK_K=64

ggml-ci

* opencl : restore QK_K=256 define
2024-06-16 18:19:48 +03:00
c41317fd66 Update vulkan rope implementation to support frequency factors (llama/7475) 2024-06-16 18:19:48 +03:00
96b8419b27 CUDA: fix FA out-of-bounds reads (llama/7479) 2024-06-16 18:19:48 +03:00
3c63f4cf35 CUDA: fix FA out-of-bounds writes (llama/7465) 2024-06-16 18:19:48 +03:00
5848dfd9c8 cuda : fix compile warning (llama/7454) 2024-06-16 18:19:48 +03:00
29ab5d0326 CUDA: remove incorrect precision check (llama/7454) 2024-06-16 18:19:48 +03:00
c4d6958b3e cuda : fix rope + add tests (llama/7452)
* cuda : fix rope pos data

ggml-ci

* ggml : drop mode & 1 == 1 support for ggml_rope

ggml-ci

* ggml : support freq_factors for f16 rope (CPU)

ggml-ci

* tests : add rope tests using frequency factors

ggml-ci
2024-06-16 18:19:48 +03:00
c9dcb75118 llama : add phi3 128K model support (llama/7225)
* add phi3 128k support in convert-hf-to-gguf

* add phi3 128k support in cuda

* address build warnings on llama.cpp

* adjust index value in cuda long rope freq factors

* add long rope support in ggml cpu backend

* make freq factors only depend on ctx size

* remove unused rope scaling type 'su' from gguf converter

* fix lint warnings on convert-hf-to-gguf.py

* set to the short freq factor when context size is smaller than the trained context size

* add one line of comments

* metal : support rope freq_factors

* ggml : update ggml_rope_ext API to support freq. factors

* backends : add dev messages to support rope freq. factors

* minor : style

* tests : update to use new rope API

* backends : fix pragma semicolons

* minor : cleanup

* llama : move rope factors from KV header to tensors

* llama : remove tmp assert

* cuda : fix compile warning

* convert : read/write n_head_kv

* llama : fix uninitialized tensors

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-06-16 18:19:48 +03:00
bbdbc3fc62 metal : handle F16 inf values, fix FA partial offload (llama/7434)
ggml-ci
2024-06-16 18:19:48 +03:00
28c207a541 CUDA: fix unused warning in mmq.cu (llama/7442) 2024-06-16 18:19:48 +03:00
c23f830983 CUDA: deduplicate mmq code (llama/7397) 2024-06-16 18:19:48 +03:00
caeeb32b41 rpc : track allocated buffers (llama/7411)
* rpc : track allocated buffers

ref: #7407

* rpc : pack rpc_tensor tightly
2024-06-16 18:19:48 +03:00
584cc1177a Update SYCL upscale operation (llama/7321)
* Update SYCL upscale operation

* Formatting

* Remove messages
2024-06-16 18:19:48 +03:00
cc1ae10989 ggml-opencl, llama: using reserve() if count already known (llama/7272) 2024-06-16 18:19:48 +03:00
eb26f55b40 ggml : add loongarch lsx and lasx support (llama/6454)
* add loongarch lsx and lasx optimize code

* Add loongarch compilation support to makefile

* revert stb_image.h

* opt bytes_from_nibbles_32 and sum_i16_pairs_float

* fix undeclared

* format code

* update

* update 2

---------

Co-authored-by: Jinyang He <hejinyang@loongson.cn>
2024-06-16 18:19:48 +03:00
eb2b086584 Add provisions for windows support for BF16 code including CMake provision for enabling AVX512_BF16 (llama/7258) 2024-06-16 18:19:48 +03:00
67919cfe11 Vulkan Embedding Fix (llama/7360)
* Fix empty Vulkan host buffers

Add fp32 fp16 matmul shader

Fix matmul shader alignment

* Remove deprecated tensor->backend uses

* Fix Vulkan validation errors on embedding models with no offloaded layers

* Fix Vulkan llava segfault when not offloading layers
2024-06-16 18:19:48 +03:00
bf5fc81a8a ggml : fix another case of quants nans (llama/7387) 2024-06-16 18:19:48 +03:00
2b07dc3186 ggml: implement quantized KV cache for FA (llama/7372) 2024-06-16 18:19:48 +03:00
951c463d39 cuda : clear error after buffer allocation failure (llama/7376) 2024-06-16 18:19:48 +03:00
7f257b210f Capture CUDA logging output (llama/7298)
* logging: output capture in cuda module

* fix compile error

* fix: vsnprintf terminates with 0, string use not correct

* post review

* Update llama.cpp

Co-authored-by: slaren <slarengh@gmail.com>

* Update llama.cpp

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
2024-06-16 18:19:48 +03:00
705fe30a02 android : use "ci-android" branch for CI (llama/7341)
* android : use "ci-android" branch for CI

* ggml : disable SIMD exp and silu for 32-bit ARM

ggml-ci

* android : do not fetch, use add_subdirectory instead

* cmake : provide binary dir
2024-06-16 18:19:48 +03:00
45b5b95e29 CUDA: deduplicate FlashAttention code (llama/7352) 2024-06-16 18:19:48 +03:00
f2c47d1e6a cuda : add half2 __shfl_xor() for ROCm 5.5 (llama/7263) 2024-06-16 18:19:48 +03:00
b4bb9b9036 Update and fix Vulkan soft_max and argsort implementations (llama/7237)
* Update and fix Vulkan softmax implementation

* Update and fix Vulkan argsort implementation
2024-06-16 18:19:48 +03:00
2bc6483299 ggml : fix quants nans when all the group weights are very close to zero (llama/7313) 2024-06-16 18:19:48 +03:00
ec52f900e4 CUDA: faster large batch FA without tensor cores (llama/7314) 2024-06-16 18:19:48 +03:00
77d708fabb rpc : set SO_REUSEADDR for the server socket (llama/7320)
ref: #7293
2024-06-16 18:19:48 +03:00
c00149c861 ggml-quants, llama : removed excess checks (llama/7274) 2024-06-16 18:19:48 +03:00
574661f2e6 ggml : rewrite silu and softmax for cpu (llama/7154)
This change upstreams llamafile's vectorized expf() functions. This lets
us compute softmax and silu more accurately than the short[65536] lookup
table that GGML previously used to make this operation go faster. We can
support aarch64 and sse2+ with the worst case rounding error of 2ulp. It
makes make -j8 tests && ./tests/test-backend-ops -o SOFT_MAX -b CPU perf
go 1.5x faster for SSE2+FMA, 1.9x faster for AVX2+FMA and 2.1x on AVX512
2024-06-16 18:19:48 +03:00
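For context, a scalar reference of what these kernels compute, using `expf()` directly instead of a lookup table (illustrative only; the actual code uses hand-written vectorized exponentials):

```cpp
#include <cmath>
#include <cstddef>

// Scalar reference softmax; the real kernels vectorize the expf() calls.
static void softmax_ref(const float * x, float * y, size_t n) {
    float x_max = x[0];
    for (size_t i = 1; i < n; ++i) x_max = x[i] > x_max ? x[i] : x_max; // for numerical stability
    float sum = 0.0f;
    for (size_t i = 0; i < n; ++i) { y[i] = std::expf(x[i] - x_max); sum += y[i]; }
    for (size_t i = 0; i < n; ++i) y[i] /= sum;
}
```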
7bd69349bf rpc : add command line arg for specifying backend memory
ref: #7293
2024-06-16 18:19:48 +03:00
488ad99c13 Add support for properly optimized Windows ARM64 builds with LLVM and MSVC (llama/7191)
* logging: add proper checks for clang to avoid errors and warnings with VA_ARGS

* build: add CMake Presets and toolchain files for Windows ARM64

* matmul-int8: enable matmul-int8 with MSVC and fix Clang warnings

* ci: add support for optimized Windows ARM64 builds with MSVC and LLVM

* matmul-int8: fixed typos in q8_0_q8_0 matmuls

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* matmul-int8: remove unnecessary casts in q8_0_q8_0

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-06-16 18:19:48 +03:00
7178cceeaa ggml : use dynamic thread scheduling for matrix multiplication (llama/6915)
* Just reordering some structs.

* Adding in the calls to mm_pause

* Passing around the state

* Renaming and moving a bunch of variables around.

* Extracting the logic to its own function.

* Moving some variable definitions into the chunk function.

* Moving some variables around

* moving src1_cont inside

* Moving row_size

* adding the current_chunk

* Reorg the code.

* Formatting to match the orig patch

* starting to setup the chunking variables

* Starting the buildup of the loop

* The yield shouldn't be necessary.

* adding the looping structure based on the chunk configuration.

* Add in the re-chunking code.

* Making it much more likely to rechunk.

* disable resizing if numa is enabled.

* Updating comments with what we've learned.

* Fix formatting

* Couple more formatting fixes.

* More style fixes.

* Fix Warnings

* Going with unused because there's conditional logic that needs it.

* Update ggml.c

* Update ggml.c

---------
2024-06-16 18:19:48 +03:00
8d55ccdb8c Avoid unnecessarily disabling CUDA graphs (llama/7302)
As discussed in PR #6766, CUDA graphs were being disabled in the presence of long prompts.
This fixes the issue by preventing the consecutive-update counter from incrementing unnecessarily
for tokens in which cuda graphs are disabled due to batch size > 1.
2024-06-16 18:19:48 +03:00
37a72cb170 ggml : tag ggml_tensor::backend as deprecated (llama/7290) 2024-06-16 18:19:48 +03:00
bf9b69284f Add missing " (llama/7303) 2024-06-16 18:19:48 +03:00
c4de1e19df ggml : add ggml_upscale_ext (ggml/814)
* initial commit with CPU implementation of upscale to shape and test, cuda implementation next

* experimental commit to see if dst shape is correct

* test version

* test

* removed unnecessary params

* refactor

* fixed tests

* ggml : metal impl + cleanup + sycl dev warnings

* patched ggml_upscale cuda op to handle non-contiguous tensors, added test for non-contiguous behavior

* metal : fix upscale op to support nb00 + style

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-06-16 18:19:48 +03:00
5b7073cae1 scripts : update sync 2024-06-16 12:41:42 +03:00
b29b3b2924 whisper : use ggml-cuda in mel calc, set appropriate device (#2236)
* whisper : use ggml-cuda in mel calc, set appropriate device

* whisper : forbid cuda mel calc on devices with compute < 600, workaround for #2230
2024-06-13 13:16:07 +03:00
420b6abc54 cuda : fix HIPBLAS build (#2234) 2024-06-11 19:14:38 +03:00
99804b0f3e cuda : fix bounds check for src0 rows in MMVQ kernel (#2231)
* cuda : fix bounds check for src0 rows in MMVQ kernel

* Update ggml-cuda/mmvq.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
2024-06-11 17:39:01 +03:00
c55964c956 ci : fix CUDA builds (#2232) 2024-06-11 17:21:30 +03:00
20c542c713 whisper : auto-grow working areas for mel_calc_cuda (#2227)
* whisper : auto-grow working areas for mel_calc_cuda, fixes #2226

* whisper : only calculate mel spectrogram on GPU if audio is <= 5 min
2024-06-10 21:51:32 +03:00
c2bdb960cd whisper : free whisper_mel instances (#2220) 2024-06-10 11:00:15 +03:00
87acd6d629 whisper : whisper_state/backend fixes (#2217)
* whisper : fixes

* ci : WHISPER_CUBLAS -> WHISPER_CUDA
2024-06-06 18:51:36 +03:00
f842d31171 whisper : calculate mel spectrogram directly into a ggml_tensor (#2208)
* whisper : calculate mel spectrogram directly into a ggml_tensor

* whisper : remove unused temp buffer from state

* whisper : fix not initializing wstate.embd_enc
2024-06-06 16:20:46 +03:00
ffef323c4c whisper : add CUDA-specific computation mel spectrograms (#2206)
* whisper : use polymorphic class to calculate mel spectrogram

* whisper : add cuda-specific mel spectrogram calculation

* whisper : conditionally compile cufftGetErrorString to avoid warnings

* build : add new files to makefile

* ruby : add new files to conf script

* build : fix typo in makefile

* whisper : suppress cub warning for deprecated C++ std in whisper-mel-cuda
2024-06-04 09:32:23 +03:00
af5833e298 whisper : remove speed_up and phase_vocoder* functions (#2198)
* whisper : fix cast warning

* whisper : remove phase_vocoder functions, ref #2195

* whisper : remove speed_up from whisper_full_params, closes #2195
2024-05-31 11:37:29 +03:00
b87494bb8f readme : add conan badge (#2196)
* Add conan badge

* Fix markdown formatting
2024-05-30 15:43:28 +03:00
ad130431aa readme : add install instructions for Conan (#2189) 2024-05-30 15:06:15 +03:00
e130b66642 whisper: use global cache for sin/cos vals and Hann window (#2194)
- also rename Hanning to Hann, as it's named after Julius von Hann (per Wikipedia)
2024-05-29 19:09:21 +03:00
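A small illustrative sketch of the caching idea, assuming a periodic Hann window w[n] = 0.5 * (1 - cos(2*pi*n/N)); the function name and caching policy are hypothetical, not the actual whisper.cpp code.

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// Hypothetical helper: compute the (periodic) Hann window once and reuse it.
static const std::vector<float> & hann_window(size_t N) {
    static std::vector<float> w; // global cache, filled on first use
    if (w.size() != N) {
        const float pi = 3.14159265358979f;
        w.resize(N);
        for (size_t n = 0; n < N; ++n) {
            w[n] = 0.5f * (1.0f - std::cos(2.0f * pi * (float) n / (float) N));
        }
    }
    return w;
}
```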
c7b6988678 release : v1.6.2 2024-05-27 10:35:09 +03:00
05042a782d Revert "whisper : remove extra backend instance (huh?)" (#2182)
This reverts commit 4caa64b73e.
2024-05-27 10:20:25 +03:00
a7dc2aab16 server : fix typo (#2181)
A simple comment typo, PR can be dismissed
2024-05-25 10:46:22 +03:00
22d46b7ba4 ruby : update bindings (#2154)
* update library files

* update whispercpp

* not needed for gem
2024-05-22 23:02:52 +03:00
c10db6ea28 release : v1.6.1 2024-05-21 18:44:37 +03:00
1b51fdf170 examples : add support for decoding input with ffmpeg (Linux) (#2133)
- search for ffmpeg libs/headers at cmake time
- added ffmpeg-transcode.cpp into libcommon if ffmpeg on
- hooked ffmpeg transcoding in common read_wav(...)
- passed test:
./main -m ggml-base.en.bin -f samples/jfk.mp3
2024-05-21 18:31:41 +03:00
adee3f9c1f node : add flash_attn param (#2170) 2024-05-20 09:08:48 +03:00
4798be1f9a ci: Update build.yml to suppress warnings about node.js versions (#2166)
* Update actions to suppress warnings about old node.js

https://github.blog/changelog/2023-09-22-github-actions-transitioning-from-node-16-to-node-20/

* Update actions/upload-artifact, specify android cmdline-tools-version

* Use java 20

gradle 8.1 complains against 21
https://docs.gradle.org/current/userguide/compatibility.html
2024-05-19 11:49:26 +03:00
08981d1bac release : v1.6.0 2024-05-15 09:59:48 +03:00
7094ea5e75 whisper : use flash attention (#2152)
* whisper : use flash attention in the encoder

* whisper : add kv_pad

* whisper : remove extra backend instance (huh?)

* whisper : use FA for cross-attention

* whisper : use FA for self-attention

* whisper : simplify encoder FA

* whisper : add flash_attn runtime parameter

* scripts : add bench log

* scripts : add M1 Pro bench log
2024-05-15 09:38:19 +03:00
9d5771ae43 talk-llama : reject runs without required arguments (#2153)
* Extended talk-llama example to reject runs without required arguments.

Print warning and exit if models are not specified on the command line.

* Update examples/talk-llama/talk-llama.cpp

* Update examples/talk-llama/talk-llama.cpp

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-05-14 21:32:41 +03:00
f56b8305c4 sync : ggml 2024-05-14 19:16:32 +03:00
1056ad762c metal : support FA without mask + add asserts (llama/7278)
* ggml : fa without mask + add asserts

ggml-ci

* metal : support non-contiguous KV

ggml-ci
2024-05-14 19:16:29 +03:00
c451080c8b ggml : add RPC backend (llama/6829)
* ggml : add RPC backend

The RPC backend proxies all operations to a remote server which runs a
regular backend (CPU, CUDA, Metal, etc).

* set TCP_NODELAY

* add CI workflows

* Address review comments

* fix warning

* implement llama_max_devices() for RPC

* Address review comments

* Address review comments

* wrap sockfd into a struct

* implement get_alignment and get_max_size

* add get_device_memory

* fix warning

* win32 support

* add README

* readme : trim trailing whitespace

* Address review comments

* win32 fix

* Address review comments

* fix compile warnings on macos
2024-05-14 19:16:29 +03:00
8e7c22fbdb rm wait() (llama/7233) 2024-05-14 19:16:29 +03:00
e57e95eb0d CUDA: add FP32 FlashAttention vector kernel (llama/7188)
* CUDA: add FP32 FlashAttention vector kernel

* fixup! CUDA: add FP32 FlashAttention vector kernel

* fixup! fixup! CUDA: add FP32 FlashAttention vector kernel

* fixup! fixup! fixup! CUDA: add FP32 FlashAttention vector kernel
2024-05-14 19:16:29 +03:00
130f43e4b8 scripts : sync ggml-rpc 2024-05-14 19:15:35 +03:00
d8356a1cc2 whisper : fix model path encoding in windows (#2086)
* fix: model path encoding in windows

* fix: convert model path to wide string only for MSVC compiler
2024-05-14 09:43:41 +03:00
4ef8d9f44e server : return utf-8 (#2138) 2024-05-13 15:33:46 +03:00
3928dbd206 node : add audio_ctx and audio buffer params (#2123)
* node : add audio_ctx param

* node : support passing audio buffer directly

* node : parse audio_ctx in index.js

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-05-13 15:22:23 +03:00
2ced6f0742 cmake : fix HIP/ROCm build (#2102) 2024-05-13 15:18:43 +03:00
30f73109b8 node : add additional params (#2000)
* Add additional params to addon.node

* Add comma_in_time as parameter

* Fix tests
2024-05-13 15:15:43 +03:00
17fa62d3d3 js : remove un-needed request header from fetchRemote (#2119) 2024-05-13 15:13:19 +03:00
1da5edcde0 cmake : fix metal embed sources path (#2110) 2024-05-13 15:09:59 +03:00
0bb05b113d main : dont print timings with --no-prints (#2108)
Signed-off-by: Daniel Ziegenberg <daniel@ziegenberg.at>
2024-05-13 15:00:19 +03:00
f141b2b938 main : add options for temperature control (#2088)
Add two options:

```
-tp,       --temperature N     [0.00   ] The sampling temperature, between 0 and 1
-tpi,      --temperature-inc N [0.20   ] The increment of temperature, between 0 and 1
```

The sampling temperature, between 0 and 1. Higher values like 0.8 will
make the output more random, while lower values like 0.2 will make it
more focused and deterministic. If set to 0, the model will use log
probability to automatically increase the temperature until certain
thresholds are hit.

Signed-off-by: Daniel Ziegenberg <daniel@ziegenberg.at>
2024-05-13 14:59:44 +03:00
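A hedged sketch of the fallback loop the two flags describe (helper names are hypothetical, not the actual whisper.cpp decoder): start at `--temperature` and raise it by `--temperature-inc` whenever a decode attempt fails its quality checks.

```cpp
#include <functional>

// Hypothetical helper names; only the fallback loop itself is sketched here.
static float decode_with_fallback(float temperature, float temperature_inc,
                                  const std::function<bool(float)> & try_decode) {
    float t = temperature;
    while (t <= 1.0f) {
        if (try_decode(t)) {               // true when the decode passes its quality checks
            return t;                      // succeeded at this temperature
        }
        if (temperature_inc <= 0.0f) break; // nothing left to try
        t += temperature_inc;
    }
    return t;
}
```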
2b434c449e whisper : switch back to F32 mask (#0) 2024-05-13 14:43:43 +03:00
e93081f83f whisper.android : update example, add field to print timestamp (#2072) 2024-05-13 14:30:03 +03:00
b6bbce4ae9 cmake : fix json INTERFACE library (#2069) 2024-05-13 14:29:39 +03:00
7705dc52da main : fix double quote escaping in csv output (#2090) 2024-05-13 11:55:32 +03:00
e6acaf9d91 metal : tune soft_max number of threads (#0) 2024-05-13 11:02:26 +03:00
2c81e6fd51 whisper : remove old flash attn code (#0) 2024-05-13 11:02:26 +03:00
9506267ce5 ggml : try fix ppc64 (#0) 2024-05-13 11:02:26 +03:00
fbeb80b5f0 ggml : remove obsolete alibi code (skipme) (#0) 2024-05-13 11:02:26 +03:00
3fa7d29876 talk-llama : sync llama.cpp 2024-05-13 11:02:26 +03:00
fe179ae0cc sync : ggml 2024-05-13 11:02:26 +03:00
40aeeeecc4 ggml : optimize for ppc64le using VSX intrinsics (ggml/784)
* optimize for ppc64le using VSX intrinsics

* 1. code clean up by removing comments about overflow concern.

2. fix typo in suffix of scaling.

* Continue to fix typo in suffix of scaling for QK_K <> 256

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-05-13 11:02:26 +03:00
5a863fbe18 metal : fix indent (ggml/0) 2024-05-13 11:02:26 +03:00
91c646c61d ggml : restore sigmoid decl order (ggml/0) 2024-05-13 11:02:26 +03:00
accada542a ggml : resolve merge (ggml/0)
ggml-ci
2024-05-13 11:02:26 +03:00
e54329da7b ggml : full ALiBi support (llama/7192)
* ggml : full ALiBi support

* ggml : update ggml_soft_max_ext() CUDA, SYCL

* ggml : ggml_flash_attn_ext() support ALiBi (CPU)

* ggml : ggml_flash_attn_ext() support ALiBi (Metal)

* ggml : fix warning

* ggml : ggml_flash_attn_ext() support ALiBi (CUDA)

ggml-ci

* ggml : fix assert message

* vulkan : add dev notes

* ggml : require mask when using ALiBi

ggml-ci

* convert : fix convert for refact models
2024-05-13 11:02:26 +03:00
284fac39fb metal : fix flash attention kernel requirements (llama/7169)
* metal : fix flash attention kernel requirements

ggml-ci

* metal : fix ggml_metal_supports_op

ggml-ci
2024-05-13 11:02:26 +03:00
fe454b8d9e Minor arithmetic improvement to mmvq wrapper kernel (llama/7172) 2024-05-13 11:02:26 +03:00
c114b75aee Vulkan Bugfixes and Improvements (llama/7084)
* Modify mat mat mul shader for mul_mat_id, modify mat vec mul shaders for single call batch operation

* Further work towards MoE, disabled for now

* Disable MoE code (not ready yet), fix a number of bugs in shaders and Vulkan code

* Add softmax with f16 mask and pos buffer support

* Disable mul_mat_id shaders for now

* Fix flake8

* Fix validation errors caused by empty buffers on larger batch sizes
2024-05-13 11:02:26 +03:00
4be936b88b CUDA: generalize FP16 fattn vec kernel (llama/7061)
* CUDA: generalize FP16 fattn vec kernel

* disable unsupported head sizes for AMD in test

* try AMD fix

* fix batch size 2-8

* partially revert changes
2024-05-13 11:02:26 +03:00
26c550f772 opencl : alignment size converted from bits to bytes (llama/7090)
* opencl alignment size should be converted from bits to bytes

Reference: https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_API.html#CL_DEVICE_MEM_BASE_ADDR_ALIGN

> Alignment requirement (in bits) for sub-buffer offsets.

* Update ggml-opencl.cpp for readability using division instead of shift

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

---------

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
2024-05-13 11:02:26 +03:00
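A minimal sketch of the fix described above using the standard OpenCL API: `CL_DEVICE_MEM_BASE_ADDR_ALIGN` is reported in bits, so the value must be divided by 8 before being used as a byte alignment (error handling omitted).

```cpp
#include <CL/cl.h>

// Minimal sketch: CL_DEVICE_MEM_BASE_ADDR_ALIGN is specified in bits, not bytes.
static cl_uint device_mem_alignment_bytes(cl_device_id dev) {
    cl_uint align_bits = 0;
    clGetDeviceInfo(dev, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(align_bits), &align_bits, NULL);
    return align_bits / 8; // convert bits -> bytes
}
```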
24f0aa460b Introduction of CUDA Graphs to LLama.cpp (llama/6766)
* DRAFT: Introduction of CUDA Graphs to LLama.cpp

* FIx issues raised in comments

* Tidied to now only use CUDA runtime (not mixed with driver calls)

* disable for multi-gpu and batch size > 1

* Disable CUDA graphs for old GPU arch and with env var

* added missing CUDA_CHECKs

* Addressed comments

* further addressed comments

* limit to GGML_ALLOW_CUDA_GRAPHS defined in llama.cpp cmake

* Added more comprehensive graph node checking

* With mechanism to fall back if graph capture fails

* Revert "With mechanism to fall back if graph capture fails"

This reverts commit eb9f15fb6fcb81384f732c4601a5b25c016a5143.

* Fall back if graph capture fails and address other comments

* - renamed GGML_ALLOW_CUDA_GRAPHS to GGML_CUDA_USE_GRAPHS

- rename env variable to disable CUDA graphs to GGML_CUDA_DISABLE_GRAPHS

- updated Makefile build to enable CUDA graphs

- removed graph capture failure checking in ggml_cuda_error
  using a global variable to track this is not thread safe, but I am also not satisfied with checking an error by string
  if this is necessary to work around some issues with graph capture with e.g. cuBLAS, we can pass the ggml_backend_cuda_context to the error checking macro and store the result in the context

- fixed several resource leaks

- fixed issue with zero node graphs

- changed fixed size arrays to vectors

- removed the count of number of evaluations before start capturing, and instead changed the capture mode to relaxed

- removed the check for multiple devices so that it is still possible to use a single device, instead checks for split buffers to disable cuda graphs with -sm row

- changed the op for checking batch size to GGML_OP_ADD, should be more reliable than GGML_OP_SOFT_MAX

- code style fixes

- things to look into
  - VRAM usage of the cudaGraphExec_t, if it is significant we may need to make it optional
  - possibility of using cudaStreamBeginCaptureToGraph to keep track of which ggml graph nodes correspond to which cuda graph nodes

* fix build without cuda graphs

* remove outdated comment

* replace minimum cc value with a constant

---------

Co-authored-by: slaren <slarengh@gmail.com>
2024-05-13 11:02:26 +03:00
69efc39d5c metal : use vm_allocate instead of posix_memalign on macOS (llama/7078)
* fix: use `malloc` instead of `posix_memalign` in `ggml-metal.m` to make it not crash Electron processes

* fix: typo

* fix: use `vm_allocate` instead of `posix_memalign`

* fix: don't call `newBufferWithBytesNoCopy` with `NULL` when `ggml_metal_host_malloc` returns `NULL`

* fix: use `vm_allocate` only on macOS
2024-05-13 11:02:26 +03:00
a2ad810118 ggml : introduce bfloat16 support (llama/6412)
* Introduce bfloat16 support

Many models on Hugging Face (e.g. Mistral, TinyLLaMA) use bfloat16 as
their canonical floating point format.

      ┌sign
      │
      │   ┌exponent
      │   │
      │   │      ┌mantissa
      │   │      │
      │┌──┴───┐┌─┴───┐
    0b0000000000000000 brain16

This encoding has the same number of exponent bits as float32. That
makes conversion relatively straightforward, even in the absence of
hardware support. For example, converting brain16 to binary32 means
simply shifting 16 bits to the left.

      ┌sign
      │
      │   ┌exponent
      │   │
      │   │      ┌mantissa
      │   │      │
      │┌──┴───┐┌─┴───────────────────┐
    0b00000000000000000000000000000000 IEEE binary32

The issue is that converting bf16 to fp16 can result in information
loss. Only 13% of bf16 numbers can be precisely represented in fp16,
which in practice ends up covering 99.71% of Mistral 7b v0.2's weights;
however, there is currently no way other than fp32 to represent the others.

      ┌sign
      │
      │  ┌exponent
      │  │
      │  │    ┌mantissa
      │  │    │
      │┌─┴─┐┌─┴──────┐
    0b0000000000000000 IEEE binary16

This change fixes that, by adding a bf16 data type to GGML. Support
for CPU inference has been implemented along with optimizations for
the AVX2, AVX512, and AVX512BF16 ISAs. Perplexity on Mistral 7b 0.2
improves somewhere around -0.0024 to -0.0046 compared to using fp16

* Remove GGML code that's not needed

* Minimize the GGML API surface area for BF16

* Remove bf16 luts

* Make the GGML header look nicer

* Fix documentation

* Apply ggerganov's fixes for test-backend-ops

* Add BF16 code for new ggml_validate_row_data() function
2024-05-13 11:02:26 +03:00
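Since bf16 shares float32's sign and exponent layout, the conversion mentioned above ("simply shifting 16 bits to the left") can be illustrated with a short, self-contained sketch:

```cpp
#include <cstdint>
#include <cstring>

// bf16 -> f32: place the 16 bf16 bits in the upper half of a 32-bit float.
static float bf16_to_f32(uint16_t h) {
    uint32_t bits = (uint32_t) h << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}
```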
1ae1a9cd56 metal : fix unused warning 2024-05-13 11:02:26 +03:00
b5521fea19 Add an option to build without CUDA VMM (llama/7067)
Add an option to build ggml cuda without CUDA VMM
resolves
https://github.com/ggerganov/llama.cpp/issues/6889
https://forums.developer.nvidia.com/t/potential-nvshmem-allocated-memory-performance-issue/275416/4
2024-05-13 11:02:26 +03:00
9b84195225 gguf-split: add --no-tensor-first-split (llama/7072) 2024-05-13 11:02:26 +03:00
11c1df0436 CUDA: CUDART < 11.7 workaround for __hmax, __hmax2 (llama/7019) 2024-05-13 11:02:26 +03:00
c754494fdd switch to using localizedDescription (llama/7010) 2024-05-13 11:02:26 +03:00
1bce67999d metal : remove deprecated error code (llama/7008) 2024-05-13 11:02:26 +03:00
6c39ea46b6 metal : log more info on error (llama/6987) 2024-05-13 11:02:26 +03:00
156a33a990 ggml : add Flash Attention (llama/5021)
* ggml : add ggml_flash_attn_ext API

* ggml : fix GQA support in ggml_flash_attn_ext

* ggml : online attention (CPU)

* metal : initial implementation

* metal : f16 precision

* metal : reduce branches

* metal : specialize for head size

* wip : 8 rows per simd group

* wip : 4 rows per simd group

* wip : template for rows per warp

* metal : parallelize across KV size

* metal : parallel reduce across heads

* metal : efficient flash_attn_f16 implementation

* metal : avoid redundant loads of the attention

* metal : scale and mask in matrix form

* metal : fix comment

* llama : avoid ggml_cast, use F32 query

* metal : add parallel reduce version (disabled)

* metal : move output into local memory + optimize

- the result from each simdgroup now stays in the registers
- significantly reduced SRAM usage
- more efficient skipping of -INF blocks
- avoid simdgroup barrier in hot loop
- add comments

* metal : add tests, fix scaling, support C > 32

* metal : improve precision

* ggml : fix f16 mad

* metal : minor

* metal : support Q > 8

* tests : add ATTN tests

* metal : disable buffer allocation logs

* tests : more

* metal : faster inner loop for C == 32

* metal : fix array initialization

* tests : ifdef

* ggml : switch to padded F16 mask for ggml_soft_max, ggml_flash_attn_ext

* ggml : fix ggml_soft_max mask requirement

* cuda : fix soft_max to use correct mask size

* cuda : add flash_attn kernel (wip)

* metal : optimize softmax for C > 32

* metal : optimize softmax

* tests : minor fix

* cuda : avoid zeroing fragments

* tests : update dims

* cuda : fix __hisinf() result check

* cuda : avoid warp_reduce for smax

* cuda : use int instead of int64_t

Noticeably improves performance (thanks to Johannes)

* cuda : make loops use the same loop values

Thanks Johannes again for the tip

* cuda : unroll some of the loops

* cuda : avoid __hisinf branches

* cuda : use half2 in softmax

* cuda : switch to 1 warp for bs > 16

* cuda : speed-up reduce part of the kernel

* cuda : unroll Q*K^T loop

* cuda : fix -INF block check

* cuda : simplify softmax

* cuda : fix matrix names

* cuda : minor

* llama : adapt to F16 KQ_pos

* llama : adapt new models to F16 KQ_mask

* ggml : fix F16 store (ARM NEON)

* llama : fix type of KQ_mask and KQ_pos

* ggml : fix CPU soft_max

* tests : add hs=256

* cuda : fix build

* metal : improve perf via smaller int registers

* cuda : adapt soft_max to F16 mask and pos

* CUDA: faster FlashAttention, kernel for bs == 1

* 16 cols for Phi-2

* no vec for hs, no hs==256 ncols==32 for Volta

* adjust kernel selection logic

* 4 warps, 256 stride for all D

* no ncols == 64

* Multiple parallel blocks for batch size 1

* fix compile warnings

* fix excessive KQ_b loads

* fix cmake build

* fix KV cache padding, NaN from INFINITY (llama/6438)

* llama : flash_attn cparam + fix defrag

* server: support flash_attn param

* server: bench: enable flash_attn param

* CUDA: refactor host code, dyn. par. blocks

* fix flash_attn_vec_f16 race condition

* flush softmax exp below threshold to 0

* store temp KQ in registers

* Calculate KQ as FP32 if KQV has GGML_PREC_F32

* Add __hgt2_mask implementation for CUDA 11

* fix KQ FP32 precision fpr parallel_blocks > 1

* llama-bench : add -fa,--flash-attn arg

* metal : add BS=1 kernel for flash attention (llama/6508)

* metal : add BS=1 kernel for flash attention (wip)

* metal : support more than 1 warps

* metal : opts

* metal : opt

* metal : switch to parallel reduce

* metal : reduce registers

* metal : simplify

* metal : initial FA vec kernel

* metal : use F32 attention accumulators

* batched-bench : add fattn arg

* llama : simplify llama_build_kv_store

ggml-ci

* llama : adapt build_olmo to changes

* ggml : fix arm fp16 store on windows

* metal : clean-up

* metal : clean-up kernel code

* metal : minor

* tests : remove benchmarks

ggml-ci

* ggml : fix avx512 const correctness

ggml-ci

* ggml : fix soft_max with bias on CPU

ggml-ci

* common : print --flash-attn in help

* ggml : fix num dimensions in ggml_flash_attn_ext

* llama : force disable flash attention for incompatible models

* ggml : ggml_soft_max support F16/F32 mask/pos

ggml-ci

* cuda : uint -> uint32_t

* cuda : "constexpr dim3" -> "const dim3"

ggml-ci

* cuda : try to fix __hgt2_mask

ggml-ci

* ggml : add TODO's for F16/F32 mask/pos support in other backends

* llama : replace bool need_kq_pos with use_alibi

* llama : prep ALiBi support for BERT models

ggml-ci

* llama : fix n_batch requirements

ggml-ci

* cont

* server : add help for --flash-attn arg

* llama : disable FA for AMD

* tests : remove TMP_ATTN_BENCH

ggml-ci

* llama : support save/load state with FA enabled

ggml-ci

* ci : add CUDA save-load-state tests

ggml-ci

* llama : llama_kv_cache_clear zeroes data + fix save-load seq

ggml-ci

* llama : fix copy-paste errors, add TODO

* llama : disallow incompatible states

* llama : update llama_state_get_size after v_trans field

* metal : remove tmp log

* llama : add static reminder for llama_state_get_size

* metal : fix max nsg

ggml-ci

* ci : fix arg order

ggml-ci

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
Co-authored-by: Pierrick HYMBERT <pierrick.hymbert@gmail.com>
2024-05-13 11:02:26 +03:00
5167ebdfca ggml : fix __MSC_VER -> _MSC_VER (llama/6977)
ggml-ci
2024-05-13 11:02:26 +03:00
b574646d75 Fix more int overflow during quant (PPL/CUDA). (llama/6563)
* Fix more int overflow during quant.

* Fix some more int overflow in softmax.

* Revert back to int64_t.
2024-05-13 11:02:26 +03:00
388c3462a6 gguf : enforce that tensor names are unique (llama/6905)
* do not allow adding a duplicated tensor name

* no duplicated tensor while reading gguf

* typo

* throw exception inside llama_model_loader

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
2024-05-13 11:02:26 +03:00
9ad202bee9 add device version in device list (llama/6959)
Co-authored-by: arthw <>
2024-05-13 11:02:26 +03:00
f0d3fb4a7e Reset schedule earlier to allow overlap with ggml graph computation on device (llama/6933)
* Reset schedule earlier to allow overlap with graph computation on device
2024-05-13 11:02:26 +03:00
9d4c8b8aa5 add basic tensor data validation function (llama/6884)
* add basic tensor data validation function

* add --check-tensors command line argument

tensor validation is disabled by default and can be enabled by adding
`--check-tensors` to the command line arguments.

quantize always validates tensors.
2024-05-13 11:02:26 +03:00
ecfac1e240 gguf : fix mismatch between alloc and free functions (llama/6929) 2024-05-13 11:02:26 +03:00
6f7140f568 Merge pull request from GHSA-p5mv-gjc5-mwqv
* always use calloc

clamp n_kv on failure to read a kv

* ggml : alternative ctx->header.n_kv update

---------

Co-authored-by: slaren <slarengh@gmail.com>
2024-05-13 11:02:26 +03:00
05b17112cf ggml : fix redefinition of vaddvq_f32 for 32-bit ARM (llama/6906) 2024-05-13 11:02:26 +03:00
a15fb5cd79 ggml : fix MIN / MAX macros (llama/6904)
ggml-ci
2024-05-13 11:02:26 +03:00
63fd148d8f ggml : move 32-bit arm compat in ggml-impl.h (llama/6865)
ggml-ci
2024-05-13 11:02:26 +03:00
6c3971b29b llamafile : improve sgemm.cpp (llama/6796)
* llamafile : improve sgemm.cpp

- Re-enable by default
- Fix issue described in #6716
- Make code more abstract, elegant, and maintainable
- Faster handling of weirdly shaped `m` and `n` edge cases

* Address review comments

* Help clang produce fma instructions

* Address review comments
2024-05-13 11:02:26 +03:00
a6d264f331 ggml : fix calloc argument ordering. (llama/6820)
Latest gcc complains here:
/home/airlied/devel/llama.cpp/ggml-alloc.c: In function ‘ggml_gallocr_new_n’:
/home/airlied/devel/llama.cpp/ggml-alloc.c:374:59: warning: ‘calloc’ sizes specified with ‘sizeof’ in the earlier argument and not in the later argument [-Wcalloc-transposed-args]
  374 |     ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
      |                                                           ^~~~~~
/home/airlied/devel/llama.cpp/ggml-alloc.c:374:59: note: earlier argument should specify number of elements, later size of each element

and a bunch more.

calloc is specified to take nmemb first then size, so realign the code.

In a couple of places there was a * x, 1 so I fixed those to use calloc properly.
2024-05-13 11:02:26 +03:00
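A minimal before/after sketch of the pattern the warning flags, with an illustrative struct: `calloc(nmemb, size)` takes the element count first, then the element size.

```cpp
#include <cstdlib>

struct item { int a, b; }; // illustrative type

void calloc_order_example(void) {
    // warned by gcc: calloc(sizeof(item), 1) -- the size was passed as the count
    item * p = (item *) std::calloc(1, sizeof(item)); // count first, element size second
    std::free(p);
}
```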
2959686019 ggml : fix ggml_backend_cpu_supports_op() for CPY (llama/0) 2024-05-13 11:02:26 +03:00
c96b0a938e ggml : group all experts in a single ggml_mul_mat_id (llama/6505)
* ggml : group all experts in a single ggml_mul_mat_id
cuda : improve mmid row copy

* cuda : fix bin bcast with non-cont src0

* test-backend-ops : only run all mul mat tests for base types

* llama : disable moe offloading with SYCL

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-05-13 11:02:26 +03:00
c97796aa0f ggml : fix llamafile sgemm wdata offsets (llama/6710)
ggml-ci
2024-05-13 11:02:26 +03:00
7a4f7d825e ggml : add llamafile sgemm (llama/6414)
This change upstreams llamafile's cpu matrix multiplication kernels
which improve image and prompt evaluation speed. For starters, Q4_0
and Q8_0 weights should go ~40% faster on CPU. The biggest benefits
are with data types like f16 / f32, which process prompts 2x faster
thus making them faster than quantized data types for prompt evals.

This change also introduces bona fide AVX512 support since tinyBLAS
is able to exploit the larger register file. For example, on my CPU
llama.cpp llava-cli processes an image prompt at 305 tokens/second,
using the Q4_K and Q4_0 types, which has always been faster than if
we used f16 LLaVA weights, which at HEAD go 188 tokens/second. With
this change, f16 LLaVA performance leap frogs to 464 tokens/second.

On Intel Core i9-14900K this change improves F16 prompt perf by 5x.
For example, using llama.cpp at HEAD with Mistral 7b f16 to process
a 215 token prompt will go 13 tok/sec. This change has fixes making
it go 52 tok/sec. It's mostly thanks to my vectorized outer product
kernels but also because I added support for correctly counting the
number of cores on Alderlake, so the default thread count discounts
Intel's new efficiency cores. Only Linux right now can count cores.

This work was sponsored by Mozilla who's given permission to change
the license of this code from Apache 2.0 to MIT. To read more about
what's improved, and how it works, see: https://justine.lol/matmul/
2024-05-13 11:02:26 +03:00
fdb2c87350 llama : add qwen2moe (llama/6074)
* support qwen2moe

* fix-review

* metal : support unary ops for nelements % 4 != 0

* metal : require contiguousness for float4 unary kernels

* metal : require contiguousness for float4 unary kernels (cont)

* fix-review

* names : for brevity "SHARED_EXP" -> "SHEXP"

* llama : reuse build_moe_ffn()

* llama : add model type name

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-05-13 11:02:26 +03:00
98c0b77e0c fix mul_mat_id() for new input, make the ut pass (llama/6682) 2024-05-13 11:02:26 +03:00
9d6d50d933 Added support for GGML_OP_CLAMP in Metal (llama/6662)
* Added support for GGML_OP_CLAMP in Metal

* Corrected size

---------

Co-authored-by: dave-fl <dave@Davids-MacBook-Pro.local>
2024-05-13 11:02:26 +03:00
c1320c1f0c fix memcpy() crash, add missed cmd in guide, fix softmax (llama/6622)
* disable mmap to fix memcpy crash, add missed cmd in guide, fix softmax

* refactor to disable mmap for SYCL backend

* fix compile error in other os

* refactor the solution, use host buf to fix it, instead of disable mmap

* keep to support mmap()

* use host buff to reduce malloc times

* revert to malloc/free solution, for threaad safe
2024-05-13 11:02:26 +03:00
66aaf03a7a CUDA: fix matrix multiplication logic for tests (llama/6667) 2024-05-13 11:02:26 +03:00
00a0947c65 metal : unify mul_mv_id kernels (llama/6556) 2024-05-13 11:02:26 +03:00
60f3713026 llama : add gguf_remove_key + remove split meta during quantize (llama/6591)
* Remove split metadata when quantize model shards

* Find metadata key by enum

* Correct loop range for gguf_remove_key and code format

* Free kv memory

---------

Co-authored-by: z5269887 <z5269887@unsw.edu.au>
2024-05-13 11:02:26 +03:00
37e6757453 feat: implemented sigmoid function (ggml/806)
* added sigmoid function

* implemented metal kernel for sigmoid

* implemented cuda kernel for sigmoid

* added sigmoid unary op and incremented count
2024-05-13 11:02:26 +03:00
8dcefdf4a9 build: fix and ignore msvc warnings (ggml/805) 2024-05-13 11:02:26 +03:00
73d13ad19a ggml : expose SSE3 and SSSE3 for MSVC when AVX is available (#2128) 2024-05-08 18:33:43 +03:00
b6680fab50 build : improve disabling AVX-512 (#2129)
* cmake : make WHISPER_NO_AVX512=ON disable all subsets of AVX-512

Previously it happened only for MSVC, but it makes sense to have the
same behavior for other compilers too.

* make : reorder x86 ISA extensions in chronological order

And update compiler flags at the end to ease modifying conditions.

* make : support WHISPER_NO_AVX512=1 for disabling all AVX-512 subsets.

That way you do not have to override each AVX-512 subset setting
individually if it has been turned on during autodetection.
2024-05-08 18:32:43 +03:00
f760756078 minor: add CMakeSettings.json to gitignore (#2094) 2024-05-08 11:03:21 +03:00
58210d6a76 examples : fix node compilation (#2115)
* node : fix compilation and update examples

* node : fix readme

* Update addon.node test
2024-05-02 22:52:55 +01:00
8fac6455ff make : change GNU make default CXX from g++ to c++ (#2100) 2024-04-28 22:54:21 +01:00
22b6598cc9 Remove unnecessary memory reallocation in fft (#2080)
fft_out needs to be twice the frame_size, not the frame_step.  It is resized in fft() anyway, but this change prevents an unnecessary reallocation.

n_fft must match the mel filter size, so it is best not to calculate it from the frame size.

We only need to get the magnitudes for half the spectrum since the other half is a mirror and not used in the mel filter loop later.
2024-04-28 18:36:12 +01:00
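A sketch of the sizing rule described in that commit (names are illustrative): a real-input FFT over `frame_size` samples produces `frame_size` complex values stored as interleaved re/im pairs, so the output buffer needs `2*frame_size` floats, and only the first `frame_size/2 + 1` magnitude bins are unique because the rest mirror them.

```cpp
#include <cstddef>
#include <vector>

// Illustrative sizing only; no FFT is actually computed here.
void size_fft_buffers(size_t frame_size) {
    std::vector<float> fft_in (frame_size);     // windowed audio samples
    std::vector<float> fft_out(2 * frame_size); // interleaved re/im pairs
    const size_t n_mag = frame_size / 2 + 1;    // unique magnitude bins for a real input
    (void) fft_in; (void) fft_out; (void) n_mag;
}
```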
858452d58d models : disable old script (#2079) 2024-04-24 14:56:30 +03:00
7f85e1d7fd whisper : more prominent log message for sub-1s audio (#2065) 2024-04-24 14:46:06 +03:00
b0c3cbf2e8 main : pass nullptr when regex is empty (#2070) 2024-04-17 12:23:47 +03:00
a750868428 readme : add up-to-date repository for Python bindings (#2063)
README
2024-04-16 14:15:52 +03:00
7395c70a74 release : v1.5.5 2024-04-16 14:08:31 +03:00
9fab28135c server : add dtw (#2044)
* server.cpp: add dtw

* Update examples/server/server.cpp

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-04-15 22:16:58 +03:00
08d3eef97d build : fix embedded Metal library generation (#2045) 2024-04-15 20:23:05 +03:00
1b5439a6c2 node : support no timestamps (#2048)
* fix: node: do not compute timestamps if you do not need them

* feat: add no_timestamps parameter to node addon
2024-04-15 20:03:34 +03:00
c7f95b7ca2 build : detect AVX512 in Makefile, add AVX512 option in CMake (#2043)
* make : add AVX512 detection to Makefile and CMakeLists.txt

* make : autodetect more AVX512 instruction subsets

* cmake : do not default to AVX512, must be enabled explicitly

* cmake : enable a set of AVX512 subsets, when AVX512 is turned on

* make : consolidate AVX512 subsets, add AVX512 VBMI

* cmake : revert to NO AVX512 setting, add settings for AVX512 VNNI and VBMI

* make : re-introduce AVX512VNNI back

* cmake : remove superfluous comment line
2024-04-15 20:02:09 +03:00
5c554c04ff whisper.nvim : fix missing reference to "model" variable (#2049) 2024-04-15 19:41:28 +03:00
c383f091a1 whisper : update grammar-parser.cpp (#2058)
preceeding -> preceding
2024-04-15 19:40:27 +03:00
8f253ef3af sync : ggml 2024-04-09 20:27:55 +03:00
c7dc37f97c license : update copyright notice + add AUTHORS 2024-04-09 20:27:44 +03:00
526332873b llama : add Command R Plus support (llama/6491)
* Add Command R Plus GGUF

* Add Command R Plus GGUF

* Loading works up to LayerNorm2D

* Export new tensors in 1D so they are not quantized.

* Fix embedding layer based on Noeda's example

* Whitespace

* Add line

* Fix unexpected tokens on MPS. Re-add F16 fix. (Noeda)

* dranger003: Fix block index overflow in CUDA dequantizing.

* Reverted blocked multiplication code as it still has issues and could affect other Llama arches

* export norms as f32

* fix overflow issues during quant and other cleanup

* Type convention

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* dranger003: Fix more int overflow during quant.

---------

Co-authored-by: S <seast@Ss-Mac-Studio.local>
Co-authored-by: S <s@example.com>
Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-04-09 20:26:18 +03:00
1d2721ca72 remove row=1 cond (llama/6532) 2024-04-09 20:26:18 +03:00
219e601dab support/fix OPs GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M (llama/6521) 2024-04-09 20:26:18 +03:00
3b8aade3c2 scripts : update sync 2024-04-09 20:25:50 +03:00
52ccd4a3a8 files : rename ./extra to ./scripts 2024-04-09 20:13:41 +03:00
5275074d37 whisper : fix DTW memory access (#2012)
* Fix DTW memory access

* Memory fix - Apply changes from denersc
2024-04-09 18:38:19 +03:00
c15b4cda7d common : fix file-handle leak in read_wav() (#2026)
Now it cleans up in case of error.
2024-04-09 18:34:34 +03:00
d3cfb6ca2b main : set stdin to binary mode on Windows (#2025) 2024-04-09 18:33:32 +03:00
956ef860bc cmake : support for CPU BLAS build via Intel MKL (#2024) 2024-04-09 18:32:46 +03:00
671b4bde6c main : allow a response-file as the sole parameter (#2019)
* The "main" example now allows a response-file as the sole parameter.

A response-file is a text file with command-line parameters, one per line.
Prefix the name of the response-file with "@" to identify it as such.
It's used under MS Windows to work around command-line length limits.
It may be useful under other platforms to simplify character-escaping.

* minor : style

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-04-09 18:31:16 +03:00
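A minimal sketch of the response-file expansion described in the commit above (names are illustrative, not the actual main.cpp code), assuming one parameter per line and a leading '@' to mark the file:

```cpp
#include <fstream>
#include <string>
#include <vector>

static std::vector<std::string> expand_args(int argc, char ** argv) {
    std::vector<std::string> args;
    if (argc == 2 && argv[1][0] == '@') {
        // sole parameter is a response-file: read one command-line parameter per line
        std::ifstream in(argv[1] + 1);
        std::string line;
        while (std::getline(in, line)) {
            if (!line.empty()) {
                args.push_back(line);
            }
        }
    } else {
        for (int i = 1; i < argc; ++i) {
            args.push_back(argv[i]);
        }
    }
    return args;
}
```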
c8eeb93a6a whisper : suppress tokens with a regex (#1997)
* Allow a regular expression to describe tokens to suppress.

Example: --suppress-tokens-re "[,\.]|[ ]?[0-9]+" will suppress commas, periods, and numeric tokens.

Technique inspired by https://github.com/openai/whisper/discussions/1041

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Blind change to fix Java test.

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-04-09 18:27:28 +03:00
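A minimal sketch of the regex-based suppression idea above (the helper name and data layout are hypothetical, not whisper.cpp's internals): tokens whose text matches the pattern get their logits forced to negative infinity before sampling:

```cpp
#include <cmath>
#include <regex>
#include <string>
#include <vector>

static void suppress_tokens_by_regex(const std::string & pattern,
                                     const std::vector<std::string> & token_texts,
                                     std::vector<float> & logits) {
    const std::regex re(pattern);
    for (size_t i = 0; i < token_texts.size() && i < logits.size(); ++i) {
        if (std::regex_match(token_texts[i], re)) {
            logits[i] = -INFINITY; // token can no longer be sampled
        }
    }
}
```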
319fe5146e cmake : create solution folders (#2004)
* Create solution folders in the CMake build.

* Fixed non-SDL2 build.

* Fixed emscripten build.
2024-04-09 18:23:33 +03:00
13c22321d1 sync : ggml 2024-04-07 17:04:56 +03:00
ccbe9d5676 extra : sync grammar-parser 2024-04-07 17:04:22 +03:00
81a3c41aa0 talk-llama : sync llama.cpp 2024-04-07 16:21:08 +03:00
a50207c65d sync : ggml 2024-04-07 16:18:11 +03:00
97878e53fd sync : llama.cpp (skip)
ggml-ci
2024-04-07 16:15:57 +03:00
61b05815e0 Fixed minor bug when enabling FP16 for non intel targets (llama/6464)
* moved INTEL_MKL guard from gemm_impl to gemm (wrapper)

* Update ggml-sycl.cpp

Co-authored-by: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>

---------

Co-authored-by: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
2024-04-07 16:15:57 +03:00
1dce94cf26 ggml : mul_mat_id use the same tensor for all the experts (llama/6387)
* ggml : update mul_mat_id to use the same tensor for all the experts

* update cuda

* minor

* update metal

* update test-backend-ops

* fix cuda

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* update convert.py

* update convert-hf-to-gguf.py

* update convert.py for mixtral hf models

* Update convert-hf-to-gguf.py

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* cuda : support non-pow-2 number of experts

* allow quantize to work for split and merged experts models in the same way

* cleanup + disable mmap automatically with split tensors models

* update imatrix

* test-backend-ops : test qwen argsort

* update grok model loading

* llama : add merged experts tensors to the grok tensor map

* minor

* gguf : bump version

* fix quantizing of merged experts

* convert-hf-to-gguf.py : update grok (untested)

* make linter happy

* cuda/argsort : use shared memory instead of pool memory

* convert : fix grok tensor names

* metal : add support for non-pow-2 argsort

* llama : more loader cleanup, better error checking

* cuda : fix warning

* llama : still use mmap for loading old models, but copy the data to a host buffer

* add review note

* llama : remove ffn tensor counting + add sanity check

ggml-ci

* convert : fix handling of n_experts == None

ggml-ci

* imatrix : fix ncall counters

* llama : produce error if imatrix size does not match

* quantize : terminate on errors + trace logs

ggml-ci

* metal : pad shared memory to 16 bytes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-04-07 16:15:57 +03:00
f12e982c0b Disable iqx on windows as WA (llama/6435)
* disable iqx on windows as WA

* array instead of global_memory
2024-04-07 16:15:57 +03:00
fa966b9b40 Vulkan k-quant mmq and ggml-backend offload functionality (llama/6155)
* Fix Vulkan no kv offload incoherence

* Add k-quant mul mat mat shaders

* Rework working buffer allocation, reduces vram use noticeably

Clean up cpu assist code, replaced with ggml-backend offload function

* Default to all dedicated GPUs

* Add fallback for integrated GPUs if no dedicated GPUs are found

* Add debug info which device is allocating memory

* Fix Intel dequant issue

Fix validation issue

* Fix Vulkan GGML_OP_GET_ROWS implementation

* Clean up merge artifacts

* Remove Vulkan warning
2024-04-07 16:15:57 +03:00
b83a9fc9d3 fix set main gpu crash (llama/6339) 2024-04-07 16:15:56 +03:00
3adbf2fb03 ggml : fix bounds checking of zero size views (llama/6347) 2024-04-07 16:15:56 +03:00
700d146127 backend : fix typo in scheduler documentation (ggml/781)
Signed-off-by: Daniel Bevenius <daniel.bevenius@gmail.com>
2024-04-07 16:15:56 +03:00
a74fde9b4c extra : sync ggml-cuda folder 2024-04-07 16:10:44 +03:00
1d7657f409 ggml: bypass code incompatible with CUDA < 11.1 (#2020)
The `cudaHostRegisterReadOnly` flag was only introduced in CUDA 11.1 (a guard sketch follows below).

See this issue for more details:
https://github.com/ggerganov/whisper.cpp/issues/2007
2024-04-04 14:49:24 +02:00
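An illustrative sketch of the kind of guard described above (the wrapper name is hypothetical): `cudaHostRegisterReadOnly` only exists from CUDA 11.1 (CUDART_VERSION 11010), so older toolkits must fall back to registering without it:

```cpp
#include <cuda_runtime.h>

static cudaError_t register_host_buffer(void * ptr, size_t size) {
#if CUDART_VERSION >= 11010
    // CUDA >= 11.1: the read-only hint is available
    return cudaHostRegister(ptr, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
#else
    // CUDA < 11.1: register without the read-only hint
    return cudaHostRegister(ptr, size, cudaHostRegisterPortable);
#endif
}
```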
ac283dbce7 ci : add building in MSYS2 environments (Windows) (#1994) 2024-03-30 09:20:20 +02:00
1e8f28c42a build : use pkg-config for OpenBLAS (#1778)
* make : use pkg-config for finding CFLAGS & LDFLAGS needed by OpenBLAS

That way building on *nix like environments (including MSYS2 on Windows)
with WHISPER_OPENBLAS=1 works out of the box.

Fix handling of WHISPER_OPENBLAS, so that empty value or 0 won't be
misinterpreted by make as enabled.  Mind that it's not intended to
detect CMake false constants (OFF NO FALSE N).  make is not CMake.

By default OpenBLAS with 64-bit interface is used, but that can be
changed with `WHISPER_OPENBLAS_INTERFACE64=0` if 32-bit one is desired.

If OpenBLAS headers and library are respectively in include/ and lib/
subdirectories of given path, then you can specify it, e.g.
`OPENBLAS_PATH=/usr/local/openblas`, and this will take precedence over
any pkg-config file.

If there is no pkg-config file (.pc) for OpenBLAS and OPENBLAS_PATH is
empty, then headers are assumed to be in /usr/include/openblas and
library is assumed to be called 'openblas64' (or 'openblas' if
`WHISPER_OPENBLAS_INTERFACE64=0`).  If different headers location should
be used, then it can be done, e.g.
`WHISPER_BLAS_CFLAGS=-I/usr/local/include/openblas`.
If different library should be used, it can be specified, e.g.
`WHISPER_BLAS_LIB=openblasp64` (pthreads version as seen on Fedora), or
you can provide LDFLAGS needed to link with OpenBLAS directly:
`WHISPER_BLAS_LDFLAGS="-L/usr/local/lib/openblas -lopenblas64"`.

Current solution is flexible enough to handle most cases out there
without needlessly hardcoding possible OpenBLAS installation details.

* cmake : fix how pkg-config is used for finding include dirs and libraries needed by OpenBLAS

That way building on *nix like environments (including MSYS2 on Windows)
with -DWHISPER_OPENBLAS=ON should work out of the box as long as you
have CMake 3.25 or newer.

Make OPENBLAS_PATH environment variable supported not only on Windows.
It sets OpenBLAS include dir to ${OPENBLAS_PATH}/include and library to
${WHISPER_BLAS_LIB} (name without prefixes and suffixes) in
${OPENBLAS_PATH}/lib and avoids further package finding.

By default OpenBLAS with 64-bit interface is used (equivalent to setting
`-DWHISPER_BLAS_LIB=openblas64`), but that can be changed with
`-DWHISPER_OPENBLAS_INTERFACE64=OFF` (equivalent to setting
`-DWHISPER_BLAS_LIB=openblas`) if 32-bit one is desired.

Turn on BLA_STATIC for FindBLAS only when WHISPER_STATIC is enabled.
BLA_STATIC may not work as expected for pkg-config based operation.

Get rid of supporting BLAS_HOME environment variable.  If OPENBLAS_PATH
is insufficient in your case, there is no pkg-config file to rely on,
then you can manually specify include dir, e.g.
`-DBLAS_INCLUDE_DIRS=/usr/local/include/openblas`, and library, e.g.
`-DBLAS_LIBRARIES=/usr/local/lib/libopenblas.so`.

* make / cmake : use OpenBLAS with 32-bit interface by default.

OpenBLAS w/o INTERFACE64=1 or USE_64BITINT=1 seems to be more common.

* cmake : hardcode "lib" prefix for OpenBLAS lib filename (even on Windows)

* cmake : hardcode OpenBLAS library name when building in MSVC (Windows)

Most *nix like environments (including MSYS2 on Windows) have OpenBLAS
packages that allow coexistence of OpenBLAS builds with 32-bit and
64-bit interface (w/o and w/ OPENBLAS_USE64BITINT defined) and they
differ by not having or having "64" suffix in their library filenames.
That's not the case for OpenBLAS prebuilt libraries for Windows.
2024-03-29 15:53:26 +02:00
fc366b807a main : add command-style grammar (#1998)
* Implemented command-style grammar in the main example.

Mostly just copied the relevant parts from the command example.

* main : code style

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-28 12:02:10 +02:00
9fb308d90f make : add grammar parser to common objects 2024-03-28 11:59:48 +02:00
2948c740a2 sync : ggml (#2001)
* sync : update scripts

* sync : ggml

* talk-llama : sync llama.cpp

* make : WHISPER_CUBLAS -> WHISPER_CUDA

* ci : try to fix sycl build

* talk-llama : fix make build
2024-03-27 18:55:10 +02:00
1558ec5a16 whisper : improve handling of prompts (#1981)
* whisper : improve handling of prompts

* whisper : add whisper_token_count helper
2024-03-25 14:48:19 +02:00
fff24a0148 whisper : improve support for distil-large-v3 (#1982) 2024-03-21 18:53:30 +02:00
48a145207e ruby : fix build (#1980) 2024-03-21 07:40:09 +02:00
79d5765e7e docker : libcuda.so.1 in PATH (#1966) 2024-03-20 18:45:15 +02:00
04e48094e4 readme : add Fedora dependencies (#1970)
* README.md

fix documentation and added Fedora Linux dependencies for stream build

* fix documentation and added Fedora Linux dependencies for command build

* fix documentation and added Fedora Linux dependencies for talk build

* fix documentation and added Fedora Linux dependencies for talk-llama build

* restored the mistakenly removed macOS documentation
2024-03-20 18:42:11 +02:00
741abb162c whisper : token-level timestamps with DTW (#1485)
* whisper.cpp: impl dtw algo

* WIP: producing and placing DTW timestamps on tokens

* Fix compile and assertion errors. Attempt to DTW timestamp with single_segment=false.

* Fix mistake causing incorrect alignment of dtw timestamps

* implement N_TOP_MOST and CUSTOM alignment heads setting

* whisper: fix typo on alignment heads enum

* Fix issues related to changes in whisper.cpp

* Fixed excessive memory use when using DTW timestamps. Other minor fixes to DTW timestamping function

* decoder: save cross QKs only if requested

* Calling median filter with ggml_map_custom1

* Reimpl aheads n_top_most and custom. Sanity checks on chosen aheads

* Copying cross QKs from decoder backend correctly

* dtw: cleanup

* Fix incorrect n_frames passed to dtw when near end of audio

* Fix aheads_masks_init for backend != CPU

* whisper : minor style

* main : add dtw (wip)

* whisper: fix invalid memory access in aheads_masks_init

* main : add dtw (cont)

* whisper : minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-20 18:25:26 +02:00
e7794a868f examples : rename --audio-context to --audio-ctx per help text (#1953) 2024-03-18 17:53:33 +02:00
725350d4ea whisper : set outputs from conv graph (#1959) 2024-03-16 17:30:55 +02:00
906c73b219 alloc : fix allocation data of pre-allocated leafs 2024-03-16 17:15:45 +02:00
00d80ff965 cmake : copy ggml-common.h to bin 2024-03-16 17:15:44 +02:00
1b553b9817 gitignore : .vimspector.json 2024-03-16 16:26:35 +02:00
de4d067f1e talk-llama : sync llama.cpp 2024-03-15 14:21:59 +02:00
e715f6a601 sync : ggml 2024-03-15 14:12:19 +02:00
f60ccfd83b update examples and tests 2024-03-15 14:01:14 +02:00
3753a2b2a8 ggml : add ggml-common.h 2024-03-15 14:01:14 +02:00
592dd25615 ggml : designate enum vals for integer types (llama/6050) 2024-03-15 14:01:14 +02:00
c8709d4604 metal : build metallib + fix embed path (llama/6015)
* metal : build metallib + fix embed path

ggml-ci

* metal : fix embed build + update library load logic

ggml-ci

* metal : fix embeded library build

ggml-ci

* ci : fix iOS builds to use embedded library
2024-03-15 14:01:14 +02:00
8932c2d6ce llama : add pipeline parallelism support (llama/6017)
* llama : add pipeline parallelism support for batch processing with multiple CUDA GPUs

ggml-ci

* server : add -ub, --ubatch-size parameter

* fix server embedding test

* llama : fix Mamba inference for pipeline parallelism

Tested to work correctly with both `main` and `parallel` examples.

* llama : limit max batch size to n_batch

* add LLAMA_SCHED_MAX_COPIES to configure the number of input copies for pipeline parallelism
default increase to 4 (from 2)

changing this value may improve performance for some systems, but increases memory usage

* fix hip build

* fix sycl build (disable cpy_tensor_async)

* fix hip build

* llama : limit n_batch and n_ubatch to n_ctx during context creation

* llama : fix norm backend

* batched-bench : sync after decode

* swiftui : sync after decode

* ggml : allow ggml_get_rows to use multiple threads if they are available

* check n_ubatch >= n_tokens with non-causal attention

* llama : do not limit n_batch to n_ctx with non-causal attn

* server : construct batch with size of llama_n_batch

* ggml_backend_cpu_graph_compute : fix return value when alloc fails

* llama : better n_batch and n_ubatch comment

* fix merge

* small fix

* reduce default n_batch to 2048

---------

Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-15 14:01:13 +02:00
2bddfdd7c8 Update get version (llama/6025) 2024-03-15 14:01:13 +02:00
46e3c3f112 ggml : reuse quantum structs across backends (llama/5943)
* ggml : reuse quant blocks across backends

ggml-ci

* ggml : define helper constants only for CUDA and SYCL

ggml-ci

* ggml : define helper quantum constants for SYCL

ggml-ci
2024-03-15 14:01:13 +02:00
ef24ae0c7d ggml : fix UB in IQ2_S and IQ3_S (llama/6012) 2024-03-15 14:01:13 +02:00
a753926f02 sycl : update IQ1_S kernels (WIP - not working!) (llama/5995)
* sycl : try to fix after IQ1_S changes

* sycl : iq1s_grid -> iq1s_grid_gpu

* sycl : fix grid type
2024-03-15 14:01:13 +02:00
9dc60fc02d 1.5 bit: we can do even better (llama/5999)
* iq1_s: we can do even better

Spent one of the 4 scale bits on the sign of a 0.125 shift (sketched after this commit).
I.e., quants are now -1 + delta, delta, 1 + delta, where delta
is +/- 0.125.

CUDA works, same performance as before.
PPL(LLaMA-v2-7B) is now 11.85!

* iq1_s: make scalar and AVX2 work with the new version

* iq1_s: make Neon work with new version.

~10% drop in performance, so will need some more work.

* iq1_s: make Metal work with new version

* iq1_s: very slightly faster dequantize on Metal

* iq1_s: fix dequantize on the CPU

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2024-03-15 14:01:13 +02:00
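An illustrative sketch of the shifted 1.5-bit grid described in the first bullet above (not the actual IQ1_S kernels): each quant is one of {-1, 0, +1} plus a per-block delta of ±0.125 selected by the repurposed scale bit:

```cpp
// q is the unshifted quant value (-1, 0 or +1); sign_bit is the bit spent on the shift.
static inline float iq1s_shifted_value(int q, int sign_bit) {
    const float delta = sign_bit ? -0.125f : 0.125f;
    return (float) q + delta; // yields -1 + delta, delta, or 1 + delta
}
```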
d73a63629e ggml, ci : Windows ARM runner and build fixes (llama/5979)
* windows arm ci

* fix `error C2078: too many initializers` with ggml_vld1q_u32 macro for MSVC ARM64

* fix `warning C4146: unary minus operator applied to unsigned type, result still unsigned`

* fix `error C2065: '__fp16': undeclared identifier`
2024-03-15 14:01:13 +02:00
f79d0d4f74 Better 1.5 bit quantization (llama/5971)
* Trying blocks of 16 for IQ1_S - seems slightly better

* iq1s_blocks16: Adjust scale fudge factor to 1.125

* iq1s_blocks16: going to blocks of 32

with 2048 lattice points, so same bpw.
This is even better than blocks of 16.
Should I try blocks of 64? But to keep the same
bpw, when I go to 4096 lattice points, I need to
remove blocks altogether and just have superblocks of
256 weights.

* iq1s_blocks16: Use 2*<x^2> as sigma2 in weight adjustment

* iq1s_blocks16: scalar and AVX2 dot products

* iq1s_blocks16: CUDA dot product

* iq1s_blocks16: Metal works, Neon does not

Metal works but TG is dog slow (35 t/s). PP is OKish (493 t/s).
Not seeing the bug in the Neon implementation for now.

* iq1s_blocks16: fixed Neon

* iq1s_blocks16: very slightly faster TG on Metal

Still pathetic at 37 t/s

* iq1s_blocks16: speedup Metal by packing codebook into uint32_t's

* Formatting

* iq1s_blocks16: uint32_t codebook is also better in CUDA

TG-128 is now 204 t/s up from 194 t/s.
PP-512 is 5890 t/s, so significantly better than other quants

* iq1s_blocks16: slightly faster Neon dot product

* iq1s_blocks16: faster AVX2 dot product

* iq1s_blocks16: adjust to ggml-common.h

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2024-03-15 14:01:12 +02:00
4f88940ff6 Add q3_s and q1_s (llama/5886)
* Add q3_s and q1_s

* fix compilation

* fix build

* fix build

* fix build

* enable ops

* rm macro

* increase grid space
2024-03-15 14:01:12 +02:00
7bdb1de9ec metal : move mm_id indices to shared mem (llama/5982) 2024-03-15 14:01:12 +02:00
653d2e8ff9 ggml : fix unnecessary f32 -> f16 -> f32 casts (mmla) (llama/5951) 2024-03-15 14:01:12 +02:00
2fef660d0a ggml : remove old quantization functions (llama/5942)
* ggml : remove old quantization functions

ggml-ci

* ggml : simplify ggml_quantize_chunk

ggml-ci

* ggml : restrict correctness

ggml-ci

* ggml : remove hist data from the quantization API

ggml-ci

* tests : remove hist usage in test-backend-ops

ggml-ci

* vulkan : remove hist and fix typo
2024-03-15 14:01:12 +02:00
24eba5a2ff ggml : add ggml-common.h to deduplicate shared code (llama/5940)
* ggml : add ggml-common.h to shared code

ggml-ci

* scripts : update sync scripts

* sycl : reuse quantum tables

ggml-ci

* ggml : minor

* ggml : minor

* sycl : try to fix build
2024-03-15 14:01:12 +02:00
6e9d3aa32d llama : support Mamba Selective State Space Models (llama/5328)
* mamba : begin working on support for Mamba SSM

* mamba : begin figuring out how to (ab)use the kv cache for Mamba

* mamba : recurrent inference almost works, but incoherent

* mamba : recurrent inference WORKS!!!

* convert : optionally use d_conv and d_state from config.json for Mamba

* mamba : refactor recurrent conv, resulting in 20% perf increase

It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.

I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.

* ggml : parallelize ggml_exp

This results in 8% faster token generation for Mamba-130M.

* mamba : simplify the conv step with a self-overlapping view

Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.

Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.

Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).

* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32

Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.

* mamba : fix self-overlapping view depth stride

* mamba : handle batches of more than 1 token

This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.

Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.

* ggml: add ggml_ssm_scan to help with parallel selective scan

If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.

* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation

This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.

* mamba : very basic quantization support

Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)

Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.

Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.

* convert : fix wrong name for layer norm weight of official Mamba models

I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")

* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator

This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.

However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.

* convert : for Mamba, also consider the "MambaLMHeadModel" arch name

It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json

* mamba : fix vocab size problems with official models

The perplexity was way too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.

Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.

* ggml : remove ggml_exp and ggml_soft_plus

They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.

* mamba : remove some useless comments

No code change.

* convert : fix flake8 linter errors

* mamba : apply suggestions from code review

* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication

It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.

* ggml : in ggml_ssm_scan, use more appropriate asserts

* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32

* mamba : multiple sequences, but one at a time

This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).

The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)

Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.

* mamba : support llama_kv_cache_seq_cp

This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.

Each KV cell is dedicated to the sequence ID corresponding to its own index.

* mamba : use a state mask

It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.

inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).

* llama : replace the usage of n_ctx with kv_self.size in many places

* mamba : use n_tokens directly instead of n_tok

* mamba : in comments, properly refer to KV cells instead of slots

* mamba : reduce memory usage of ggml_ssm_scan

From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.

The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.

* mamba : simultaneous sequence processing

A batch can now contain tokens from multiple sequences.

This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.

However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.

* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba

This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).

Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.

Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.

* llama : add inp_s_seq as a new input tensor

The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.

The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.

Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).

* mamba : support llama_kv_cache_seq_cp copy chains

* mamba : support shifting and dividing the kv cache pos

* mamba : make the server and parallel examples work with whole sequences

A seq_id is dedicated to the system prompt in both cases.

* llama : make llama_kv_cache_seq_rm return whether it succeeded or not

* mamba : dedicate an input tensor for state copy indices

This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.

* mamba : adapt perplexity, batched, and batched-bench examples

* perplexity : limit the max number of sequences

This adapts to what the loaded model can provide.

* llama : add llama_n_max_seq to get the upper limit for seq_ids

Used by the perplexity example.

* batched : pass n_parallel to the model's context params

This should have been there already, but it wasn't.

* batched-bench : reserve sequences to support Mamba

* batched-bench : fix tokens being put in wrong sequences

Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.

* mamba : stop abusing attention metadata

This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.

This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
 will not require breaking existing converted Mamba models again)

* gguf-py : add new KV metadata key-value pairs for Mamba

* llama : add new metadata key-value pairs for Mamba

* llama : guard against divisions by zero when n_head is 0

* mamba : rename "unlimited" KV cache property to "recurrent"

* mamba : more correctly update the "used" field of the KV cache

* ggml : in ggml_ssm_scan, use a threshold for soft_plus

This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.

* convert : for Mamba, fallback to internal NeoX tokenizer

The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.

* mamba : support state saving and restoring

* ggml : implicitly pass src tensors through dst for Mamba-related ops

* mamba : clarify some comments

* server : fix cache_tokens not getting correctly resized

Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.

For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.

* convert-hf : support new metadata keys for Mamba

For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406

* mamba : rename metadata to be more similar to transformers library

This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".

* mamba : support mamba-*-hf models

These models share their token_embd.weight with their output.weight

* mamba : add missing spaces

This is purely a formatting change.

* convert-hf : omit output.weight when identical with token_embd.weight

Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.

* readme : add Mamba to supported models, and add recent API changes

* mamba : move state_seq and state_mask views outside layer loop

A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-15 14:01:12 +02:00
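For background on what the `ggml_ssm_scan` operator introduced in the commit above evaluates, the standard selective state-space (Mamba/S6) recurrence is, per channel (a summary of the published formulation, not text from the commit):

$$
h_t = e^{\Delta_t A}\, h_{t-1} + \Delta_t B_t\, x_t, \qquad y_t = C_t\, h_t
$$

Here $h_t$ is the recurrent ssm_state, $x_t$ is the input after the conv step, and $\Delta_t$, $B_t$, $C_t$ are input-dependent. As noted above, the fused operator returns only $y_t$ and the final $h_t$ rather than all intermediate states, which is what shrinks the CPU compute buffer.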
9ae0d18856 extra : update sync scripts after ggml-common.h 2024-03-15 14:00:53 +02:00
a56f435fd4 whisper : document whisper_batch.n_seq_id (#1942)
To prevent other people from attempting to remove it, as I did.
2024-03-10 16:55:22 +02:00
ec166499d8 whisper : improve beam search candidate diversity (#1947)
As of #1486, whisper.cpp uses a unified KV cache with KQ masking.
As a result, depending on their location in the batch,
identical sequences in a batch can have slightly different outputs
due to floating point rounding errors during reduction.
See the discussion in #1941 for more details.

The beam search code used "has identical sum of log probabilities"
as a shorthand for "is an identical token sequence". However, per above,
identical tokens do not necessarily result in identical probabilities.

Instead, explicitly compare on sequences.
This is linear in cost when they are identical,
but the lengths are always small and the comparisons are cheap.

This increases diversity during beam search.

This improves output quality for some short samples I've been working
with, at no detectable performance cost.
I haven't checked against larger corpuses.

Fixes #1941
2024-03-10 16:54:43 +02:00
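A minimal sketch of the comparison change described above (types and names are hypothetical, not the actual whisper.cpp structs): candidates are treated as duplicates only when their token sequences are identical, rather than when their summed log-probabilities happen to coincide:

```cpp
#include <vector>

struct beam_candidate {
    std::vector<int> tokens;       // decoded token ids so far
    double           sum_logprobs;
};

// Explicit sequence comparison: linear in the (small) sequence length, and immune to
// the floating-point reduction-order differences that can make the summed log-probs
// of identical sequences differ slightly across batch positions.
static bool is_duplicate(const beam_candidate & a, const beam_candidate & b) {
    return a.tokens == b.tokens;
}
```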
ccf022f970 bindings/go : add linker flags to make metal work (#1944)
The first two are required to build.
The last one is to make it actually detect the GPU.

Fixes #1899, at least for me
2024-03-09 18:50:44 +02:00
2852e1af55 whisper : make beam candidate sort more stable (#1943)
All else being otherwise equal, this encourages the beam candidate
selection to re-use the same decoder, which slightly
reduces the cache size.

I wouldn't expect it to make much of a performance difference,
but it helps when debug printing the cache and beam.

Added as part of understanding #1941.
2024-03-09 18:50:03 +02:00
ce945b50c3 ggml : try fix 32-bit arm compat (#1938)
* ggml : try fix 32-bit arm compat

* ggml : fix cont
2024-03-08 23:45:07 +02:00
2f5a5a66dd talk-llama : use llama_decode instead of llama_eval 2024-03-08 12:04:43 +02:00
8e409d1113 talk-llama : sync llama.cpp 2024-03-08 11:55:50 +02:00
05d1b61af4 talk-llama : sync llama.cpp 2024-03-08 11:52:47 +02:00
647cae178a sync : ggml 2024-03-08 11:39:34 +02:00
bae7c23fbf Revert "[SYCL] fix error when set main gpu to non-zero (llama/5901)" (llama/5918)
This reverts commit ceca1aef0738b57951cd12c603c3477e75312dec.
2024-03-08 11:38:33 +02:00
18ea187d42 fix error when set main gpu to non-zero (llama/5901)
* fix error when set main gpu to non-zero

* fix delete condition
2024-03-08 11:38:33 +02:00
1daeffca54 ggml : use SYS_get_cpu if SYS_getcpu is not defined (llama/5906)
Fixes #5694
Fixes ggerganov/whisper.cpp#1894
2024-03-08 11:38:33 +02:00
2f6f1d4465 ggml : use uint8x16_t return type for ggml_vqtbl1q_u8 (llama/5894)
* use uint8x16_t

* Update ggml-quants.c
2024-03-08 11:38:33 +02:00
7ff1894c34 add wait() to make code stable (llama/5895) 2024-03-08 11:38:33 +02:00
8edfc54c2b quants : use MM256_SET_M128I consistently to fix gcc 7 build (llama/5889) 2024-03-08 11:38:33 +02:00
9c399689ec Vulkan Improvements (llama/5835)
* Improve dequant shaders, add fast q4_0 dequant

* Optimize dmmv non-kquants for GCN

Remove unnecessary SPIR-V shader duplication

* Fix q4_0 dequant dispatch sizes

Fix backend free bug

* Optimize dequant shaders for q4_1, q5_0, q5_1 and q8_0

* Add unary and binary op shader templates

* Fix Vulkan check results

* Enable non-contiguous support for simple ops

* Add argsort

Basic q4_0 mmq shader and unit test

* Speed up q4_0 dequant code, enable mmq for q4_0

* Rework matmul pipeline selection

* Add soft_max alibi support

* Add q4_1, q5_0, q5_1 and q8_0 dequant mat mat mul shaders

* Add environment variable GGML_VK_FORCE_MAX_ALLOCATION_SIZE to limit max buffer size

Rename GGML_VULKAN_DISABLE_F16 to GGML_VK_DISABLE_F16 for consistency
2024-03-08 11:38:33 +02:00
9d9a405cfd fix mul_mat fault in CI/unit-test (llama/5862)
* fix mul_mat fault in cpy_f32_f16

* rm unused function

* add wait() for memcpy

* restore ci/run.sh, rename struct definition, fix bug in ggml_sycl_op_mul_mat_sycl

* fix format issue

* llama : fix segfault from unknown model arch name (llama/5820)

* llama : fix segfault from unknown model arch name

* llama : make all LLM maps const

This also requires using `std::map::at` instead of its `operator[]`
which does not exist for const maps.

* llama : name LLM_ARCH_UNKNOWN to "(unknown)"

This avoids errors from `std::map::at` when
getting the general name of the model architecture.
Using "(unknown)" instead of an empty string as per suggestion
https://github.com/ggerganov/llama.cpp/pull/5820#issuecomment-1973735284

* llama : remove redundant inner const for LLM_TENSOR_NAMES

The extra const won't do anything here as const maps
return const references to values.

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

* llama : remove redundant nullptr check in llm_arch_from_string

Since LLM_ARCH_NAMES is a const map, no spurious elements
with a NULL name are inserted anymore, so this check is dead code.

---------

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

* llama : refactor internal quantization functions (llama/5830)

* scripts : add pod-llama.sh

* ggml : IQ3_S improvements (llama/5829)

* iq3_s: somewhat faster AVX2 dot product

On a Ryzen 7950X, TG-128 increases to 16 t/s from 15.5 t/s using
16 threads. For 8 threads it is 13.85 t/s vs 11.75 t/s.
PP-512 increases to 28.5 t/s from 23.8 t/s.

* iq3_s: somewhat faster ARM_NEON dot product

Still dog slow - 10.7 t/s up from 9.9 t/s.

* iq3_s: another small ARM_NEON improvement

10.7 -> 11.0 t/s. Using vmulq_s8 is faster than the xor - sub trick
that works best on AVX2.

* iq3_s: minor improvement on Metal

49.4 t/s -> 50.3 t/s

* iq3_s: PPL improvement

E.g., for a context of 4096 LLaMA-v2-7B goes to 5.1340 from 5.1653.

* iq3_s: use new grid everywhere

* Fix ARM_NEON

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>

* convert-hf : make model class definitions self-contained (llama/5825)

* convert : automatically fall back to HfVocab if tokenizer.model doesn't exist (llama/5821)

* ggml : fix IQ3_S AVX implementation (llama/5834)

ggml-ci

* llama : add abort_callback to interrupt computation (llama/5409)

* using abort_callback from ggml to stop llama computation

* format fix

* a brief explaining comment

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* server: tests: passkey challenge /  self-extend with context shift demo (llama/5832)

* server: tests: add models endpoint scenario

* server: /v1/models add some metadata

* server: tests: add debug field in context before scenario

* server: tests: download model from HF, add batch size

* server: tests: add passkey test

* server: tests: add group attention params

* server: do not truncate prompt tokens if self-extend through group attention is enabled

* server: logs: do not truncate log values

* server: tests - passkey - first good working value of nga

* server: tests: fix server timeout

* server: tests: fix passkey, add doc, fix regex content matching, fix timeout

* server: tests: fix regex content matching

* server: tests: schedule slow tests on master

* server: metrics: fix when no prompt processed

* server: tests: self-extend add llama-2-7B and Mixtral-8x7B-v0.1

* server: tests: increase timeout for completion

* server: tests: keep only the PHI-2 test

* server: tests: passkey add a negative test

* flake.lock: Update (llama/5842)

Flake lock file updates:

• Updated input 'flake-parts':
    'github:hercules-ci/flake-parts/b253292d9c0a5ead9bc98c4e9a26c6312e27d69f' (2024-02-01)
  → 'github:hercules-ci/flake-parts/f7b3c975cf067e56e7cda6cb098ebe3fb4d74ca2' (2024-03-01)
• Updated input 'flake-parts/nixpkgs-lib':
    'github:NixOS/nixpkgs/97b17f32362e475016f942bbdfda4a4a72a8a652?dir=lib' (2024-01-29)
  → 'github:NixOS/nixpkgs/1536926ef5621b09bba54035ae2bb6d806d72ac8?dir=lib' (2024-02-29)
• Updated input 'nixpkgs':
    'github:NixOS/nixpkgs/cbc4211f0afffe6dfd2478a62615dd5175a13f9a' (2024-02-23)
  → 'github:NixOS/nixpkgs/1536926ef5621b09bba54035ae2bb6d806d72ac8' (2024-02-29)

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>

* server : init http requests thread pool with --parallel if set (llama/5836)

* ci : schedule slow server tests only on Release or on demand (llama/5839)

* llama : fix llama_copy_state_data with fragmented KV cache (llama/5840)

The row size of the saved states was based on kv_self.head while
it should be based on llama_kv_cache_cell_max.

Existing session files should still work.

* llama : fix llama_kv_cache_cell_max inability to return 1

I've also changed its return type to uint32_t,
because this function is always used to set the value of uint32_t variables,
and because the index already has this type.

* llama : fix state size calculation

Some bytes in the state were unaccounted for in llama_get_state_size.
Since the logits reserve so much space, it did not cause problems.

* gguf-dump : support i-quants (llama/5841)

Co-authored-by: Black_Fox <radekliska@gmail.com>

* llama : allow for user specified embedding pooling type (llama/5849)

* allow for user specified pooling type

* llama : use enum types over int

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* readme : add API changes section

* cuda : fix data race in soft max (llama/5853)

* main : support special tokens as reverse/anti prompt (llama/5847)

* Support special tokens as reverse/anti prompt.

* Tokenize antiprompts only once.

* main : minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* common : use LLAMA_DEFAULT_SEED (llama/5855)

* add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)

* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embedding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>

* sync : ggml

* add alias for chat template (llama/5858)

* speculative : implement stochastic speculative sampling (llama/5625)

* (WIP) Implement stochastic speculative decoding

* sample from residual distribution on draft accept failure

* fix #5657: force greedy sampling with probs when temp is 0

* remove p_accept parameter

* fix style

* remove unused variables

* add srand() in speculative.cpp

* replace use of rand() with mt19937 sampling

* fixes based on review (@JohannesGaessler)

* fix r random generation

* randomly select next sequence to verify + fix bug in memory freeing

* fix bug in active_seqs sync

* fix uniform int distribution initialization

* remove warnings from comparison between int and size_t

* check grammar in `llama_sample_probability_distribution_impl`

* remove malloc code by utilizing vectors

* add PR link to README

* cmake : handle cases where git index is not found in .git (llama/5844)

* Update CMakeLists.txt

* Update CMakeLists.txt

* ggml : introduce ggml_status (ggml/750)

* using enum as an exit code instead of macros

* update return type from enum to unsigned int

* indentation fix

* compound update
ggml_compute_exit_code -> ggml_status
changed ggml_status from a bit-field type to simple codes
ggml_status to string cast

* ggml_status to string cast

* GGML_CALL was removed

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* sync : ggml

ggml-ci

* ggml : fix unknown status (llama/0)

* flake : fix

* llama : fix embeddings (llama/5796)

* llama : fix embeddings

ggml-ci

* llama : do not use KV cache for non-causal models

ggml-ci

* embeddings : fix llama_batch_init arg

* llama : add pooling switch

* llama : distinguish token vs sequence embeddings

ggml-ci

* llama : assert pooling tensor

* llama : simplify causal mask condition

ggml-ci

* llama : assert input batch with pooling enabled

* readme : update API changes list

* nix: static build (llama/5814)

* fix speculative decoding build on windows (llama/5874)

* rebase and rm tailing space

---------

Co-authored-by: LiangtaoJin <liang-tao.jin@intel.com>
Co-authored-by: compilade <113953597+compilade@users.noreply.github.com>
Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Michael Podvitskiy <podvitskiymichael@gmail.com>
Co-authored-by: Pierrick Hymbert <pierrick.hymbert@gmail.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Nindaleth <Nindaleth@users.noreply.github.com>
Co-authored-by: Black_Fox <radekliska@gmail.com>
Co-authored-by: Douglas Hanley <thesecretaryofwar@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: DAN™ <dranger003@gmail.com>
Co-authored-by: leejet <leejet714@gmail.com>
Co-authored-by: Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
Co-authored-by: Dane Madsen <dane_madsen@hotmail.com>
Co-authored-by: hutli <6594598+hutli@users.noreply.github.com>
Co-authored-by: Jeffrey Quesnelle <emozilla@nousresearch.com>
2024-03-08 11:38:32 +02:00
edd8b38a75 ggml : fix unknown status (llama/0) 2024-03-08 11:38:32 +02:00
ed76818700 whisper : fix compute helper return (ggml/750) 2024-03-08 11:38:32 +02:00
9a0b59d990 ggml : introduce ggml_status (ggml/750)
* using enum as an exit code instead of macros

* update return type from enum to unsigned int

* indentation fix

* compound update
ggml_compute_exit_code -> ggml_status
changed ggml_status from a bit-field type to simple codes
ggml_status to string cast

* ggml_status to string cast

* GGML_CALL was removed

Co-authored-by: slaren <slarengh@gmail.com>

---------

Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-08 11:38:32 +02:00
93a84a143b cuda : fix data race in soft max (llama/5853) 2024-03-08 11:38:32 +02:00
bd26876267 ggml : fix IQ3_S AVX implementation (llama/5834)
ggml-ci
2024-03-08 11:38:32 +02:00
21d295180d ggml : IQ3_S improvements (llama/5829)
* iq3_s: somewhat faster AVX2 dot product

On a Ryzen 7950X, TG-128 increases to 16 t/s from 15.5 t/s using
16 threads. For 8 threads it is 13.85 t/s vs 11.75 t/s.
PP-512 increases to 28.5 t/s from 23.8 t/s.

* iq3_s: somewhat faster ARM_NEON dot product

Still dog slow - 10.7 t/s up from 9.9 t/s.

* iq3_s: another small ARM_NEON improvement

10.7 -> 11.0 t/s. Using vmulq_s8 is faster than the xor - sub trick
that works best on AVX2.

* iq3_s: minor improvement on Metal

49.4 t/s -> 50.3 t/s

* iq3_s: PPL improvement

E.g., for a context of 4096 LLaMA-v2-7B goes to 5.1340 from 5.1653.

* iq3_s: use new grid everywhere

* Fix ARM_NEON

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2024-03-08 11:38:32 +02:00
c3bfc9bfda Support multiple GPUs (split mode) on SYCL backend (llama/5806)
* support multiple cards: split-mode - layer|row

* rm warning

* rebase with master, support two new OPs, close feature for -sm=row, fix for unit test

* update news

* fix merge error

* update according to review comments
2024-03-08 11:38:32 +02:00
422a6b16fc ggml-vulkan: fix VULKAN_CHECK_RESULTS flag, which was previously broken (llama/5813) 2024-03-08 11:38:32 +02:00
11dd0d4482 Use batched mul_mat pathway (llama/5591)
* Use batched mul_mat pathway

* rm extra line

* Explicitly state scaled data type

---------

Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
2024-03-08 11:38:31 +02:00
26dd2f06ac make portability_enumeration_ext apple only (llama/5757) 2024-03-08 11:38:31 +02:00
8cee7c08b6 add some new ops, fix some operators and add batch operations to certain operators. (ggml/747)
* cuda: fix group_norm

* cuda: add batch inference support for ggml_pad/ggml_upscale

* add ggml_arange

* add ggml_timestep_embedding

* update ggml_arange/ggml_timestep_embedding tests

* cuda: fix im2col

* add ggml_arange/ggml_timestep_embedding support for metal backend

* fix some bugs

* fix some bugs

* Update ggml.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-cuda.cu

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-metal.metal

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* modify according to the review comments

* ggml : fix compile warnings + code style

* ggml : normalize compute_forward calls + fix seg fault in debug

* minor

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
2024-03-08 11:38:31 +02:00
2e2626b167 examples : Auto lowercase language parameter in main.cpp (#1928)
* Auto lowercase language parameter

* Update examples/main/main.cpp

Co-authored-by: bobqianic <129547291+bobqianic@users.noreply.github.com>

---------

Co-authored-by: bobqianic <129547291+bobqianic@users.noreply.github.com>
2024-03-06 22:25:10 +00:00
c0c0ae2dea examples : fix typo in bench.cpp (#1933) 2024-03-06 22:21:44 +00:00
897412b5b6 whisper : fix typo (#1925) 2024-03-05 17:06:31 +02:00
f22d27a385 whisper.android.java : fix returns in JNI (#1929) 2024-03-05 15:59:26 +02:00
ccd7c1d2da cmake : add library versioning (#1352)
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-03-04 21:17:48 +02:00
c713eb5e2a readme : recommend MacOS Sonoma for Core ML (#1917) 2024-03-04 21:16:13 +02:00
25d313b38b talk-llama : sync llama.cpp 2024-02-28 13:04:05 +02:00
3168dbf23b sync : ggml 2024-02-28 13:01:33 +02:00
1711bb3881 sync : llama.cpp (ggml/0) 2024-02-28 13:00:30 +02:00
2533305596 ggml : make i-quants work with super-blocks of 64 (CPU,Metal) (llama/5760)
* WIP: make i-quants work for QK_K = 64

* iq2_xs: attempt to fix AVX dot product for QK_K = 64

Tests pass, but I get gibberish.

* QK_K = 64 tests pass on ARM_NEON and Metal

Sadly, that does not mean it actually works.

* Make CUDA compile with QK_K = 64

Tests don't pass, plus we get misaligned access

* Q2_K: fixed bug in imatrix quantization for QK_K = 64

* iq1_s: turn off SIMD implementation for QK_K = 64 (it does not work)

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2024-02-28 13:00:30 +02:00
0eca512ac8 Attempt to fix android build (llama/5752)
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2024-02-28 13:00:30 +02:00
013e394a4b IQ4_XS: a 4.25 bpw quantization (llama/5747)
* Try IQ4_NL with blocks of 64 - does not look good

* iq4_xs: go to super-blocks of 256 and 6-bit scales for blocks of 32

* iq4_xs: CUDA works - 133.2 t/s

* iq4_xs: AVX2 dot product

* iq4_xs: ARM_NEON dot product

* iq4_nl: Metal implementation

As usual, Metal / Apple Silicon don't like my quants.

* iq3_xs: minor fix

* iq4_xs: shrink by using IQ3_S for attn_k and attn_q

* iq4_xs: revert using IQ3_S for attn_k and attn_v

PPL vs size is good, but CPU performance suffers: on M2 Max
TG-128 drops to 21.7 t/s from 28.8, and on a Ryzen-7950X
to 14.5 t/s from 15.8 t/s. On CUDA we have 135 t/s when
using IQ3_S vs 133 t/s with pure IQ4_XS.

* Fix CI

* iq4_xs: Added forgotten check for 256 divisibility

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2024-02-28 13:00:29 +02:00
d83f371b5f cuda : replace remaining shfl_xor with calls to warp_reduce functions (llama/5744) 2024-02-28 13:00:29 +02:00
1c71816eab ggml-quants : fix avx2 iq1_s vec_dot when compiled with gcc (llama/5742) 2024-02-28 13:00:29 +02:00
7b1d8ea7e0 Adding IQ2_S and IQ2_M to complete coverage of the 2-3 bit quantization range (llama/5721)
* Adding IQ2_S and IQ2_M as a single cumulative commit

* Update examples/quantize/quantize.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-28 13:00:29 +02:00
b1f7223a0a CUDA: fix DEBUG_CUDA_MALLOC (llama/5729) 2024-02-28 13:00:29 +02:00
8408a4be8e Add support for soft_max ALiBi (llama/5639)
* Add support for bias

* Update pre-processor

* rm commented code

* fix format

* fix CI

---------

Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
2024-02-28 13:00:29 +02:00
72849c24ba ggml-quants : provide ggml_vqtbl1q_u8 for 64bit compatibility (llama/5711)
* [ggml-quants] Provide ggml_vqtbl1q_u8 for 64bit compatibility

vqtbl1q_u8 is not part of arm v7 neon library

* [android-example] Remove abi filter after arm v7a fix

* [github-workflows] Do not skip Android armeabi-v7a build
2024-02-28 13:00:28 +02:00
c19c28be71 add google magika inference example (ggml/748)
* add magika inference example

* ggml : fix unaligned accesses in custom ops

* ggml : fix FP32 GELU for values that exceed the FP16 range

* use ggml_pool_1d

* add README

* Update README.md

* pad inputs if the files are too small

* cleanup

ggml-ci
2024-02-28 13:00:28 +02:00
0d8fd8483a stream.wasm : fix invalid memory access when no segments (#1902)
No segments may be returned when a smaller sample buffer (e.g. 2048 samples) is sent to the worker (a defensive iteration sketch follows below).
2024-02-26 10:12:35 +02:00
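A defensive iteration sketch related to the fix above, using the standard whisper.h segment API; the point is simply that callers must tolerate `whisper_full_n_segments()` returning 0 for short buffers:

```cpp
#include <string>
#include "whisper.h"

static std::string collect_text(struct whisper_context * ctx) {
    std::string text;
    const int n_segments = whisper_full_n_segments(ctx); // may be 0 for small buffers
    for (int i = 0; i < n_segments; ++i) {
        text += whisper_full_get_segment_text(ctx, i);
    }
    return text; // empty when there are no segments; no out-of-bounds access
}
```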
3170841ed9 talk-llama : sync llama.cpp 2024-02-25 20:00:10 +02:00
7a6e385c1b sync : ggml 2024-02-25 19:59:34 +02:00
578e47e70c sync : llama.cpp (ggml/0) 2024-02-25 19:58:46 +02:00
fac5b43830 code : normalize enum names (llama/5697)
* coda : normalize enum names

ggml-ci

* code : cont

* code : cont
2024-02-25 19:58:46 +02:00
9e7c5212a1 IQ3_S: a much better alternative to Q3_K (llama/5676)
* iq4_nl: squash commits for easier rebase

* Basics (quantize, dequantize)
* CUDA dequantize and dot product
* Slightly faster CUDA dot product (120 t/s)
* Switch to 6-bit scales
* Scalar dot product
* AVX2 dot product
* ARM_NEON dot product
* Works on metal, but still slow
* Slightly better Metal dot product
* Another small Metal improvement
* Metal dot product is getting there
* Faster CUDA dot product
* Add 1/8 ffn_down layers as Q5_K when no imatrix has been provided
* Report the actual bpw
* Add _xs mix that is 4.05 bpw for non-MoE models
* Remove IQ4_XS for now, slightly adjust kvalues_iq4nl
* AVX2 dot product uses Q8_0 instead of Q8_K
* Add to test-backend-ops
* Minor fix
* Also use Q5_K for attn_output in MoE models
* Fixes after merging latest master
* Switching to blocks of 32
* AVX2 for blocks of 32
* Scaler dot product for blocks of 32
* ARM_NEON dot product for blocks of 32
* Metal kernels for blocks of 32
* Slightly faster Metal kernels

* Resurrecting iq3_xs

After all the experimentation, nothing was better than this.

* Minor PPL improvement via a block scale fudge factor

* Minor improvement via 3 neighbours

* iq3_xs: working scalar and AVX2 dot products

* iq3_xs: ARM_NEON dot product - works but extremely slow (10 t/s)

* iq3_xs: working Metal implementation

* Adding IQ3_M - IQ3_XS mix with mostly Q4_K

* iiq3_xs: a 3.4375 bpw variant

* iq3_xs: make CUDA work for new version

* iq3_xs: make scalar and AVX2 work for new version

* iq3_s: make ARM_NEON work with new version

* iq3_xs: make new version work on metal

Performance is very similar to Q3_K_S

* iq3_xs: tiny Metal speed improvement

* iq3_xs: tiny Metal speed improvement

* Fix stupid warning

* Q3_K_XS now uses a mix of IQ3_XS and IQ3_XXS

* iq3_xs: rename to iq3_s

* iq3_s: make tests pass

* Move Q3_K_XS mix to 3.25 bpw

* Attempt to fix failing tests

* Another attempt to fix the Windows builds

* Attempt to fix ROCm

* ROCm again

* iq3_s: partial fix for QK_K = 64

* iq3_s: make it work on metal for QK_K = 64

Pleasent surprise: the coding was super-block size independent,
so all it took was to delete some QK_K == 256 guards.

* Will this fix ROCm?

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2024-02-25 19:58:46 +02:00
1cb64f7368 Introduce backend GUIDs (ggml/743)
* Introduce backend GUIDs

Initial proposed implementation of backend GUIDs
(Discussed in https://github.com/ggerganov/ggml/pull/741)

Hardcoded CPU backend GUID (for now)
Change ggml_backend_is_cpu logic to use GUID

* Remove redundant functions

Remove redundant functions `ggml_backend_i::get_name` and `ggml_backend_guid` which are not desired for future expansion

* Add spaces to match style

Co-authored-by: slaren <slarengh@gmail.com>

* Fix brace style to match

Co-authored-by: slaren <slarengh@gmail.com>

* Add void to () in function signature

Co-authored-by: slaren <slarengh@gmail.com>

* Add back ggml_backend_guid and make CPU_GUID a local static in ggml_backend_cpu_guid

* add guids to all backends

ggml-ci

---------

Co-authored-by: slaren <slarengh@gmail.com>
2024-02-25 19:58:45 +02:00
f18738f247 talk, talk-llama : pass text_to_speak as a file (#1865)
* talk-llama: pass file instead of arg

it is too hard to quote text in a portable way

* talk-llama: pass heard_ok as a file

* talk-llama: let eleven-labs.py accept options

Options: -v voice, -s savefile, -p (--play)

* talk-llama: check installed commands in "speak"

Pass "-q" to eleven-labs.py to skip checking whether elevenlabs is installed

* talk-llama: pass voice_id again

in order to sync talk with talk-llama

* talk: sync with talk-llama

Passing text_to_speak as a file is safer and more portable
cf. https://stackoverflow.com/a/59036879/45375

* talk and talk-llama: get all installed voices in speak.ps1

* talk and talk-llama: get voices from api

* talk and talk-llama: add more options to eleven-labs.py

and remove DEFAULT_VOICE because it is deprecated (https://www.reddit.com/r/ElevenLabs/comments/1830abt/what_happened_to_bella/)

```
usage: eleven-labs.py [-q] [-l] [-h] [-n NAME | -v NUMBER] [-f KEY=VAL] [-s FILE | -p] [TEXTFILE]

options:
  -q, --quick           skip checking the required library

action:
  TEXTFILE              read the text file (default: stdin)
  -l, --list            show the list of voices and exit
  -h, --help            show this help and exit

voice selection:
  -n NAME, --name NAME  get a voice object by name (default: Arnold)
  -v NUMBER, --voice NUMBER
                        get a voice object by number (see --list)
  -f KEY=VAL, --filter KEY=VAL
                        filter voices by labels (default: "use case=narration")
                        this option can be used multiple times
                        filtering will be disabled if the first -f has no "=" (e.g. -f "any")

output:
  -s FILE, --save FILE  save the TTS to a file (default: audio.mp3)
  -p, --play            play the TTS with ffplay
```

* examples: add speak_with_file()

as suggested in the review

* talk and talk-llama: ignore to_speak.txt
2024-02-24 09:24:47 +02:00
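The overall pattern is easy to sketch (illustrative only, not the exact code added to the examples): write the text to a scratch file such as to_speak.txt and hand the speak script just the file name, so the text itself never has to be quoted for the shell.

```
#include <cstdlib>
#include <fstream>
#include <string>

// Sketch of the file-based hand-off: the TTS helper ("speak") receives a voice id
// and a path, never the raw text on the command line.
static void speak_with_file(const std::string & cmd, const std::string & text,
                            const std::string & path, int voice_id) {
    std::ofstream(path) << text;  // quoting problems disappear: the text never touches the shell
    const std::string full_cmd = cmd + " " + std::to_string(voice_id) + " " + path;
    std::system(full_cmd.c_str());  // e.g. runs: speak 2 to_speak.txt
}
```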
a0ddd8392c whisper : add SYCL support (#1863)
* add changes from llama upstream

* add sycl abstraction

* add sycl build

* update cmake

* add sycl build config

* fix bug

* fix bug

* refactor build

* fix bug

* update build

* call build

* use sycl header

* add examples

* add target

* fix typecast in quant.c

* readd fp16 and readme

* fix quant typecast

* add sample

* add readme

* remove cxx file check
2024-02-23 09:22:24 +02:00
a2506909b1 talk-llama : sync llama.cpp 2024-02-22 23:30:53 +02:00
7b1ff212d9 sync : ggml 2024-02-22 23:25:38 +02:00
e5d06cfc0f ggml : always define ggml_fp16_t as uint16_t (llama/5666)
* ggml : always define ggml_fp16_t as uint16_t

ggml-ci

* ggml : cont

ggml-ci

* ggml : cont

* ggml : cont

ggml-ci

* ggml : cont

ggml-ci

* cuda : no longer ggml headers last

ggml-ci

* ggml : fix q6_K FP16 -> FP32 conversion

ggml-ci

* ggml : more FP16 -> FP32 conversion fixes

ggml-ci
2024-02-22 23:25:33 +02:00
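The practical consequence of typedef-ing ggml_fp16_t to uint16_t is that every FP16 value must go through an explicit conversion helper; nothing converts implicitly anymore. A self-contained sketch of such a bit-level conversion (illustrative, not the ggml implementation, which also has lookup-table and hardware-intrinsic paths):

```
#include <stdint.h>
#include <string.h>

// Convert an IEEE-754 binary16 value, stored in a plain uint16_t, to float.
static float fp16_bits_to_fp32(uint16_t h) {
    const uint32_t sign = (uint32_t)(h & 0x8000) << 16;
    int      e    = (h >> 10) & 0x1F;
    uint32_t mant =  h        & 0x3FF;
    uint32_t bits;
    if (e == 0) {
        if (mant == 0) {
            bits = sign;                                      // signed zero
        } else {
            e = 1;                                            // subnormal: renormalize
            while ((mant & 0x400) == 0) { mant <<= 1; e--; }
            mant &= 0x3FF;
            bits = sign | ((uint32_t)(e + 112) << 23) | (mant << 13);
        }
    } else if (e == 31) {
        bits = sign | 0x7F800000 | (mant << 13);              // inf / NaN
    } else {
        bits = sign | ((uint32_t)(e + 112) << 23) | (mant << 13);  // normal: rebias 15 -> 127
    }
    float f;
    memcpy(&f, &bits, sizeof(f));
    return f;
}
```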
31891db2e3 ci : fix whitespace 2024-02-22 20:20:34 +02:00
5fdb27ff80 ggml : 32-bit arm compat (#1891)
* ggml : 32-bit arm compat

* ggml : add ggml_vqtbl1q_s8 impl

* ggml : cont
2024-02-22 18:31:40 +02:00
6b16927d18 sync : ggml 2024-02-22 15:15:38 +02:00
ce411498f6 sync : llama.cpp (ggml/0)
ggml-ci
2024-02-22 15:12:36 +02:00
208de95ac7 conext add name (llama/5624)
* [SYCL] conext add name

* name should start with SYCL*
2024-02-22 15:12:36 +02:00
c2ce39c795 Update ggml_sycl_op_mul_mat_vec_q (llama/5502)
* Update ggml_sycl_op_mul_mat_vec_q

* Apply suggestions from code review

Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>

* revert suggestion on macro

* fix bug

* Add quant type GGML_TYPE_IQ1_S to unsupported

* fix format

---------

Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
2024-02-22 15:12:36 +02:00
8daa534818 Refactor validation and enumeration platform checks into functions to clean up ggml_vk_instance_init() 2024-02-22 15:12:36 +02:00
9fca69b410 Add check for VK_KHR_portability_enumeration for MoltenVK support 2024-02-22 15:12:36 +02:00
b26c645420 Add preprocessor checks for Apple devices.
Based on work by @rbourgeat in https://github.com/ggerganov/llama.cpp/pull/5322/files
2024-02-22 15:12:36 +02:00
1879ec556e Resolve ErrorIncompatibleDriver with Vulkan on MacOS.
Refs:
- https://chat.openai.com/share/7020ce72-65fc-45ec-b7be-9d9d798a5f3f
- https://github.com/SaschaWillems/Vulkan/issues/954
- https://github.com/haasn/libplacebo/issues/128
- https://github.com/KhronosGroup/Vulkan-Samples/issues/476
2024-02-22 15:12:35 +02:00
c6e53cfc46 Allow for Vulkan build with Accelerate.
Closes #5304
2024-02-22 15:12:35 +02:00
b19f2fb815 cuda : ignore peer access already enabled errors (llama/5597)
* cuda : ignore peer access already enabled errors

* fix hip
2024-02-22 15:12:35 +02:00
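The underlying idea can be sketched like this (illustrative, not the actual ggml-cuda code): cudaDeviceEnablePeerAccess() returns cudaErrorPeerAccessAlreadyEnabled when access is already on, and that particular status is safe to treat as success.

```
#include <cuda_runtime.h>

// Sketch: enabling peer access a second time is not a real failure.
static cudaError_t enable_peer_access_once(int peer_device) {
    cudaError_t err = cudaDeviceEnablePeerAccess(peer_device, 0 /* flags, must be 0 */);
    if (err == cudaErrorPeerAccessAlreadyEnabled) {
        cudaGetLastError();   // clear the sticky error so later checks do not trip on it
        err = cudaSuccess;
    }
    return err;
}
```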
a6b0950916 ggml : compute forward no longer pass src tensors (ggml/729)
* refactored compute forward to not pass in the src tensors each time

* fix merge issues with flags

* missed one place in the last commit to fix the is_param / flags issue

* minor spacing fix

* fixed some variable assignments so all tests locally are passing

* new change after merge fix

---------

Co-authored-by: siddharthvader <siddharth@coinlist.co>
2024-02-22 15:12:35 +02:00
d352dbd163 ggml : fix conv_2d batch mode (ggml/737)
Co-authored-by: bssrdf <bssrdf@gmail.com>
2024-02-22 15:12:32 +02:00
eb23f4ef16 openvino : fix convert-whisper-to-openvino.py (#1890)
Fix issue: Conversion from Whisper to OpenVino failed #1870

convert-whisper-to-openvino.py stopped working with OpenVINO version 2023.0.0-10926-b4452d56304-releases/2023/0 .

Error was: TypeError: load(): incompatible function arguments. The following argument types are supported:
    1. (self: openvino._pyopenvino.FrontEnd, path: object) -> ov::frontend::InputModel

Tested successfully with a large-v3 conversion.

Co-authored-by: Stefan Grundmann <grundmanns@sandiego.gov>
2024-02-22 15:11:35 +02:00
c56344b509 main : fix file existence check in main.cpp (#1889)
In commit dda4b0e of PR #1872, I've introduced a check for the
existence of files before loading the model. However, I haven't
considered the case where whisper.cpp might read from stdin as well,
and in such cases, the checks should ignore the "-" argument as it
does not represent a regular file.

Additionally, this commit removes the usage of 'stat()' in favor of
the recently introduced function 'is_file_exist()' in common.cpp from
PR #1871.

Apologies for the bug introduced in the previous PR and any
inconvenience it may have caused.
2024-02-22 15:01:08 +02:00
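A compact sketch of the resulting logic (illustrative; the actual helper is is_file_exist() in examples/common.cpp): the "-" pseudo-filename means stdin, so only real file names get the existence test before the model is loaded.

```
#include <fstream>
#include <string>

// Sketch: "-" selects stdin, so it is exempt from the on-disk existence check.
static bool input_is_usable(const std::string & fname) {
    if (fname == "-") {
        return true;                      // read audio from stdin, nothing to check on disk
    }
    std::ifstream f(fname);
    return f.good();                      // same idea as is_file_exist() in common.cpp
}
```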
59119f4f20 talk-llama : sync llama.cpp 2024-02-20 12:09:57 +02:00
276615d708 make : fix CUBLAS link with WSL (#1878) 2024-02-20 12:05:38 +02:00
b602819b6e sync : ggml 2024-02-19 15:54:25 +02:00
c2c606f05b ggml : resolve merge conflicts (ggml/0)
ggml-ci
2024-02-19 15:53:25 +02:00
83afebe872 common : add IQ1_S (ggml/0)
ggml-ci
2024-02-19 15:53:25 +02:00
a4d8f9d559 ci : enable -Werror for CUDA builds (llama/5579)
* cmake : pass -Werror through -Xcompiler

ggml-ci

* make, cmake : enable CUDA errors on warnings

ggml-ci
2024-02-19 15:53:24 +02:00
5ec1e0edfa cuda, metal : fix nans in soft_max (llama/5574)
* cuda : fix nans in soft_max

* metal : fix nans in soft_max

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-19 15:53:24 +02:00
30a11b1ab8 ggml : android and old glibc NUMA incompatibility bugfixes (llama/5557)
* #ifdef out some code NUMA blocks for Android due to lack of support

* added in some __ANDROID__ if def gates around numa code and forced GLIBC prior to 2.29 to use a syscall for getcpu instead of the wrapper

* Changed gates on numa platform specific stuff to __gnu_linux__ to skip any platforms without glibc

* harmonizing #if defined blocks for numa code to __gnu_linux__ since that's the only model that's being followed anyways

---------

Co-authored-by: root <root@nenya.lothlorien.ca>
2024-02-19 15:53:24 +02:00
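The glibc part of the fix comes down to bypassing the getcpu() wrapper, which only appeared in glibc 2.29, and issuing the raw syscall instead; a Linux-only sketch (helper name is illustrative):

```
#define _GNU_SOURCE
#include <stddef.h>
#include <unistd.h>
#include <sys/syscall.h>

// Sketch: query the CPU the calling thread currently runs on without relying
// on the getcpu() wrapper (absent before glibc 2.29).
static int current_cpu(void) {
#if defined(__gnu_linux__)
    unsigned int cpu = 0, node = 0;
    if (syscall(SYS_getcpu, &cpu, &node, NULL) == 0) {
        return (int) cpu;
    }
#endif
    return -1;  // unknown / unsupported platform
}
```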
f04e6b87d7 ggml : restore vec dot stride arg names (llama/5453) 2024-02-19 15:53:24 +02:00
0c33928b55 ci : fix wikitext url + compile warnings (llama/5569)
ggml-ci
2024-02-19 15:53:24 +02:00
0775374750 metal : fix unused warnings (llama/0) 2024-02-19 15:53:24 +02:00
7d90bb035b ggml, common, examples, tests : fixed type arguments in printf (llama/5528) 2024-02-19 15:53:24 +02:00
2c1ad21ba8 1.5 bit quantization (llama/5453)
* iq1_s: WIP basics

* iq1_s: CUDA is working

* iq1_s: scalar CPU dot product

* iq1_s: WIP AVX2 dot product - something is not right

* Fix tests

* Fix shadow warnings

* Fix after merge with latest master

* iq1_s: AVX2 finally works

* iq1_s: ARM_NEON dot product. Works, but not very fast

* iq1_s: better grid

* iq1_s: use IQ2_XXS for attn_output

At a cost of 0.04 extra bpw this gives a big improvement in PPL.

* iq1_s: Metal basics

Dequantize works, but not dot product

* iq1_s: Metal works, but quite slow

As usual, Apple Silicon does not like the code I write.

* iq1_s: Tests

* iq1_s: slightly faster dot product

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2024-02-19 15:53:23 +02:00
eca5ff9868 ggml : add ALiBi support for ggml_soft_max_ext (llama/5488) 2024-02-19 15:53:23 +02:00
1b25d2fa0a ci : add an option to fail on compile warning (llama/3952)
* feat(ci): add an option to fail on compile warning

* Update CMakeLists.txt

* minor : fix compile warnings

ggml-ci

* ggml : fix unreachable code warnings

ggml-ci

* ci : disable fatal warnings for windows, ios and tvos

* ggml : fix strncpy warning

* ci : disable fatal warnings for MPI build

* ci : add fatal warnings to ggml-ci

ggml-ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-19 15:53:23 +02:00
74a6acc999 cmake : fix VULKAN and ROCm builds (llama/5525)
* cmake : fix VULKAN and ROCm builds

* cmake : fix (cont)

* vulkan : fix compile warnings

ggml-ci

* cmake : fix

ggml-ci

* cmake : minor

ggml-ci
2024-02-19 15:53:23 +02:00
a4ed8a0821 ggml : add numa options (llama/5377)
* Added numa options to allow finer grained control as well as plumbing for a new mirror mode that will require numa.h

* Reverted Makefile

* Fixed include

* Removed sched.h from ggml.h, moved ggml_get_numa_affinity into ggml.c, removed trailing whitespace and fixed up a few inconsistent variables

* removed trailing whitespace

* Added numa options to allow finer grained control as well as plumbing for a new mirror mode that will require numa.h

* Reverting Makefile

* Fixed a number of issues with the move from BOOL to ggml_numa_strategies. Added a note about mirror mode note being implemented yet

* Removing MIRROR_MODE code for this PR

* Removing last bit of MIRROR_MODE code for this PR

* Removing unneeded branch in server.cpp example and moving get_numa_affinity and making it static

* Fixed lingering init_llama_backend() bool calls in tests and examples

* Remote enum llama_numa_strategies

* Revert bad merge with dynatemp flags

* add missing enum ggml_numa_strategies declaration and revert sync problem with master

* add missing enum ggml_numa_strategies declaration

* fixed ggml_init_numa variable

* Update ggml.h

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

* Update READMEs with info about numa flags, change INTERLEAVE strategy name to DISTRIBUTE everywhere, implement the improved distribution strategy from @rankaiyx, fix a spelling mistake and un-merge some bad merges

* split numa init out from llama_backend_init and created llama_numa_init. Updated all code paths and samples

* Fix up some boolean vs enum comparisons

* Added #ifdefs for non-Linux OS that don't have cpu_set_t datatype

* Update ggml.h

Align enum values

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml.c

Remove whitespace

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml.c

align paremeters

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update examples/server/server.cpp

remove whitespace and align brace

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update common/common.cpp

Remove whitespace and align brace

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* unified ggml_numa_strategy enum and fixed text alignment in server.cpp example

* Update ggml.c

simplified return for platforms without NUMA support

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

* removed redundant else from cli argument processing of --numa

* whitespace

---------

Co-authored-by: root <root@nenya.lothlorien.ca>
Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-02-19 15:53:23 +02:00
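For code that links against llama.cpp (such as talk-llama), the visible effect of this change is a two-step startup; a usage sketch assuming the post-split API (llama_backend_init() with no arguments, plus llama_numa_init() taking a ggml_numa_strategy):

```
#include "llama.h"

int main() {
    llama_backend_init();                            // no longer takes a NUMA flag
    llama_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE);  // e.g. what --numa distribute selects
    // ... load the model and run inference ...
    llama_backend_free();
    return 0;
}
```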
9f675e021c cuda : print message when initialization fails (llama/5512)
* cuda : print message when initialization fails

* use CUDA_NAME both times
2024-02-19 15:53:23 +02:00
a38efcb9fd vulkan: Find optimal memory type but with fallback (llama/5381)
* @0cc4m feedback

* More feedback @0cc4m
2024-02-19 15:53:22 +02:00
31591649a0 Early return for zero size calls to get_tensor. (llama/5482)
* Early return for zero size calls to get_tensor.

Signed-off-by: Adam Treat <treat.adam@gmail.com>

* Update ggml-kompute.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml-kompute.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Add an early return to the get/set tensor when the size is null.

Signed-off-by: Adam Treat <treat.adam@gmail.com>

* Early return after the assertions.

Signed-off-by: Adam Treat <treat.adam@gmail.com>

* Since we do the early return in the generic backend now no reason to do so here as well.

Signed-off-by: Adam Treat <treat.adam@gmail.com>

---------

Signed-off-by: Adam Treat <treat.adam@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-02-19 15:53:22 +02:00
4f5c46a84f ggml-quants : fix compiler warnings (shadow variable) (llama/5472)
Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2024-02-19 15:53:22 +02:00
462ffc58db ggml-sycl: Replace 3d ops with macro (llama/5458)
* use macro

* use macro

* fix format
2024-02-19 15:53:21 +02:00
65faae0b6a build : update CBLAS flags + fix unused var warning (#0) 2024-02-19 14:44:46 +02:00
dda4b0ed06 main : check if input files exist before proceeding (#1872)
Until the most recent commit (3d42463), the main.cpp sample file does
not check whether the input files exist or not. Consequently, the
model is loaded first before reporting whether there was a failure or
not when processing a file. In environments with HDD, this can take
about 50 seconds or more, depending on the loaded model.

This commit addresses this issue by checking in advance whether the
input files exist or not.
2024-02-19 10:51:26 +02:00
07d04280be examples : clean up common code (#1871)
move some utility functions into common.h
2024-02-19 10:50:15 +02:00
917c56ded4 models : fix openvino setup info (#1874) 2024-02-19 02:19:47 +00:00
3d42463845 models : add update py requirements 2024-02-13 11:51:32 +02:00
3ffc83d90a swift : package no longer use ggml dependency (#1861)
* Revert "swift : update Package.swift to use ggml as package dependency (#1701)"

This reverts commit 993acb5d41.

* spm : add ggml.h
2024-02-12 19:54:11 +02:00
e3c5e2cba8 whisper : fix external encoder (#1860) 2024-02-12 19:53:51 +02:00
b742f13e70 sync : ggml 2024-02-12 19:07:56 +02:00
52c529eeb1 ggml-alloc : allocate all leafs as if they were inputs (ggml/731)
* ggml-alloc : allocate all leafs as if they were inputs

* ensure static leafs are allocated

* gpt-2-backend : remove unnecesary ggml_new_tensor

* update other gpt-2 examples to remove ggml_new_tensor calls in the graph
2024-02-12 19:07:38 +02:00
468 changed files with 259016 additions and 84415 deletions


@@ -21,7 +21,7 @@ COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
-ENV WHISPER_CUBLAS=1
+ENV GGML_CUDA=1
RUN make


@@ -14,7 +14,7 @@ ARG CUDA_DOCKER_ARCH=all
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
-ENV WHISPER_CUBLAS=1
+ENV GGML_CUDA=1
RUN apt-get update && \
    apt-get install -y build-essential \
@@ -28,6 +28,8 @@ COPY .. .
RUN make
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
+ENV CUDA_MAIN_VERSION=12.3
+ENV LD_LIBRARY_PATH /usr/local/cuda-${CUDA_MAIN_VERSION}/compat:$LD_LIBRARY_PATH
WORKDIR /app
RUN apt-get update && \


@@ -15,10 +15,10 @@ jobs:
steps:
- name: Clone
-uses: actions/checkout@v3
+uses: actions/checkout@v4
- name: Set up QEMU
-uses: docker/setup-qemu-action@v2
+uses: docker/setup-qemu-action@v3
- name: Build ${{ matrix.arch }}
run: |
@@ -36,7 +36,7 @@ jobs:
steps:
- name: Clone
-uses: actions/checkout@v3
+uses: actions/checkout@v4
- name: Dependencies
run: |
@@ -53,10 +53,10 @@ jobs:
steps:
- name: Clone
-uses: actions/checkout@v3
+uses: actions/checkout@v4
- name: Build
-uses: cross-platform-actions/action@v0.15.0
+uses: cross-platform-actions/action@v0.24.0
with:
operating_system: freebsd
version: '13.2'
@@ -77,10 +77,10 @@ jobs:
steps:
- name: Clone
-uses: actions/checkout@v3
+uses: actions/checkout@v4
- name: Set up QEMU
-uses: docker/setup-qemu-action@v2
+uses: docker/setup-qemu-action@v3
- name: Build ${{ matrix.arch }}
run: |
@@ -101,14 +101,17 @@ jobs:
fail-fast: false
matrix:
build: [Debug, Release]
-arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
+#arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
+# TODO: arm/v7 disabled due to clang bug
+# https://github.com/ggerganov/whisper.cpp/actions/runs/9657764109/job/26637633042?pr=2256#step:4:1990
+arch: [linux/amd64, linux/arm64, linux/ppc64le]
steps:
- name: Clone
-uses: actions/checkout@v3
+uses: actions/checkout@v4
- name: Set up QEMU
-uses: docker/setup-qemu-action@v2
+uses: docker/setup-qemu-action@v3
- name: Build ${{ matrix.arch }}
run: |
@@ -133,10 +136,10 @@ jobs:
steps:
- name: Clone
-uses: actions/checkout@v3
+uses: actions/checkout@v4
- name: Set up QEMU
-uses: docker/setup-qemu-action@v2
+uses: docker/setup-qemu-action@v3
- name: Build ${{ matrix.arch }}
run: |
@@ -150,6 +153,164 @@ jobs:
make
ctest -L gh --output-on-failure'
ubuntu-22-cmake-sycl:
runs-on: ubuntu-22.04
strategy:
fail-fast: false
matrix:
dwhisper_sycl: [ON]
dcmake_c_compiler: [icx]
dcmake_cxx_compiler: [icpx]
arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
continue-on-error: true
steps:
- name: Clone
uses: actions/checkout@v4
- name: add oneAPI to apt
shell: bash
run: |
cd /tmp
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
- name: install oneAPI dpcpp compiler
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp
- name: install oneAPI MKL library
shell: bash
run: |
sudo apt install intel-oneapi-mkl-devel
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
mkdir build
cd build
cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
cmake --build . --config Release -j $(nproc)
ubuntu-22-cmake-sycl-fp16:
runs-on: ubuntu-22.04
strategy:
fail-fast: false
matrix:
dwhisper_sycl: [ON]
dcmake_c_compiler: [icx]
dcmake_cxx_compiler: [icpx]
arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
continue-on-error: true
steps:
- name: Clone
uses: actions/checkout@v4
- name: add oneAPI to apt
shell: bash
run: |
cd /tmp
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
- name: install oneAPI dpcpp compiler
shell: bash
run: |
sudo apt update
sudo apt install intel-oneapi-compiler-dpcpp-cpp
- name: install oneAPI MKL library
shell: bash
run: |
sudo apt install intel-oneapi-mkl-devel
- name: Clone
id: checkout
uses: actions/checkout@v4
- name: Build
id: cmake_build
run: |
source /opt/intel/oneapi/setvars.sh
mkdir build
cd build
cmake -DGGML_SYCL_F16=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
cmake --build . --config Release -j $(nproc)
windows-msys2:
runs-on: windows-latest
strategy:
fail-fast: false
matrix:
include:
- { sys: UCRT64, env: ucrt-x86_64, build: Release }
- { sys: CLANG64, env: clang-x86_64, build: Release }
steps:
- name: Clone
uses: actions/checkout@v4
- name: Setup ${{ matrix.sys }}
uses: msys2/setup-msys2@v2
with:
update: true
msystem: ${{matrix.sys}}
install: >-
base-devel
mingw-w64-${{matrix.env}}-toolchain
mingw-w64-${{matrix.env}}-cmake
mingw-w64-${{matrix.env}}-SDL2
mingw-w64-${{matrix.env}}-openblas
- name: Build using make
shell: msys2 {0}
run: |
make -j $(nproc)
- name: Clean after building using make
shell: msys2 {0}
run: |
make clean
- name: Build using make w/ OpenBLAS
shell: msys2 {0}
run: |
make GGML_OPENBLAS=1 -j $(nproc)
- name: Build using CMake
shell: msys2 {0}
run: |
cmake -B build
cmake --build build --config ${{ matrix.build }} -j $(nproc)
- name: Clean after building using CMake
shell: msys2 {0}
run: |
rm -rf build
- name: Build using CMake w/ OpenBLAS
shell: msys2 {0}
run: |
cmake -B build -DGGML_OPENBLAS=ON
cmake --build build --config ${{ matrix.build }} -j $(nproc)
windows:
runs-on: windows-latest
@@ -170,10 +331,10 @@ jobs:
steps:
- name: Clone
-uses: actions/checkout@v3
+uses: actions/checkout@v4
- name: Add msbuild to PATH
-uses: microsoft/setup-msbuild@v1
+uses: microsoft/setup-msbuild@v2
- name: Fetch SDL2 and set SDL2_DIR
if: matrix.sdl2 == 'ON'
@@ -198,14 +359,14 @@ jobs:
run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
- name: Upload dll
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
with:
name: ${{ matrix.jnaPath }}_whisper.dll
path: build/bin/${{ matrix.build }}/whisper.dll
- name: Upload binaries
if: matrix.sdl2 == 'ON'
-uses: actions/upload-artifact@v1
+uses: actions/upload-artifact@v4
with:
name: whisper-bin-${{ matrix.arch }}
path: build/bin/${{ matrix.build }}
@@ -223,21 +384,18 @@ jobs:
- arch: Win32
obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x86.zip
s2arc: x86
-clblast: OFF
- arch: x64
obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x64.zip
s2arc: x64
-clblast: ON
-clver: 1.6.1
- sdl2: ON
s2ver: 2.28.5
steps:
- name: Clone
-uses: actions/checkout@v3
+uses: actions/checkout@v4
- name: Add msbuild to PATH
-uses: microsoft/setup-msbuild@v1
+uses: microsoft/setup-msbuild@v2
- name: Fetch OpenBLAS
if: matrix.blas == 'ON'
@@ -255,26 +413,13 @@ jobs:
7z x sdl2.zip
echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
-- name: Install OpenCL
-if: matrix.clblast == 'ON'
-run: vcpkg.exe --triplet=${{ matrix.arch }}-windows install opencl
-- name: Fetch CLBlast and set CLBlast_DIR
-if: matrix.clblast == 'ON'
-run: |
-C:/msys64/usr/bin/wget.exe -qO clblast.zip https://github.com/CNugteren/CLBlast/releases/download/${{ matrix.clver }}/CLBlast-${{ matrix.clver }}-windows-x64.zip
-7z x clblast.zip
-7z x CLBlast-${{ matrix.clver }}-windows-x64.7z
-echo "CLBlast_DIR=$env:GITHUB_WORKSPACE/CLBlast-${{ matrix.clver }}-windows-x64/lib/cmake/CLBlast" >> $env:GITHUB_ENV
- name: Configure
run: >
cmake -S . -B ./build -A ${{ matrix.arch }}
-DCMAKE_BUILD_TYPE=${{ matrix.build }}
--DWHISPER_OPENBLAS=${{ matrix.blas }}
+-DGGML_OPENBLAS=${{ matrix.blas }}
-DCMAKE_LIBRARY_PATH="$env:OPENBLAS_PATH/lib"
-DWHISPER_SDL2=${{ matrix.sdl2 }}
--DWHISPER_CLBLAST=${{ matrix.clblast }}
- name: Build
run: |
@@ -289,19 +434,15 @@ jobs:
if: matrix.sdl2 == 'ON'
run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
-- name: Copy clblast.dll
-if: matrix.clblast == 'ON'
-run: copy "$env:CLBlast_DIR/../../clblast.dll" build/bin/${{ matrix.build }}
- name: Upload binaries
if: matrix.blas == 'ON' && matrix.sdl2 == 'ON'
-uses: actions/upload-artifact@v1
+uses: actions/upload-artifact@v4
with:
-name: whisper-blas${{ matrix.clblast == 'ON' && '-clblast' || ''}}-bin-${{ matrix.arch }}
+name: whisper-blas-bin-${{ matrix.arch }}
path: build/bin/${{ matrix.build }}
windows-cublas:
-runs-on: windows-latest
+runs-on: windows-2019
strategy:
matrix:
@@ -318,14 +459,14 @@ jobs:
steps:
- name: Clone
-uses: actions/checkout@v3
+uses: actions/checkout@v4
- name: Add msbuild to PATH
-uses: microsoft/setup-msbuild@v1
+uses: microsoft/setup-msbuild@v2
- name: Install CUDA Toolkit
id: cuda-toolkit
-uses: Jimver/cuda-toolkit@v0.2.11
+uses: Jimver/cuda-toolkit@v0.2.15
with:
cuda: '${{ matrix.cuda-toolkit }}'
@@ -340,7 +481,7 @@ jobs:
run: >
cmake -S . -B ./build -A ${{ matrix.arch }}
-DCMAKE_BUILD_TYPE=${{ matrix.build }}
--DWHISPER_CUBLAS=${{ matrix.cublas }}
+-DGGML_CUDA=${{ matrix.cublas }}
-DWHISPER_SDL2=${{ matrix.sdl2 }}
- name: Build ${{ matrix.cuda-toolkit }}
@@ -361,7 +502,7 @@ jobs:
- name: Upload binaries
if: matrix.sdl2 == 'ON'
-uses: actions/upload-artifact@v1
+uses: actions/upload-artifact@v4
with:
name: whisper-cublas-${{ matrix.cuda-toolkit }}-bin-${{ matrix.arch }}
path: build/bin/${{ matrix.build }}
@@ -375,10 +516,10 @@ jobs:
steps:
- name: Clone
-uses: actions/checkout@v3
+uses: actions/checkout@v4
- name: Setup emsdk
-uses: mymindstorm/setup-emsdk@v12
+uses: mymindstorm/setup-emsdk@v14
- name: Verify
run: emcc -v
@@ -397,7 +538,7 @@ jobs:
steps:
- name: Clone
-uses: actions/checkout@v3
+uses: actions/checkout@v4
- name: Configure
run: |
@@ -415,24 +556,24 @@ jobs:
steps:
- name: Clone
-uses: actions/checkout@v3
+uses: actions/checkout@v4
with:
path: whisper
- name: Clone
-uses: actions/checkout@v3
+uses: actions/checkout@v4
with:
repository: ggerganov/ggml
path: ggml
- name: Install Java
-uses: actions/setup-java@v3
+uses: actions/setup-java@v4
with:
distribution: zulu
-java-version: 17
+java-version: 21
- name: Setup Android SDK
-uses: android-actions/setup-android@v2
+uses: android-actions/setup-android@v3
- name: Build
run: |
@@ -450,40 +591,40 @@ jobs:
steps:
- name: Clone
-uses: actions/checkout@v3
+uses: actions/checkout@v4
- name: set up JDK 11
-uses: actions/setup-java@v3
+uses: actions/setup-java@v4
with:
java-version: '11'
distribution: 'temurin'
cache: gradle
- name: Setup Android SDK
-uses: android-actions/setup-android@v2
+uses: android-actions/setup-android@v3
with:
-api-level: 30
-build-tools-version: 30.0.3
+cmdline-tools-version: 9.0
- name: Build
run: |
cd examples/whisper.android.java
chmod +x ./gradlew
./gradlew assembleRelease
java:
needs: [ 'windows' ]
runs-on: windows-latest
steps:
-- uses: actions/checkout@v3
+- uses: actions/checkout@v4
- name: Install Java
-uses: actions/setup-java@v1
+uses: actions/setup-java@v4
with:
-java-version: 17
+distribution: zulu
+java-version: 20
- name: Download Windows lib
-uses: actions/download-artifact@v3
+uses: actions/download-artifact@v4
with:
name: win32-x86-64_whisper.dll
path: bindings/java/build/generated/resources/main/win32-x86-64
@@ -496,7 +637,7 @@ jobs:
./gradlew build
- name: Upload jar
-uses: actions/upload-artifact@v3
+uses: actions/upload-artifact@v4
with:
name: whispercpp.jar
path: bindings/java/build/libs/whispercpp-*.jar
@@ -518,7 +659,7 @@ jobs:
steps:
- name: Clone
-uses: actions/checkout@v3
+uses: actions/checkout@v4
- name: Test quantize
run: |


@@ -37,7 +37,7 @@ jobs:
run: npm install
- name: Compile addon.node
-run: npx cmake-js compile -T whisper-addon -B Release
+run: npx cmake-js compile -T addon.node -B Release
- name: Download test model
run: |

.gitignore (vendored), 15 changed lines

@@ -6,18 +6,11 @@
.vs/
.vscode/
.DS_Store
+.vimspector.json
+/CMakeSettings.json
build/
-build-coreml/
-build-em/
-build-debug/
-build-release/
-build-rwdi/
-build-static/
-build-cublas/
-build-no-accel/
-build-sanitize-addr/
-build-sanitize-thread/
+build-*/
# SPM
.build/
@@ -58,4 +51,4 @@ benchmark_results.csv
cmake-build-debug/
.cxx/
.gradle/
local.properties

.gitmodules (vendored), 3 changed lines

@@ -1,3 +0,0 @@
-[submodule "bindings/ios"]
-    path = bindings/ios
-    url = https://github.com/ggerganov/whisper.spm

AUTHORS (new file, 301 lines)

@ -0,0 +1,301 @@
# date: Tue Apr 9 20:27:03 EEST 2024
# this file is auto-generated by scripts/gen-authors.sh
0/0 <zero@imaskeleton.me>
0cc4m <picard12@live.de>
0xsourcecode <134374803+0xsourcecode@users.noreply.github.com>
AT <manyoso@users.noreply.github.com>
Aarni Koskela <akx@iki.fi>
Aaron Pham <29749331+aarnphm@users.noreply.github.com>
Aaron Taylor <aaron@exphat.com>
Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Abitofevrything <54505189+abitofevrything@users.noreply.github.com>
AfryMask <AfryMask@163.com>
Ahmad Bilal <ahmad.bilal@empglabs.com>
AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
Akash Mahajan <akash7190@gmail.com>
Akash Mahajan <akashmjn@stanford.edu>
Al Hoang <3811822-hoanga@users.noreply.gitlab.com>
Alan <unknown>
Aleksander Andrzejewski <18704749+aleksanderandrzejewski@users.noreply.github.com>
Alex Azarov <alex@azarov.by>
Alex Bacart <13940752+alex-bacart@users.noreply.github.com>
Alex Evgrashin <aevgrashin@yandex.ru>
Alexandr Graschenkov <alexandr.graschenkov91@gmail.com>
Alexandru Mariuti <alex@mariuti.com>
Alexey Kharlamov <alexey@kharlamov.biz>
Alfredo Montesinos <alfredo.montesinos@g.austincc.edu>
Ali Alameh <ali.alameh@isae.edu.lb>
Ananta Bastola <anantarajbastola@gmail.com>
Andreu Huguet <andreuhuguet@gmail.com>
Andrew Huynh <a5thuynh@gmail.com>
Andrew S <andrews54757@gmail.com>
Andy Maloney <asmaloney@gmail.com>
Anton Kostin <masguit42@users.noreply.github.com>
Artyom Mezin <psycho.fading@gmail.com>
Asad Memon <asad.lionpk@gmail.com>
Ashraful Islam <ashraful.meche@gmail.com>
AsukaMinato <asukaminato@nyan.eu.org>
AustinMroz <austinmroz@utexas.edu>
Avik Sengupta <avik@sengupta.net>
Bader-eddine Ouaich <49657842+baderouaich@users.noreply.github.com>
Baffin Lee <baffinlee@gmail.com>
Ben Nortier <bjnortier@gmail.com>
Benjamin Heiniger <benjamin.heiniger@bluewin.ch>
Bo-Yi Wu <appleboy.tw@gmail.com>
Boris Bliznioukov <blib@mail.com>
Borislav Stanimirov <b.stanimirov@abv.bg>
Brad Murray <59848399+bradmurray-dt@users.noreply.github.com>
Brian Murray <brian@bmurray.ca>
CRD716 <crd716@gmail.com>
Canis Lupus <Canis-UK@users.noreply.github.com>
Carolinabanana <140120812+Carolinabanana@users.noreply.github.com>
ChangSeok Oh <shivamidow@users.noreply.github.com>
Chaoqun <27287694+OpenWaygate@users.noreply.github.com>
Chia-Hsiang Cheng <88014292+garychia@users.noreply.github.com>
Chidi Williams <williamschidi1@gmail.com>
Christian <12550267+iceychris@users.noreply.github.com>
Clifford Heath <clifford.heath@gmail.com>
Colin <github@whoisc.cc>
DGdev91 <DGdev91@users.noreply.github.com>
Damian Czaja <trojan295@protonmail.com>
Daniel Bevenius <daniel.bevenius@gmail.com>
David <dnhkng@gmail.com>
David Thorpe <djt@mutablelogic.com>
Davidson Francis <davidsondfgl@gmail.com>
Dener Stassun <denerstassun@gmail.com>
Didzis Gosko <didzis@users.noreply.github.com>
Digipom <admin@digipom.com>
Dimo <dimo@ieee.org>
Dody Suria Wijaya <dodysw@gmail.com>
Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
Duncan McConnell <ddmcconnell4@gmail.com>
Egor Egorov <me@egorfine.com>
Elkana Bardugo <ttv200@gmail.com>
Emmanuel Schmidbauer <eschmidbauer@gmail.com>
Engininja2 <139037756+Engininja2@users.noreply.github.com>
Eric Swanson <eswanson@alloscomp.com>
Eric Tendian <erictendian@gmail.com>
Erik Scholz <Green-Sky@users.noreply.github.com>
Evan Jones <evan.q.jones@gmail.com>
Evan Martin <evan.martin@gmail.com>
Eve <139727413+netrunnereve@users.noreply.github.com>
Evgeny Kuznetsov <evgeny@kuznetsov.md>
F1L1P <78918286+F1L1Pv2@users.noreply.github.com>
Fangjun Kuang <csukuangfj@gmail.com>
Felix <stenbackfelix@gmail.com>
Finn Voorhees <finnvoorhees@gmail.com>
FlippFuzz <41221030+FlippFuzz@users.noreply.github.com>
Gang Chen <goncha@gmail.com>
Gavin Cai <gavin1818@hotmail.com>
George Hindle <george@georgehindle.com>
Georgi Gerganov <ggerganov@gmail.com>
GitAritron <103900385+GitAritron@users.noreply.github.com>
GiviMAD <GiviMAD@users.noreply.github.com>
Gleicon Moraes <gleicon@gmail.com>
Gregor Jasny <gjasny@googlemail.com>
Guillaume Wenzek <gwenzek@users.noreply.github.com>
HY. Kelvin Lee <34256578+hykelvinlee42@users.noreply.github.com>
Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
Hang <bebound@gmail.com>
Herman Semenov <GermanAizek@yandex.ru>
Hrishikesh Barman <geekodour@users.noreply.github.com>
Ian Bicking <ian@ianbicking.org>
Ian Bull <irbull@eclipsesource.com>
Ikko Ashimine <eltociear@gmail.com>
InconsolableCellist <23345188+InconsolableCellist@users.noreply.github.com>
Ismatulla Mansurov <47342870+sapoepsilon@users.noreply.github.com>
Ivan Gorin <ivangorin21@gmail.com>
JJ <103335846+computerscienceiscool@users.noreply.github.com>
Jack Mousseau <jmousseau@users.noreply.github.com>
JacobLinCool <jacoblincool@gmail.com>
Jakub Ráček <blizzcz@gmail.com>
Jared Van Bortel <jared@nomic.ai>
Jay Binks <jaybinks@gmail.com>
Jhen-Jie Hong <developer@jhen.me>
Jhen-Jie Hong <iainst0409@gmail.com>
JidongZhang-THU <1119708529@qq.com>
Jo Liss <joliss42@gmail.com>
Johan <jr.raffin@gmail.com>
Johannes Gäßler <johannesg@5d6.de>
John Balis <phobossystems@gmail.com>
Jonathan Soo <jcsoo@agora.com>
Jonno <1160532+razodactyl@users.noreply.github.com>
Joonas Pihlajamaa <joonas.pihlajamaa@iki.fi>
Jose <34888496+Jerry-Master@users.noreply.github.com>
Josh Bleecher Snyder <josharian@gmail.com>
Judd <foldl@users.noreply.github.com>
Jumper775 <78500318+jumpers775@users.noreply.github.com>
Justine Tunney <jtunney@gmail.com>
KP Kaiser <kirk@zothcorp.com>
Kamilake <exjang0@gmail.com>
Kartik Saranathan <278928+Kartiku@users.noreply.github.com>
Kasumi <90275229+kasumi-1@users.noreply.github.com>
Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Kevin Brothaler <admin@digipom.com>
Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
Kreijstal <rainb@tfwno.gf>
Kylin <56434533+KyL0N@users.noreply.github.com>
LBlue <153975653+lbluep@users.noreply.github.com>
Larry Battle <larry.battle.tech@gmail.com>
Laytan Laats <laytanlaats@hotmail.com>
Leo Moll <leo.moll@yeasoft.com>
Lexevolution <31176843+Lexevolution@users.noreply.github.com>
LittleLoli <26589867+WhichWho@users.noreply.github.com>
Lucas Zanek <57494138+LucasZNK@users.noreply.github.com>
Luis Herrera <herrera-luis@users.noreply.github.com>
Lukas Rist <glaslos@gmail.com>
M. A. Ali <73258591+MightyStud@users.noreply.github.com>
M. Eren Akbiyik <erenakbiyik@gmail.com>
Maciek <maciek.mab122@gmail.com>
Marcin Mielniczuk <marmistrz.dev@zoho.eu>
Martin Warnaar <martinwarnaar@gmail.com>
Matheus de Sousa <23645013+keyehzy@users.noreply.github.com>
Mathijs de Bruin <mathijs@mathijsfietst.nl>
Matija Pevec <mightymatth@users.noreply.github.com>
Maximiliano Levi <8160966+maxilevi@users.noreply.github.com>
Meng, Hengyu <hengyu.meng@intel.com>
Michael Podvitskiy <podvitskiymichael@gmail.com>
Michael Rienstra <mrienstra@gmail.com>
Mikhail Grigorev <sleuthhound@gmail.com>
Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
Mohit Agarwal <mohit@sdf.org>
Murilo Santana <mvrilo@gmail.com>
Neil Chudleigh <nchudleigh@users.noreply.github.com>
Neo Zhang Jianyu <jianyu.zhang@intel.com>
Neuman Vong <neuman.vong@gmail.com>
Nicholas Albion <nalbion@yahoo.com>
Niels Mayer <Niels.Mayer@gmail.com>
Okabintaro <103938900+Okabintaro@users.noreply.github.com>
Oleg Sidorov <me@whitebox.io>
Oleg Sidorov <oleg@sidorov.nl>
Ondrej Kokes <ondrej.kokes@gmail.com>
Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
Paul Tsochantaris <ptsochantaris@icloud.com>
Philipp Zabel <philipp.zabel@gmail.com>
Philippe Normand <phil@base-art.net>
Przemysław Pawełczyk <przemoc@gmail.com>
Qianhe Chen <54462604+chenqianhe@users.noreply.github.com>
Radosław Gryta <radek.gryta@gmail.com>
Reinforce-II <fate@eastal.com>
Reinis Muiznieks <muiznieks.reinis@gmail.com>
RelatedTitle <r3latedtitle@gmail.com>
RhinoDevel <RhinoDevel@users.noreply.github.com>
Rich Jones <miserlou@gmail.com>
Robin <robin.xw@hotmail.com>
Roddur Dasgupta <roddurd@gmail.com>
Roland Rabien <figbug@gmail.com>
Rotem Dan <rotemdan@gmail.com>
Ryan Hitchman <hitchmanr@gmail.com>
Ryan Metcalfe <107415876+RyanMetcalfeInt8@users.noreply.github.com>
RyanChang <ftes90015@gmail.com>
Sam <49637763+Onlyartist9@users.noreply.github.com>
Sam Pullara <spullara@gmail.com>
Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
Sergio López <slp@sinrega.org>
Siddharth Ramakrishnan <srr2141@columbia.edu>
Simon Moisselin <simon.moisstoll@gmail.com>
Sindre Sorhus <sindresorhus@gmail.com>
Slava Primenko <primenko.s@gmail.com>
Syahmi Azhar <prsyahmi@gmail.com>
Syed Jafri <syedjafri97@gmail.com>
Sơn Phan Trung <phantrungson17@gmail.com>
Taisei Mima <bhbstar.me@gmail.com>
Takeshi Inoue <inoue.takeshi@gmail.com>
Tamotsu Takahashi <ttakah+github@gmail.com>
Taras Glek <taras@thegp.com>
Tauseef Mohiuddin <35351464+tauseefmohammed2@users.noreply.github.com>
Thijs Raymakers <thijs@raymakers.nl>
Thomas Fitzsimmons <fitzsim@fitzsim.org>
Tiago Fassoni <tiagofassoni@users.noreply.github.com>
Tienshiao Ma <tienshiao@tienshiao.org>
Timothy Cronin <40186632+4imothy@users.noreply.github.com>
Tobrun <tobrun.van.nuland@gmail.com>
Todd <taf2@users.noreply.github.com>
Tong Li <31761981+litongjava@users.noreply.github.com>
Topping1 <78745143+Topping1@users.noreply.github.com>
Travis Cline <travis.cline@gmail.com>
UEXTM.com <84163508+uextm@users.noreply.github.com>
Vadim Peretokin <vperetokin@hey.com>
Valentin Gosu <1454649+valenting@users.noreply.github.com>
Vulcan <93451215+trholding@users.noreply.github.com>
WhiteOlivierus <36532695+WhiteOlivierus@users.noreply.github.com>
Xiang (Kevin) Li <kevinli020508@gmail.com>
Xiao-Yong Jin <jinxiaoyong@gmail.com>
XiaotaoChen <chenxiaotao1234@gmail.com>
Yajing Tang <phillis@google.com>
Yang Shen <aplshenyang@gmail.com>
Yunès <jean.baptiste.yunes@free.fr>
ZaBlazzingZephyrus <119159668+blazingzephyr@users.noreply.github.com>
Zigfrid Zvezdin <ziggerZZ@gmail.com>
Zollner <24618122+Zolliner@users.noreply.github.com>
ai-at-home <149282006+ai-at-home@users.noreply.github.com>
alonfaraj <alonfaraj@gmail.com>
andypayne <apayne@gmail.com>
ardfork <134447697+ardfork@users.noreply.github.com>
automaticcat <daogiatuank54@gmail.com>
be-next <jerome.ramette@gmail.com>
bert hubert <bert@hubertnet.nl>
bmwl <brian.marshall@tolko.com>
bobqianic <129547291+bobqianic@users.noreply.github.com>
bocytko <bocytko+github@gmail.com>
boolemancer <48014766+boolemancer@users.noreply.github.com>
boolemancer <boolemancer@gmail.com>
bradmit <151883577+bradmit@users.noreply.github.com>
brunofaustino <b.fa.amorim@gmail.com>
bssrdf <merlintiger@hotmail.com>
byte-6174 <88070277+byte-6174@users.noreply.github.com>
cdosoftei <ciprian.dosoftei@gmail.com>
clach04 <Chris.Clark@actian.com>
compilade <113953597+compilade@users.noreply.github.com>
conradg <conradjgodfrey@gmail.com>
ddpasa <112642920+ddpasa@users.noreply.github.com>
denersc <denerstassun@gmail.com>
dscripka <dscripka@users.noreply.github.com>
duthils <duthils@duthils.net>
ecneladis <ecneladis@users.noreply.github.com>
faker <nspyia2002@gmail.com>
fitzsim <fitzsim@fitzsim.org>
fraxy-v <65565042+fraxy-v@users.noreply.github.com>
genevera (she/her) <genevera@users.noreply.github.com>
geniusnut <geniusnut@gmail.com>
greeshmay <greeshmay@gmail.com>
hydai <z54981220@gmail.com>
iamthad <thadeus.j.fleming@gmail.com>
james wolf <contractorwolf@hotmail.com>
joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
jorismertz <35079666+jorismertz@users.noreply.github.com>
junkfood <69683722+JunkFood02@users.noreply.github.com>
jwijffels <jwijffels@bnosac.be>
kamranjon <kamranjon@gmail.com>
katsu560 <katsu560oo-@docomo.ne.jp>
kennethge <57784063+kenneth-ge@users.noreply.github.com>
keyehzy <msamuel@aluno.puc-rio.br>
leejet <leejet714@gmail.com>
litong <31761981+litongjava@users.noreply.github.com>
lnyan <lkwq007@gmail.com>
m.bell <m.bell@techsmith.com>
mkiol <mkiol@users.noreply.github.com>
novag <7754358+novag@users.noreply.github.com>
pajowu <pajowu@pajowu.de>
polarmoon <90010972+polarmoon@users.noreply.github.com>
rlapray <lapray.romain@gmail.com>
sandrohanea <40202887+sandrohanea@users.noreply.github.com>
semiformal-net <84111142+semiformal-net@users.noreply.github.com>
shibukazu <61775791+shibukazu@users.noreply.github.com>
shikokuchuo <53399081+shikokuchuo@users.noreply.github.com>
slaren <slarengh@gmail.com>
slashlib <slashlib@users.noreply.github.com>
snadampal <87143774+snadampal@users.noreply.github.com>
st-gr <38470677+st-gr@users.noreply.github.com>
texmex76 <40733439+texmex76@users.noreply.github.com>
thefinaldegree <thefinaldegree@gmail.com>
trixirt <trix@redhat.com>
ulatekh <ulatekh@yahoo.com>
undef <undefdev@gmail.com>
venkr <venkateshrameshkumar+1@gmail.com>
vicalloy <zbirder@gmail.com>
xdrudis <xavierdrudis@yahoo.es>
zhouwg <6889919+zhouwg@users.noreply.github.com>
布客飞龙 <562826179@qq.com>
Артём Земляк <azemlyak@smart-consulting.ru>


@@ -1,21 +1,31 @@
-cmake_minimum_required (VERSION 3.5)
-project(whisper.cpp VERSION 1.5.4)
+cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories.
+project("whisper.cpp" C CXX)
+project("whisper.cpp" VERSION 1.6.2)
+include(CheckIncludeFileCXX)
+set(SOVERSION 1)
+#set(CMAKE_WARN_DEPRECATED YES)
+set(CMAKE_WARN_UNUSED_CLI YES)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
+endif()
# Add path to modules
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
-if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
    set(WHISPER_STANDALONE ON)
-    include(GitVars)
-    include(BuildTypes)
+    include(git-vars)
    # configure project version
-    if (EXISTS "${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl")
-        configure_file(${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl ${CMAKE_SOURCE_DIR}/bindings/ios/Makefile @ONLY)
-    endif()
    configure_file(${CMAKE_SOURCE_DIR}/bindings/javascript/package-tmpl.json ${CMAKE_SOURCE_DIR}/bindings/javascript/package.json @ONLY)
else()
    set(WHISPER_STANDALONE OFF)
@@ -25,6 +35,11 @@ if (EMSCRIPTEN)
    set(BUILD_SHARED_LIBS_DEFAULT OFF)
    option(WHISPER_WASM_SINGLE_FILE "whisper: embed WASM inside the generated whisper.js" ON)
+    # TODO: without these, we get the following error:
+    #       wasm-ld: error: --shared-memory is disallowed by whisper.cpp.o because it was not compiled with 'atomics' or 'bulk-memory' features.
+    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -pthread -s TOTAL_STACK=5242880")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -s TOTAL_STACK=5242880")
else()
    if (MINGW)
        set(BUILD_SHARED_LIBS_DEFAULT OFF)
@@ -33,570 +48,145 @@ else()
    endif()
endif()
-# options
-if (APPLE)
-    set(WHISPER_METAL_DEFAULT ON)
-else()
-    set(WHISPER_METAL_DEFAULT OFF)
-endif()
-option(BUILD_SHARED_LIBS "whisper: build shared libs" ${BUILD_SHARED_LIBS_DEFAULT})
+option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
+#
+# option list
+#
+# general
+option(WHISPER_CCACHE "whisper: use ccache if available" ON)
+# debug
option(WHISPER_ALL_WARNINGS "whisper: enable all compiler warnings" ON)
option(WHISPER_ALL_WARNINGS_3RD_PARTY "whisper: enable all compiler warnings in 3rd party libs" OFF)
-option(WHISPER_SANITIZE_THREAD "whisper: enable thread sanitizer" OFF)
-option(WHISPER_SANITIZE_ADDRESS "whisper: enable address sanitizer" OFF)
-option(WHISPER_SANITIZE_UNDEFINED "whisper: enable undefined sanitizer" OFF)
-option(WHISPER_BUILD_TESTS "whisper: build tests" ${WHISPER_STANDALONE})
-option(WHISPER_BUILD_EXAMPLES "whisper: build examples" ${WHISPER_STANDALONE})
-option(WHISPER_SDL2 "whisper: support for libSDL2" OFF)
-option(WHISPER_NO_AVX "whisper: disable AVX" OFF)
-option(WHISPER_NO_AVX2 "whisper: disable AVX2" OFF)
-option(WHISPER_NO_FMA "whisper: disable FMA" OFF)
-option(WHISPER_NO_F16C "whisper: disable F16c" OFF)
-option(WHISPER_OPENVINO "whisper: support for OpenVINO" OFF)
-if (APPLE)
-    option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF)
-    option(WHISPER_METAL "whisper: use Metal" ${WHISPER_METAL_DEFAULT})
-    option(WHISPER_METAL_NDEBUG "whisper: disable Metal debugging" OFF)
-    option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
-    option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
-    option(WHISPER_METAL_EMBED_LIBRARY "whisper: embed Metal library" OFF)
-else()
-    option(WHISPER_BLAS "whisper: use BLAS libraries" OFF)
-    option(WHISPER_BLAS_VENDOR "whisper: BLAS library vendor" Generic)
-    option(WHISPER_OPENBLAS "whisper: prefer OpenBLAS" OFF)
-    option(WHISPER_CUBLAS "whisper: support for cuBLAS" OFF)
-    option(WHISPER_HIPBLAS "whisper: support for hipBLAS" OFF)
-    option(WHISPER_CLBLAST "whisper: use CLBlast" OFF)
-endif()
-option(WHISPER_PERF "whisper: enable perf timings" OFF)
+# build
+option(WHISPER_FATAL_WARNINGS "whisper: enable -Werror flag" OFF)
# sanitizers
+option(WHISPER_SANITIZE_THREAD "whisper: enable thread sanitizer" OFF)
+option(WHISPER_SANITIZE_ADDRESS "whisper: enable address sanitizer" OFF)
+option(WHISPER_SANITIZE_UNDEFINED "whisper: enable undefined sanitizer" OFF)
-if (NOT MSVC)
-    if (WHISPER_SANITIZE_THREAD)
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=thread")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread")
-    endif()
-    if (WHISPER_SANITIZE_ADDRESS)
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
-    endif()
-    if (WHISPER_SANITIZE_UNDEFINED)
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
-    endif()
-endif()
+# extra artifacts
+option(WHISPER_BUILD_TESTS "whisper: build tests" ${WHISPER_STANDALONE})
+option(WHISPER_BUILD_EXAMPLES "whisper: build examples" ${WHISPER_STANDALONE})
+option(WHISPER_BUILD_SERVER "whisper: build server example" ${WHISPER_STANDALONE})
+# 3rd party libs
+option(WHISPER_CURL "whisper: use libcurl to download model from an URL" OFF)
+option(WHISPER_SDL2 "whisper: support for libSDL2" OFF)
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
# dependencies
find_package(Threads REQUIRED)
# on APPLE
if (APPLE)
# include Accelerate framework
if (NOT WHISPER_NO_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate)
if (ACCELERATE_FRAMEWORK)
message(STATUS "Accelerate framework found")
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
else()
message(FATAL_ERROR "Accelerate framework not found")
endif()
endif()
if (WHISPER_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
if (METAL_FRAMEWORK)
message(STATUS "Metal framework found")
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK}
${METALKIT_FRAMEWORK}
)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_METAL)
if (WHISPER_METAL_NDEBUG)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_METAL_NDEBUG)
endif()
else()
message(FATAL_ERROR "Metal framework not found")
endif()
set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
# copy ggml-metal.metal to bin directory
configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
if (WHISPER_METAL_EMBED_LIBRARY)
enable_language(ASM)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_METAL_EMBED_LIBRARY)
set(METALLIB_SOURCE "${CMAKE_SOURCE_DIR}/ggml-metal.metal")
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s")
add_custom_command(
OUTPUT ${EMBED_METALLIB_ASSEMBLY}
COMMAND echo ".section __DATA,__ggml_metallib" > ${EMBED_METALLIB_ASSEMBLY}
COMMAND echo ".globl _ggml_metallib_start" >> ${EMBED_METALLIB_ASSEMBLY}
COMMAND echo "_ggml_metallib_start:" >> ${EMBED_METALLIB_ASSEMBLY}
COMMAND echo ".incbin \\\"${METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY}
COMMAND echo ".globl _ggml_metallib_end" >> ${EMBED_METALLIB_ASSEMBLY}
COMMAND echo "_ggml_metallib_end:" >> ${EMBED_METALLIB_ASSEMBLY}
DEPENDS ${METALLIB_SOURCE}
COMMENT "Generate assembly for embedded Metal library"
)
set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY})
endif()
endif()
if (WHISPER_COREML)
find_library(FOUNDATION_FRAMEWORK Foundation)
find_library(COREML_FRAMEWORK CoreML)
if (COREML_FRAMEWORK)
message(STATUS "CoreML framework found")
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_COREML)
else()
message(FATAL_ERROR "CoreML framework not found")
endif()
if (WHISPER_COREML_ALLOW_FALLBACK)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_COREML_ALLOW_FALLBACK)
endif()
endif()
endif()
if (WHISPER_OPENBLAS)
set(WHISPER_BLAS_VENDOR "OpenBLAS")
set(WHISPER_BLAS ON)
endif()
if (WHISPER_BLAS)
if (WIN32)
if(DEFINED ENV{OPENBLAS_PATH})
set(BLAS_LIBRARIES $ENV{OPENBLAS_PATH}/lib/libopenblas.dll.a)
message(STATUS "Libraries ${BLAS_LIBRARIES}")
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
include_directories($ENV{OPENBLAS_PATH}/include)
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
else ()
message(FATAL_ERROR "BLAS library was not found. Environment variable OPENBLAS_PATH not defined.")
endif ()
else ()
set(BLA_STATIC 1)
set(BLA_VENDOR ${WHISPER_BLAS_VENDOR})
set(BLA_SIZEOF_INTEGER 8)
set(BLA_PREFER_PKGCONFIG 1)
find_package(BLAS)
if(BLAS_FOUND)
message(STATUS "BLAS compatible library found")
message(STATUS "Libraries ${BLAS_LIBRARIES}")
find_path(BLAS_INCLUDE_DIRS cblas.h /usr/include/openblas /usr/local/include/openblas $ENV{BLAS_HOME}/include)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
include_directories(${BLAS_INCLUDE_DIRS})
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
else()
message(FATAL_ERROR "BLAS library was not found")
endif()
endif ()
endif ()
if (WHISPER_CUBLAS)
cmake_minimum_required(VERSION 3.17)
find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "cuBLAS found")
enable_language(CUDA)
set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
add_compile_definitions(GGML_USE_CUBLAS)
if (WHISPER_STATIC)
if (WIN32)
# As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
else ()
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
endif()
else()
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cuda_driver)
else()
message(FATAL_ERROR "cuBLAS not found")
endif()
endif()
if (WHISPER_HIPBLAS)
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
endif()
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
endif()
find_package(hip)
find_package(hipblas)
find_package(rocblas)
if (${hipblas_FOUND} AND ${hip_FOUND})
message(STATUS "HIP and hipBLAS found")
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
set_property(TARGET ggml-rocm PROPERTY POSITION_INDEPENDENT_CODE ON)
set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
if (WHISPER_STATIC)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
endif()
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ggml-rocm)
else()
message(FATAL_ERROR "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
endif()
endif()
if (WHISPER_CLBLAST)
find_package(CLBlast)
if (CLBlast_FOUND)
message(STATUS "CLBlast found")
set(GGML_SOURCES_OPENCL ggml-opencl.cpp ggml-opencl.h)
add_compile_definitions(GGML_USE_CLBLAST)
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} clblast)
else()
message(FATAL_ERROR "CLBlast not found")
endif()
endif()
if( WHISPER_OPENVINO )
find_package(OpenVINO REQUIRED COMPONENTS Runtime)
endif()
# compiler flags
if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "RelWithDebInfo")
endif ()
if (WHISPER_ALL_WARNINGS)
if (NOT MSVC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} \
-Wall \
-Wextra \
-Wpedantic \
-Wshadow \
-Wcast-qual \
-Wstrict-prototypes \
-Wpointer-arith \
-Wno-unused-function \
")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
-Wall \
-Wextra \
-Wpedantic \
-Wcast-qual \
")
else()
# todo : msvc
endif()
endif()
if (NOT MSVC)
# TODO: temporary disabled until we figure out ggml-metal.m
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-math-errno -ffinite-math-only -funsafe-math-optimizations")
endif()
message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
message(STATUS "ARM detected")
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
message(STATUS "PowerPC detected")
else()
message(STATUS "x86 detected")
if (MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /utf-8")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /utf-8")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /utf-8")
if(NOT WHISPER_NO_AVX2)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
else()
if(NOT WHISPER_NO_AVX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX")
endif()
endif()
else()
if (EMSCRIPTEN)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread -s TOTAL_STACK=5242880")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -s TOTAL_STACK=5242880")
else()
if(NOT WHISPER_NO_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
endif()
if(NOT WHISPER_NO_AVX2)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
endif()
if(NOT WHISPER_NO_FMA)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
endif()
if(NOT WHISPER_NO_F16C)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
endif()
endif()
endif()
endif()
#
# POSIX conformance
#
# clock_gettime came in POSIX.1b (1993)
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
# posix_memalign came in POSIX.1-2001 / SUSv3
# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
add_compile_definitions(_XOPEN_SOURCE=600)
# Somehow in OpenBSD whenever POSIX conformance is specified
# some string functions rely on locale_t availability,
# which was introduced in POSIX.1-2008, forcing us to go higher
if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
remove_definitions(-D_XOPEN_SOURCE=600)
add_compile_definitions(_XOPEN_SOURCE=700)
endif()
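As an aside, a small self-contained C sketch of why `_XOPEN_SOURCE=600` is defined here; it only illustrates that the POSIX.1-2001/XSI interfaces named in the comments become visible under that feature-test macro (it is not part of the build):

```c
#define _XOPEN_SOURCE 600   // must come before any system header

#include <time.h>    // clock_gettime, CLOCK_MONOTONIC
#include <stdlib.h>  // posix_memalign, free
#include <math.h>    // M_PI (XSI extension)

int main(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);   // POSIX.1b / POSIX.1-2001

    void * p = NULL;
    posix_memalign(&p, 64, 1024);          // POSIX.1-2001
    free(p);

    double x = M_PI;                       // XSI extension
    (void) x;
    (void) ts;
    return 0;
}
```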
# Data types, macros and functions related to controlling CPU affinity
# are available on Linux through GNU extensions in libc
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
    add_compile_definitions(_GNU_SOURCE)
endif()

# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
# and on macOS its availability depends on enabling Darwin extensions
# similarly on DragonFly, enabling BSD extensions is necessary
if (CMAKE_SYSTEM_NAME MATCHES "Darwin")
    add_compile_definitions(_DARWIN_C_SOURCE)
endif()
if (CMAKE_SYSTEM_NAME MATCHES "DragonFly")
    add_compile_definitions(_DARWIN_C_SOURCE)
endif()

# alloca is a non-standard interface that is not visible on BSDs when
# POSIX conformance is specified, but not all of them provide a clean way
# to enable it in such cases
if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
    add_compile_definitions(__BSD_VISIBLE)
endif()
if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
    add_compile_definitions(_NETBSD_SOURCE)
endif()
if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
    add_compile_definitions(_BSD_SOURCE)
endif()

if (WHISPER_PERF)
    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
endif()

#
# whisper.coreml - Core ML support
#

if (WHISPER_COREML)
    set(TARGET whisper.coreml)

    add_library(${TARGET}
        coreml/whisper-encoder.h
        coreml/whisper-encoder.mm
        coreml/whisper-encoder-impl.h
        coreml/whisper-encoder-impl.m
        )

    include(DefaultTargetOptions)

    target_include_directories(${TARGET} PUBLIC
        .
        )

    target_link_libraries(${TARGET} PRIVATE ${FOUNDATION_FRAMEWORK} ${COREML_FRAMEWORK})

    set_target_properties(${TARGET} PROPERTIES
        COMPILE_FLAGS "-fobjc-arc"
        )
endif()

if (WHISPER_OPENVINO)
    set(TARGET whisper.openvino)

    add_library(${TARGET} OBJECT
        openvino/whisper-openvino-encoder.h
        openvino/whisper-openvino-encoder.cpp
        )

    target_include_directories(${TARGET} PUBLIC
        .
        )

    set_property(TARGET ${TARGET} PROPERTY POSITION_INDEPENDENT_CODE ON)
    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_OPENVINO)

    target_link_libraries(${TARGET} PRIVATE openvino::runtime)
endif()

#
# whisper - this is the main library of the project
#

set(TARGET whisper)

add_library(${TARGET}
    ggml.h
    ggml.c
    ggml-alloc.h
    ggml-alloc.c
    ggml-backend.h
    ggml-backend.c
    ggml-quants.h
    ggml-quants.c
    ${GGML_SOURCES_METAL}
    ${GGML_SOURCES_CUDA}
    ${GGML_SOURCES_OPENCL}
    whisper.h
    whisper.cpp
    )

include(DefaultTargetOptions)

target_include_directories(${TARGET} PUBLIC
    .
    )

if (WHISPER_COREML)
    target_link_libraries(${TARGET} PRIVATE whisper.coreml)
endif()

if (WHISPER_OPENVINO)
    target_link_libraries(${TARGET} PRIVATE whisper.openvino)
endif()

if (MSVC)
    target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -D_CRT_SECURE_NO_WARNINGS)
else()
    target_link_libraries(${TARGET} PRIVATE m ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
endif()

if (BUILD_SHARED_LIBS)
    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
    target_link_libraries(${TARGET} PUBLIC
        ${CMAKE_DL_LIBS}
        )
    target_compile_definitions(${TARGET} PUBLIC
        WHISPER_SHARED
        GGML_SHARED
        )
    target_compile_definitions(${TARGET} PRIVATE
        WHISPER_BUILD
        GGML_BUILD
        )
    if (WHISPER_METAL)
        # TODO: I think this should make ggml-metal.m "see" the ggml-metal.metal file from the "bin" directory
        # but for some reason it does not work here like it does in llama.cpp
        set_target_properties(${TARGET} PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
    endif()
endif()

if (GGML_SOURCES_CUDA)
    message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
    # Only configure ggml CUDA architectures if not globally set
    if (NOT DEFINED GGML_CUDA_ARCHITECTURES)
        # Not overridden by user, so set defaults
        set(GGML_CUDA_ARCHITECTURES 52 61 70)
    endif()
    message(STATUS "GGML Configuring CUDA architectures ${GGML_CUDA_ARCHITECTURES}")
    set_property(TARGET whisper PROPERTY CUDA_ARCHITECTURES ${GGML_CUDA_ARCHITECTURES})
    set_property(TARGET whisper PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
endif()

if (EMSCRIPTEN)
    set_target_properties(${TARGET} PROPERTIES COMPILE_FLAGS "-msimd128")
endif()

target_compile_definitions(${TARGET} PUBLIC
    ${WHISPER_EXTRA_FLAGS}
    )

set_target_properties(${TARGET} PROPERTIES PUBLIC_HEADER "ggml.h;whisper.h")

include(GNUInstallDirs)

install(TARGETS ${TARGET}
    LIBRARY DESTINATION lib
    ARCHIVE DESTINATION lib/static
    RUNTIME DESTINATION bin
    RESOURCE DESTINATION bin
    PUBLIC_HEADER DESTINATION include
    )

#
# bindings
#

add_subdirectory(bindings)

if (CMAKE_SYSTEM_NAME MATCHES "Linux")
    option(WHISPER_FFMPEG "whisper: support building and linking with ffmpeg libs (avcodec, swresample, ...)" OFF)
endif()

option(WHISPER_COREML                "whisper: enable Core ML framework"  OFF)
option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
option(WHISPER_OPENVINO              "whisper: support for OpenVINO"      OFF)

# Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)

# override ggml options
set(GGML_CCACHE             ${WHISPER_CCACHE})
set(GGML_SANITIZE_THREAD    ${WHISPER_SANITIZE_THREAD})
set(GGML_SANITIZE_ADDRESS   ${WHISPER_SANITIZE_ADDRESS})
set(GGML_SANITIZE_UNDEFINED ${WHISPER_SANITIZE_UNDEFINED})
set(GGML_ALL_WARNINGS       ${WHISPER_ALL_WARNINGS})
set(GGML_FATAL_WARNINGS     ${WHISPER_FATAL_WARNINGS})

# transition helpers
function (whisper_option_depr TYPE OLD NEW)
    if (${OLD})
        message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
        set(${NEW} ON)
    endif()
endfunction()

whisper_option_depr(FATAL_ERROR WHISPER_CUBLAS              GGML_CUDA)
whisper_option_depr(WARNING     WHISPER_CUDA                GGML_CUDA)
whisper_option_depr(WARNING     WHISPER_KOMPUTE             GGML_KOMPUTE)
whisper_option_depr(WARNING     WHISPER_METAL               GGML_METAL)
whisper_option_depr(WARNING     WHISPER_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
whisper_option_depr(WARNING     WHISPER_NATIVE              GGML_NATIVE)
whisper_option_depr(WARNING     WHISPER_OPENMP              GGML_OPENMP)
whisper_option_depr(WARNING     WHISPER_RPC                 GGML_RPC)
whisper_option_depr(WARNING     WHISPER_SYCL                GGML_SYCL)
whisper_option_depr(WARNING     WHISPER_SYCL_F16            GGML_SYCL_F16)

#
# build the library
#

add_subdirectory(ggml)
add_subdirectory(src)

#
# install
#

include(GNUInstallDirs)
include(CMakePackageConfigHelpers)

set(WHISPER_BUILD_NUMBER    ${BUILD_NUMBER})
set(WHISPER_BUILD_COMMIT    ${BUILD_COMMIT})
set(WHISPER_INSTALL_VERSION ${CMAKE_PROJECT_VERSION})

set(WHISPER_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
set(WHISPER_LIB_INSTALL_DIR     ${CMAKE_INSTALL_LIBDIR}     CACHE PATH "Location of library files")
set(WHISPER_BIN_INSTALL_DIR     ${CMAKE_INSTALL_BINDIR}     CACHE PATH "Location of binary files")

get_directory_property(WHISPER_TRANSIENT_DEFINES COMPILE_DEFINITIONS)
set_target_properties(whisper PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/whisper.h)
install(TARGETS whisper LIBRARY PUBLIC_HEADER)
configure_package_config_file(
${CMAKE_CURRENT_SOURCE_DIR}/cmake/whisper-config.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/whisper
PATH_VARS
WHISPER_INCLUDE_INSTALL_DIR
WHISPER_LIB_INSTALL_DIR
WHISPER_BIN_INSTALL_DIR )
write_basic_package_version_file(
${CMAKE_CURRENT_BINARY_DIR}/whisper-version.cmake
VERSION ${WHISPER_INSTALL_VERSION}
COMPATIBILITY SameMajorVersion)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake
${CMAKE_CURRENT_BINARY_DIR}/whisper-version.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/whisper)
install(
FILES convert-hf-to-gguf.py
PERMISSIONS
OWNER_READ
OWNER_WRITE
OWNER_EXECUTE
GROUP_READ
GROUP_EXECUTE
WORLD_READ
WORLD_EXECUTE
DESTINATION ${CMAKE_INSTALL_BINDIR})
configure_file(cmake/whisper.pc.in
"${CMAKE_CURRENT_BINARY_DIR}/whisper.pc"
@ONLY)
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/whisper.pc"
DESTINATION lib/pkgconfig)
#
# programs, examples and tests
#

if (WHISPER_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
    enable_testing()
    add_subdirectory(tests)
endif ()

if (WHISPER_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
    #include(CTest)
    #add_subdirectory(tests)
endif ()

if (WHISPER_BUILD_EXAMPLES)
@ -1,6 +1,6 @@
MIT License

Copyright (c) 2023 Georgi Gerganov
Copyright (c) 2023-2024 The ggml authors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Makefile

File diff suppressed because it is too large

Package.swift
@ -13,13 +13,9 @@ let package = Package(
products: [
    .library(name: "whisper", targets: ["whisper"]),
],
dependencies: [
    .package(url: "https://github.com/ggerganov/ggml.git", .branch("release"))
],
targets: [
    .target(
        name: "whisper",
        dependencies: ["ggml"],
        path: ".",
        exclude: [
            "bindings",
@ -31,13 +27,17 @@ let package = Package(
            "samples",
            "tests",
            "CMakeLists.txt",
            "ggml-cuda.cu",
            "ggml-cuda.h",
            "Makefile"
        ],
        sources: [
            "whisper.cpp",
            "ggml/src/ggml.c",
            "src/whisper.cpp",
            "ggml/src/ggml-alloc.c",
            "ggml/src/ggml-backend.c",
            "ggml/src/ggml-quants.c",
            "ggml/src/ggml-metal.m"
        ],
        resources: [.process("ggml-metal.metal")],
        publicHeadersPath: "spm-headers",
        cSettings: [
            .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
README.md
@ -4,9 +4,10 @@
[![Actions Status](https://github.com/ggerganov/whisper.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/whisper.cpp/actions)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Conan Center](https://shields.io/conan/v/whisper-cpp)](https://conan.io/center/whisper-cpp)
[![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)

Stable: [v1.5.4](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.5.4) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
Stable: [v1.6.2](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.6.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)

High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

@ -19,7 +20,6 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
- Zero memory allocations at runtime
- Support for CPU-only inference
- [Efficient GPU support for NVIDIA](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
- [Partial OpenCL GPU support via CLBlast](https://github.com/ggerganov/whisper.cpp#opencl-gpu-support-via-clblast)
- [OpenVINO Support](https://github.com/ggerganov/whisper.cpp#openvino-support)
- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
@ -278,6 +278,7 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
- To ensure `coremltools` operates correctly, please confirm that [Xcode](https://developer.apple.com/xcode/) is installed and execute `xcode-select --install` to install the command-line tools.
- Python 3.10 is recommended.
- MacOS Sonoma (version 14) or newer is recommended, as older versions of MacOS might experience issues with transcription hallucination.
- [OPTIONAL] It is recommended to utilize a Python version management system, such as [Miniconda](https://docs.conda.io/en/latest/miniconda.html) for this step:
  - To create an environment, use: `conda create -n py310-whisper python=3.10 -y`
  - To activate the environment, use: `conda activate py310-whisper`

@ -339,7 +340,7 @@ This can result in significant speedup in encoder performance. Here are the inst

python -m venv openvino_conv_env
openvino_conv_env\Scripts\activate
python -m pip install --upgrade pip
pip install -r openvino-conversion-requirements.txt
pip install -r requirements-openvino.txt
```

Linux and macOS:
@ -349,7 +350,7 @@ This can result in significant speedup in encoder performance. Here are the inst
python3 -m venv openvino_conv_env
source openvino_conv_env/bin/activate
python -m pip install --upgrade pip
pip install -r openvino-conversion-requirements.txt
pip install -r requirements-openvino.txt
```

- Generate an OpenVINO encoder model. For example, to generate a `base.en` model, use:

@ -413,35 +414,13 @@ For more information about the Core ML implementation please refer to PR [#1037]

With NVIDIA cards the processing of the models is done efficiently on the GPU via cuBLAS and custom CUDA kernels.

First, make sure you have installed `cuda`: https://developer.nvidia.com/cuda-downloads

Now build `whisper.cpp` with cuBLAS support:
Now build `whisper.cpp` with CUDA support:

```
make clean
WHISPER_CUBLAS=1 make -j
GGML_CUDA=1 make -j
```
## OpenCL GPU support via CLBlast
For cards and integrated GPUs that support OpenCL, the Encoder processing can be largely offloaded to the GPU through CLBlast. This is especially useful for users with AMD APUs or low end devices for up to ~2x speedup.
First, make sure you have installed `CLBlast` for your OS or Distribution: https://github.com/CNugteren/CLBlast
Now build `whisper.cpp` with CLBlast support:
```
Makefile:
cd whisper.cpp
make clean
WHISPER_CLBLAST=1 make -j
CMake:
cd whisper.cpp
cmake -B build -DWHISPER_CLBLAST=ON
cmake --build build -j --config Release
```
Run all the examples as usual.
## BLAS CPU support via OpenBLAS

Encoder processing can be accelerated on the CPU via OpenBLAS.

@ -451,7 +430,22 @@ Now build `whisper.cpp` with OpenBLAS support:

```
make clean
WHISPER_OPENBLAS=1 make -j
GGML_OPENBLAS=1 make -j
```
## BLAS CPU support via Intel MKL
Encoder processing can be accelerated on the CPU via the BLAS compatible interface of Intel's Math Kernel Library.
First, make sure you have installed Intel's MKL runtime and development packages: https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl-download.html
Now build `whisper.cpp` with Intel MKL BLAS support:
```
source /opt/intel/oneapi/setvars.sh
mkdir build
cd build
cmake -DWHISPER_MKL=ON ..
WHISPER_MKL=1 make -j
```

## Docker

@ -486,6 +480,16 @@ docker run -it --rm \
whisper.cpp:main "./main -m /models/ggml-base.bin -f ./samples/jfk.wav"
```
## Installing with Conan
You can install pre-built binaries for whisper.cpp or build it from source using [Conan](https://conan.io/). Use the following command:
```
conan install --requires="whisper-cpp/[*]" --build=missing
```
For detailed instructions on how to use Conan, please refer to the [Conan documentation](https://docs.conan.io/2/).
## Limitations

- Inference only

@ -694,7 +698,7 @@ The [main](examples/main) example provides support for output of karaoke-style m
currently pronounced word is highlighted. Use the `-wts` argument and run the generated bash script.
This requires to have `ffmpeg` installed.

Here are a few *"typical"* examples:
Here are a few _"typical"_ examples:

```bash
./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -owts

@ -728,10 +732,10 @@ https://user-images.githubusercontent.com/1991296/199337538-b7b0c7a3-2753-4a88-a

## Video comparison of different models

Use the [extra/bench-wts.sh](https://github.com/ggerganov/whisper.cpp/blob/master/extra/bench-wts.sh) script to generate a video in the following format:
Use the [scripts/bench-wts.sh](https://github.com/ggerganov/whisper.cpp/blob/master/scripts/bench-wts.sh) script to generate a video in the following format:

```bash
./extra/bench-wts.sh samples/jfk.wav
./scripts/bench-wts.sh samples/jfk.wav
ffplay ./samples/jfk.wav.all.mp4
```
@ -752,7 +756,7 @@ Additionally a script to run whisper.cpp with different models and audio files i
You can run it with the following command, by default it will run against any standard model in the models folder.

```bash
python3 extra/bench.py -f samples/jfk.wav -t 2,4,8 -p 1,2
python3 scripts/bench.py -f samples/jfk.wav -t 2,4,8 -p 1,2
```

It is written in python with the intention of being easy to modify and extend for your benchmarking use case.

@ -792,6 +796,7 @@ For more details, see the conversion script [models/convert-pt-to-ggml.py](model
  - [NickDarvey/whisper](https://github.com/NickDarvey/whisper)
- [x] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
  - [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
  - [AIWintermuteAI/whispercpp](https://github.com/AIWintermuteAI/whispercpp) (Updated fork of aarnphm/whispercpp)
  - [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
- [x] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
- [x] Unity: [macoron/whisper.unity](https://github.com/Macoron/whisper.unity)
README_sycl.md
@ -0,0 +1,249 @@
# whisper.cpp for SYCL
[Background](#background)
[OS](#os)
[Intel GPU](#intel-gpu)
[Linux](#linux)
[Environment Variable](#environment-variable)
[Known Issue](#known-issue)
[Todo](#todo)
## Background
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators, such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.

oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.

Intel uses SYCL as the direct programming language to support CPUs, GPUs and FPGAs.

To avoid re-inventing the wheel, this code refers to other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use the open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) to migrate the code to SYCL.

whisper.cpp for SYCL is used to support Intel GPUs.

For Intel CPUs, we recommend using whisper.cpp for x86 (Intel MKL build).
## OS
|OS|Status|Verified|
|-|-|-|
|Linux|Support|Ubuntu 22.04|
|Windows|Ongoing| |
## Intel GPU
|Intel GPU| Status | Verified Model|
|-|-|-|
|Intel Data Center Max Series| Support| Max 1550|
|Intel Data Center Flex Series| Support| Flex 170|
|Intel Arc Series| Support| Arc 770|
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
## Linux
### Setup Environment
1. Install Intel GPU driver.
a. Please install the Intel GPU driver following the official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).

Note: for iGPU, please install the client GPU driver.

b. Add your user to the video and render groups.
```
sudo usermod -aG render username
sudo usermod -aG video username
```
Note: log out and log back in for the group changes to take effect.
c. Check
```
sudo apt install clinfo
sudo clinfo -l
```
Output (example):
```
Platform #0: Intel(R) OpenCL Graphics
`-- Device #0: Intel(R) Arc(TM) A770 Graphics
Platform #0: Intel(R) OpenCL HD Graphics
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
```
2. Install the Intel® oneAPI Base Toolkit.

a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).

We recommend installing to the default folder: **/opt/intel/oneapi**.

The following guide uses the default folder as an example. If you use a different folder, adjust the paths below accordingly.
b. Check
```
source /opt/intel/oneapi/setvars.sh
sycl-ls
```
There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
Output (example):
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
```
3. Build locally:
```
mkdir -p build
cd build
source /opt/intel/oneapi/setvars.sh
#for FP16
#cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DWHISPER_SYCL_F16=ON
#for FP32
cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build example/main only
#cmake --build . --config Release --target main
#build all binary
cmake --build . --config Release -v
```
or
```
./examples/sycl/build.sh
```
Note:

- By default, all binaries are built, which takes more time. To reduce build time, we recommend building only **example/main**.
### Run
1. Put the model file in the **models** folder
2. Enable oneAPI running environment
```
source /opt/intel/oneapi/setvars.sh
```
3. List device ID
Run without parameter:
```
./build/bin/ls-sycl-device
or
./build/bin/main
```
Check the ID in startup log, like:
```
found 4 SYCL devices:
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```
|Attribute|Note|
|-|-|
|compute capability 1.3|Level-Zero runtime, recommended|
|compute capability 3.0|OpenCL runtime, slower than Level-Zero in most cases|
4. Set device ID and execute whisper.cpp
Set device ID = 0 by **GGML_SYCL_DEVICE=0**
```
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/ggml-base.en.bin -f samples/jfk.wav
```
or run by script:
```
./examples/sycl/run_whisper.sh
```
5. Check the device ID in output
Like:
```
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
```
## Environment Variable
#### Build
|Name|Value|Function|
|-|-|-|
|WHISPER_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, WHISPER_SYCL=ON is mandatory.|
|WHISPER_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. For FP32, do not set it.|
|CMAKE_C_COMPILER|icx|Use the icx compiler for the SYCL code path|
|CMAKE_CXX_COMPILER|icpx|Use the icpx compiler for the SYCL code path|
#### Running
|Name|Value|Function|
|-|-|-|
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device ID to use. Check the device IDs in the output of a default run|
|GGML_SYCL_DEBUG|0 (default) or 1|Enable debug logging via the GGML_SYCL_DEBUG macro|
## Known Issue
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.

  The oneAPI runtime environment is not enabled.

  Install the oneAPI Base Toolkit and enable it with: `source /opt/intel/oneapi/setvars.sh`.
- Hang during startup

  llama.cpp uses mmap as the default way to read the model file and copy it to the GPU. On some systems, the memcpy may behave abnormally and block.

  Solution: add **--no-mmap**.
## Todo
- Support building on Windows.
- Support multiple cards.
@ -68,10 +68,6 @@ func (flags *Flags) GetOut() string {
return strings.ToLower(flags.Lookup("out").Value.String())
}
func (flags *Flags) IsSpeedup() bool {
return flags.Lookup("speedup").Value.String() == "true"
}
func (flags *Flags) IsTokens() bool {
	return flags.Lookup("tokens").Value.String() == "true"
}
@ -111,10 +107,6 @@ func (flags *Flags) SetParams(context whisper.Context) error {
fmt.Fprintf(flags.Output(), "Setting duration to %v\n", duration) fmt.Fprintf(flags.Output(), "Setting duration to %v\n", duration)
context.SetDuration(duration) context.SetDuration(duration)
} }
if flags.IsSpeedup() {
fmt.Fprintf(flags.Output(), "Setting speedup to true\n")
context.SetSpeedup(true)
}
if threads := flags.GetThreads(); threads != 0 {
	fmt.Fprintf(flags.Output(), "Setting threads to %d\n", threads)
	context.SetThreads(threads)
@ -146,7 +138,6 @@ func registerFlags(flag *Flags) {
flag.Duration("offset", 0, "Time offset") flag.Duration("offset", 0, "Time offset")
flag.Duration("duration", 0, "Duration of audio to process") flag.Duration("duration", 0, "Duration of audio to process")
flag.Uint("threads", 0, "Number of threads to use") flag.Uint("threads", 0, "Number of threads to use")
flag.Bool("speedup", false, "Enable speedup")
flag.Uint("max-len", 0, "Maximum segment length in characters") flag.Uint("max-len", 0, "Maximum segment length in characters")
flag.Uint("max-tokens", 0, "Maximum tokens per segment") flag.Uint("max-tokens", 0, "Maximum tokens per segment")
flag.Float64("word-thold", 0, "Maximum segment score") flag.Float64("word-thold", 0, "Maximum segment score")
@ -47,10 +47,6 @@ func (p *Params) SetPrintTimestamps(v bool) {
p.print_timestamps = toBool(v)
}
func (p *Params) SetSpeedup(v bool) {
p.speed_up = toBool(v)
}
// Set language id
func (p *Params) SetLanguage(lang int) error {
	if lang == -1 {
@ -177,9 +173,6 @@ func (p *Params) String() string {
if p.token_timestamps {
	str += " token_timestamps"
}
if p.speed_up {
str += " speed_up"
}
return str + ">" return str + ">"
} }
@ -76,11 +76,6 @@ func (context *context) SetTranslate(v bool) {
context.params.SetTranslate(v)
}
// Set speedup flag
func (context *context) SetSpeedup(v bool) {
context.params.SetSpeedup(v)
}
func (context *context) SetSplitOnWord(v bool) {
	context.params.SetSplitOnWord(v)
}
@ -41,7 +41,6 @@ type Context interface {
SetOffset(time.Duration)   // Set offset
SetDuration(time.Duration) // Set duration
SetThreads(uint)           // Set number of threads to use
SetSpeedup(bool) // Set speedup flag
SetSplitOnWord(bool)          // Set split on word flag
SetTokenThreshold(float32)    // Set timestamp token probability threshold
SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
@ -10,7 +10,7 @@ import (
/*
#cgo LDFLAGS: -lwhisper -lm -lstdc++
#cgo darwin LDFLAGS: -framework Accelerate
#cgo darwin LDFLAGS: -framework Accelerate -framework Metal -framework Foundation -framework CoreGraphics
#include <whisper.h>
#include <stdlib.h>
Submodule bindings/ios deleted from b21b6ff325
@ -20,7 +20,7 @@ public interface WhisperCppJnaLibrary extends Library {
 * @return Whisper context on success, null on failure
 */
Pointer whisper_init_from_file(String path_model);

/**
 * Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
 * Because this function allocates memory for the params, the caller must call either:
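For comparison, a hedged C-side sketch of the ownership contract this comment describes, using the by-reference params helpers from `whisper.h`; the model path is a placeholder:

```c
#include <whisper.h>

int main(void) {
    // allocated by the library, so it must be freed by the caller
    struct whisper_context_params * cparams = whisper_context_default_params_by_ref();
    if (!cparams) return 1;

    struct whisper_context * ctx = whisper_init_from_file_with_params("models/ggml-base.en.bin", *cparams);

    whisper_free_context_params(cparams);   // release the params regardless of init outcome
    if (!ctx) return 1;

    // ... run whisper_full(), read segments, etc.

    whisper_free(ctx);
    return 0;
}
```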
@ -304,14 +304,6 @@ public interface WhisperCppJnaLibrary extends Library {
/** Language id associated with the provided state */
int whisper_full_lang_id_from_state(Pointer state);
/**
* Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
* The resulting spectrogram is stored inside the default state of the provided whisper context.
* @return 0 on success
*/
int whisper_pcm_to_mel_phase_vocoder(Pointer ctx, final float[] samples, int n_samples, int n_threads);
int whisper_pcm_to_mel_phase_vocoder_with_state(Pointer ctx, Pointer state, final float[] samples, int n_samples, int n_threads);
/** Get the start time of the specified segment. */
long whisper_full_get_segment_t0(Pointer ctx, int i_segment);
@ -129,14 +129,6 @@ public class WhisperFullParams extends Structure {
/** Maximum tokens per segment (0, default = no limit) */
public int max_tokens;
/** Flag to speed up the audio by 2x using Phase Vocoder. (default = false) */
public CBool speed_up;
/** Flag to speed up the audio by 2x using Phase Vocoder. (default = false) */
public void speedUp(boolean enable) {
speed_up = enable ? CBool.TRUE : CBool.FALSE;
}
/** Overwrite the audio context size (0 = use default). */
public int audio_ctx;
@ -148,6 +140,9 @@ public class WhisperFullParams extends Structure {
tdrz_enable = enable ? CBool.TRUE : CBool.FALSE;
}
/** Regular expression matching tokens to suppress. */
public String suppress_regex;
/** Tokens to provide to the whisper decoder as an initial prompt.
 * These are prepended to any existing text context from a previous call. */
public String initial_prompt;
@ -318,8 +313,8 @@ public class WhisperFullParams extends Structure {
return Arrays.asList("strategy", "n_threads", "n_max_text_ctx", "offset_ms", "duration_ms", "translate",
        "no_context", "single_segment", "no_timestamps",
        "print_special", "print_progress", "print_realtime", "print_timestamps", "token_timestamps",
        "thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "speed_up", "audio_ctx",
        "thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "audio_ctx",
        "tdrz_enable", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
        "tdrz_enable", "suppress_regex", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
        "suppress_blank", "suppress_non_speech_tokens", "temperature", "max_initial_ts", "length_penalty",
        "temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",
        "new_segment_callback", "new_segment_callback_user_data",
@ -1,6 +1,6 @@
{
  "name": "whisper.cpp",
  "version": "1.5.4",
  "version": "1.6.2",
  "description": "Whisper speech recognition",
  "main": "whisper.js",
  "scripts": {
bindings/ruby/Rakefile
@ -0,0 +1,12 @@
require 'rake/clean'
require 'rubygems/package'
desc 'Build gem'
task :package do
spec_source = File.read File.join(File.dirname(__FILE__),'whispercpp.gemspec')
spec = nil
# see: http://gist.github.com/16215
Thread.new { spec = eval("#{spec_source}") }.join
spec.validate
Gem::Package.build(spec)
end
@ -1,6 +1,7 @@
require 'mkmf'
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.cpp')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper-mel.hpp')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.h')} .") system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.c')} .") system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-impl.h')} .") system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-impl.h')} .")
@ -9,6 +10,7 @@ system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.c')} ."
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend-impl.h')} .") system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend-impl.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.h')} .") system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.c')} .") system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-common.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.h')} .") system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.c')} .") system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','examples','dr_wav.h')} .") system("cp #{File.join(File.dirname(__FILE__),'..','..','..','examples','dr_wav.h')} .")
@ -12,31 +12,63 @@ extern "C" {
// Backend buffer
//
// buffer type
typedef void * ggml_backend_buffer_type_context_t;
struct ggml_backend_buffer_type_i {
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
// check if tensor data is in host memory
// should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
};
struct ggml_backend_buffer_type {
struct ggml_backend_buffer_type_i iface;
ggml_backend_buffer_type_context_t context;
};
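A minimal usage sketch, assuming only the public counterparts of this vtable (declared in `ggml-backend.h`) and the CPU buffer type; the buffer size is arbitrary:

```c
#include <stdio.h>
#include "ggml-backend.h"

int main(void) {
    // query properties of the CPU buffer type through the public buft_* helpers
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
    printf("buffer type: %s, alignment: %zu\n",
           ggml_backend_buft_name(buft),
           ggml_backend_buft_get_alignment(buft));

    // allocate a 1 MiB buffer from that buffer type and inspect it
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, 1024*1024);
    printf("buffer: %s, size: %zu, host: %d\n",
           ggml_backend_buffer_name(buf),
           ggml_backend_buffer_get_size(buf),
           ggml_backend_buffer_is_host(buf));

    ggml_backend_buffer_free(buf);
    return 0;
}
```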
// buffer
typedef void * ggml_backend_buffer_context_t;

struct ggml_backend_buffer_i {
    void   (*free_buffer)    (ggml_backend_buffer_t buffer);
    void * (*get_base)       (ggml_backend_buffer_t buffer); // get base pointer
    size_t (*get_alloc_size) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
    void   (*init_tensor)    (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
    void   (*free_tensor)    (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
};

struct ggml_backend_buffer {
    struct ggml_backend_buffer_i  iface;
    ggml_backend_t                backend;
    ggml_backend_buffer_context_t context;
    size_t size;
};

GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
        struct ggml_backend           * backend,
        struct ggml_backend_buffer_i    iface,
        ggml_backend_buffer_context_t   context,
        size_t                          size);

struct ggml_backend_buffer_i {
    const char * (*GGML_CALL get_name)   (ggml_backend_buffer_t buffer);
    void         (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
    void *       (*GGML_CALL get_base)   (ggml_backend_buffer_t buffer);
    void         (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
    void         (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    void         (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
    bool         (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
    void         (*GGML_CALL clear)      (ggml_backend_buffer_t buffer, uint8_t value);
    void         (*GGML_CALL reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
};

struct ggml_backend_buffer {
    struct ggml_backend_buffer_i   iface;
    ggml_backend_buffer_type_t     buft;
    ggml_backend_buffer_context_t  context;
    size_t size;
    enum ggml_backend_buffer_usage usage;
};

GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
        ggml_backend_buffer_type_t      buft,
        struct ggml_backend_buffer_i    iface,
        ggml_backend_buffer_context_t   context,
        size_t                          size);
// do not use directly, use ggml_backend_tensor_copy instead
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
// buffer that contains a collection of buffers
GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
//
// Backend
//

@ -44,44 +76,66 @@ extern "C" {

typedef void * ggml_backend_context_t;

struct ggml_backend_i {
    const char * (*get_name)(ggml_backend_t backend);

    void (*free)(ggml_backend_t backend);

    // buffer allocation
    ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);

    // get buffer alignment
    size_t (*get_alignment)(ggml_backend_t backend);

    // tensor data access
    // these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
    void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
    void (*synchronize)     (ggml_backend_t backend);

    // (optional) copy tensor between different backends, allow for single-copy transfers
    void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
    void (*cpy_tensor_to)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);

    // compute graph with a plan
    ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
    void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
    void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

    // compute graph without a plan
    bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

    // check if the backend supports an operation
    bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
};

struct ggml_backend_i {
    const char * (*GGML_CALL get_name)(ggml_backend_t backend);

    void (*GGML_CALL free)(ggml_backend_t backend);

    // buffer allocation
    ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);

    // (optional) asynchronous tensor data access
    void (*GGML_CALL set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
    void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
    bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);

    // (optional) complete all pending operations
    void (*GGML_CALL synchronize)(ggml_backend_t backend);

    // compute graph with a plan (not used currently)
    ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create)(ggml_backend_t backend, const struct ggml_cgraph * cgraph);
    void                      (*GGML_CALL graph_plan_free)  (ggml_backend_t backend, ggml_backend_graph_plan_t plan);

    // compute graph with a plan
    enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
    // compute graph without a plan (async)
    enum ggml_status (*GGML_CALL graph_compute)     (ggml_backend_t backend, struct ggml_cgraph * cgraph);

    // check if the backend supports an operation
    bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
// these should be expensive operations with large batch sizes that may benefit from running on this backend
// even if the weight has to be copied from the CPU temporarily
bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
// (optional) event synchronization
ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
void (*GGML_CALL event_free) (ggml_backend_event_t event);
void (*GGML_CALL event_record) (ggml_backend_event_t event);
void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
};

struct ggml_backend {
    struct ggml_backend_i iface;

    ggml_backend_context_t context;
};

struct ggml_backend {
    ggml_guid_t guid;

    struct ggml_backend_i iface;

    ggml_backend_context_t context;
};
struct ggml_backend_event {
ggml_backend_t backend;
void * context;
};
//
// Backend registry
//
typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
#ifdef __cplusplus
}
#endif
File diff suppressed because it is too large
@ -7,69 +7,123 @@
extern "C" {
#endif
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend_event * ggml_backend_event_t;
typedef struct ggml_backend * ggml_backend_t;
typedef void * ggml_backend_graph_plan_t;
//
// Backend buffer
//

struct ggml_backend_buffer;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;

// buffer type
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
// backend buffer functions
GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void   ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void   ggml_backend_buffer_free_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);

// buffer
enum ggml_backend_buffer_usage {
    GGML_BACKEND_BUFFER_USAGE_ANY = 0,
    GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
};

GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
GGML_API void         ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
//
// Backend
//
struct ggml_backend;
typedef struct ggml_backend * ggml_backend_t;
typedef void * ggml_backend_graph_plan_t;

GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);

GGML_API const char * ggml_backend_name(ggml_backend_t backend);
GGML_API void         ggml_backend_free(ggml_backend_t backend);

GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);

GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);

GGML_API void ggml_backend_tensor_set_async(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);

GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);

GGML_API void ggml_backend_synchronize(ggml_backend_t backend);

GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);

GGML_API void ggml_backend_graph_plan_free   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API bool ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op       (ggml_backend_t backend, const struct ggml_tensor * op);

GGML_API ggml_guid_t  ggml_backend_guid(ggml_backend_t backend);
GGML_API const char * ggml_backend_name(ggml_backend_t backend);
GGML_API void         ggml_backend_free(ggml_backend_t backend);

GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer           (ggml_backend_t backend, size_t size);
GGML_API size_t                     ggml_backend_get_alignment          (ggml_backend_t backend);
GGML_API size_t                     ggml_backend_get_max_size           (ggml_backend_t backend);

GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);

GGML_API GGML_CALL void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);

GGML_API void ggml_backend_synchronize(ggml_backend_t backend);

GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API void                      ggml_backend_graph_plan_free  (ggml_backend_t backend, ggml_backend_graph_plan_t plan);

GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute      (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
// asynchronous copy
// the copy is performed after all the currently queued operations in backend_src
// backend_dst will wait for the copy to complete before performing other operations
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
// events
GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
// //
// CPU backend // CPU backend
// //
GGML_API ggml_backend_t ggml_backend_cpu_init(void); GGML_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend); GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads); GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
// Create a backend buffer from an existing pointer // Create a backend buffer from an existing pointer
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size); GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
#ifdef GGML_USE_CPU_HBM
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif
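A minimal sketch (not part of this diff) of driving the CPU backend end to end with the functions above; it assumes ggml_backend_alloc_ctx_tensors() from ggml-alloc.h and elides error handling:

// sketch: build a tiny graph, run it on the CPU backend, read back the result
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

int main(void) {
    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_backend_cpu_set_n_threads(backend, 4);

    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8 + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // tensor data lives in the backend buffer, not the context
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * c = ggml_add(ctx, a, b);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);

    // allocate all tensors of the context in a single backend buffer (ggml-alloc.h)
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

    const float va[4] = {1, 2, 3, 4};
    const float vb[4] = {10, 20, 30, 40};
    ggml_backend_tensor_set(a, va, 0, sizeof(va));
    ggml_backend_tensor_set(b, vb, 0, sizeof(vb));

    ggml_backend_graph_compute(backend, gf);

    float vc[4];
    ggml_backend_tensor_get(c, vc, 0, sizeof(vc)); // {11, 22, 33, 44}

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
    ggml_backend_free(backend);
    return 0;
}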
//
// Backend registry
//
// The backend registry enumerates all available backends and allows initializing them in a generic way
GGML_API size_t ggml_backend_reg_get_count(void);
GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
GGML_API const char * ggml_backend_reg_get_name(size_t i);
GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size);
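A short sketch of using the registry to pick a backend by name; the "CUDA0" string and the printf fragment are illustrative, the name[:params] format comes from the declaration above:

// list registered backends, then try to initialize one from a string
for (size_t i = 0; i < ggml_backend_reg_get_count(); ++i) {
    printf("backend %zu: %s\n", i, ggml_backend_reg_get_name(i));
}

ggml_backend_t backend = ggml_backend_reg_init_backend_from_str("CUDA0"); // assumed name
if (backend == NULL) {
    backend = ggml_backend_cpu_init(); // fall back to CPU
}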
//
// Backend scheduler
//

@ -83,53 +137,96 @@ extern "C" {
/*
  Example usage:

    // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
    // preferably to run on the same backend as the buffer
    ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

    sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);

    // initialize buffers from a max size graph (optional)
    reserve_graph = build_graph(sched, max_batch_size);

    // manually assign nodes to a backend (optional, should not be needed in most cases)
    struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
    ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);

    ggml_backend_sched_reserve(sched, reserve_graph);

    // compute
    graph = build_graph(sched);
    ggml_backend_sched_graph_compute(sched, graph);

    // if there are graph inputs:
    ggml_backend_sched_reset(sched);
    ggml_backend_sched_alloc_graph(sched, graph);
    ggml_backend_tensor_set(input_tensor, ...);
    ggml_backend_sched_graph_compute(sched, graph);
  }
*/
struct ggml_backend_sched;
typedef struct ggml_backend_sched * ggml_backend_sched_t;

// when ask == true, the scheduler wants to know if the user wants to observe this node
// this allows the scheduler to batch nodes together in order to evaluate them in a single call
//
// when ask == false, the scheduler is passing the node tensor to the user for observation
// if the user returns false, the scheduler will cancel the graph compute
//
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);

// Initialize a backend scheduler
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
GGML_API void                 ggml_backend_sched_free(ggml_backend_sched_t sched);

// Initialize backend buffers from a measure graph
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);

// Get the number of splits of the last graph
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);

GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
// Allocate and compute graph on the backend scheduler
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
// Reset all assignments and allocators - must be called before changing the node backends
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
// Set a callback to be called for each resulting node during graph compute
GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
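A sketch of the two-phase eval callback declared above; the function name observe_mul_mat is illustrative and the fprintf fragment assumes stdio.h:

// first call (ask == true): opt in for the nodes we care about
// second call (ask == false): the node has been computed and its data can be read
static bool observe_mul_mat(struct ggml_tensor * t, bool ask, void * user_data) {
    (void) user_data;
    if (ask) {
        return t->op == GGML_OP_MUL_MAT;
    }
    fprintf(stderr, "computed %s: [%lld, %lld]\n", t->name, (long long) t->ne[0], (long long) t->ne[1]);
    return true; // returning false would cancel the graph compute
}

// ...
ggml_backend_sched_set_eval_callback(sched, observe_mul_mat, NULL);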
//
// Utils
//
struct ggml_backend_graph_copy {
ggml_backend_buffer_t buffer;
struct ggml_context * ctx_allocated;
struct ggml_context * ctx_unallocated;
struct ggml_cgraph * graph;
};
// Copy a graph to a different backend
GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
// Compare the output of two backends
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
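A sketch of comparing two backends node by node with the callback above; backend_cpu, backend_cuda and graph are assumed to exist, and a real check would compare the tensor contents within a tolerance:

static bool check_node(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
    (void) t2;
    (void) user_data;
    fprintf(stderr, "node %d: %s\n", node_index, t1->name); // t1/t2 are the same node on each backend
    return true; // continue with the next node
}

bool ok = ggml_backend_compare_graph_backend(backend_cpu, backend_cuda, graph, check_node, NULL);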
// Tensor initialization
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
#ifdef __cplusplus
}

File diff suppressed because it is too large

View File

@ -17,29 +17,17 @@ extern "C" {
#define GGML_CUDA_MAX_DEVICES 16
// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
GGML_API GGML_CALL void ggml_init_cublas(void);
// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
GGML_API GGML_CALL bool ggml_cublas_loaded(void);
GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size);
GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr);
GGML_API GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
GGML_API GGML_CALL int ggml_cuda_get_device_count(void);
GGML_API GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);

GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);

// device buffer
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);

// split tensor buffer that splits matrices by rows across multiple devices
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);

// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
@ -47,6 +35,9 @@ GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
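A small sketch, using only the functions declared in this header, of picking the CUDA device with the most free memory and creating a backend for it; the fprintf fragment assumes stdio.h:

ggml_backend_t backend = NULL;
int    best_device = 0;
size_t best_free   = 0;

for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
    size_t free, total;
    ggml_backend_cuda_get_device_memory(i, &free, &total);
    if (free > best_free) {
        best_free   = free;
        best_device = i;
    }
}

backend = ggml_backend_cuda_init(best_device);
if (backend == NULL) {
    fprintf(stderr, "failed to initialize CUDA backend\n");
}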
#ifdef __cplusplus
}
#endif

View File

@ -5,6 +5,7 @@
// GGML internal header

#include <assert.h>
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
#include <stddef.h>
#include <stdbool.h>
#include <string.h> // memcpy
@ -18,6 +19,7 @@ extern "C" {
// fall back to the _Static_assert C11 keyword.
// if C99 - static_assert is noop
// ref: https://stackoverflow.com/a/53923785/4039976
#ifndef __cplusplus
#ifndef static_assert
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg)
@ -25,6 +27,7 @@ extern "C" {
#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
#endif
#endif

// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
@ -34,16 +37,17 @@ extern "C" {
#ifndef __F16C__
#define __F16C__
#endif
#endif

// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
#ifndef __SSE3__
#define __SSE3__
#endif
#ifndef __SSSE3__
#define __SSSE3__
#endif
#endif

#undef MIN
#undef MAX

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

// 16-bit float
// on Arm, we use __fp16
@ -56,14 +60,30 @@ extern "C" {
//

#include <arm_neon.h>

typedef __fp16 ggml_fp16_internal_t;

#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)

#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
ggml_fp16_internal_t tmp;
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
return (float)tmp;
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
ggml_fp16_t res;
ggml_fp16_internal_t tmp = f;
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
return res;
}
#else

typedef uint16_t ggml_fp16_internal_t;

#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#else
@ -217,8 +237,7 @@ extern float ggml_table_f32_f16[1 << 16];
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
#if !defined(GGML_FP16_TO_FP32)
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
    uint16_t s;
    memcpy(&s, &f, sizeof(uint16_t));
@ -226,19 +245,23 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
}

#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#endif

#if !defined(GGML_FP32_TO_FP16)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif

#define GGML_HASHTABLE_FULL ((size_t)-1)
#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)

struct ggml_hash_set ggml_hash_set_new(size_t size);

bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key);

// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
size_t ggml_hash_find (const struct ggml_hash_set hash_set, struct ggml_tensor * key);

// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml_tensor * key);

// return index, asserts if table is full
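A sketch of the internal hash set API above; this is an internal header, so the API can change, and node is assumed to be a struct ggml_tensor * the caller already has:

// track visited tensors while walking a graph
struct ggml_hash_set visited = ggml_hash_set_new(64);

if (!ggml_hash_contains(visited, node)) {
    size_t idx = ggml_hash_insert(visited, node); // asserts if the table is full
    (void) idx; // GGML_HASHTABLE_ALREADY_EXISTS is not possible here because of the contains() check
}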

File diff suppressed because it is too large

View File

@ -1,224 +1,133 @@
#pragma once

#define GGML_COMMON_DECL_C
#include "ggml-common.h"

#include "ggml.h"

// GGML internal header

#ifdef __cplusplus
extern "C" {
#define QK4_0 32
typedef struct {
ggml_fp16_t d; // delta
uint8_t qs[QK4_0 / 2]; // nibbles / quants
} block_q4_0;
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
#define QK4_1 32
typedef struct {
ggml_fp16_t d; // delta
ggml_fp16_t m; // min
uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
#define QK5_0 32
typedef struct {
ggml_fp16_t d; // delta
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_0 / 2]; // nibbles / quants
} block_q5_0;
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
#define QK5_1 32
typedef struct {
ggml_fp16_t d; // delta
ggml_fp16_t m; // min
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_1 / 2]; // nibbles / quants
} block_q5_1;
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
#define QK8_0 32
typedef struct {
ggml_fp16_t d; // delta
int8_t qs[QK8_0]; // quants
} block_q8_0;
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
#define QK8_1 32
typedef struct {
float d; // delta
float s; // d * sum(qs[i])
int8_t qs[QK8_1]; // quants
} block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
//
// Super-block quantization structures
//
// Super-block size
#ifdef GGML_QKK_64
#define QK_K 64
#define K_SCALE_SIZE 4
#else
#define QK_K 256
#define K_SCALE_SIZE 12
#endif
// 2-bit quantization
// weight is represented as x = a * q + b
// 16 blocks of 16 elements each
// Effectively 2.5625 bits per weight
typedef struct {
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
uint8_t qs[QK_K/4]; // quants
ggml_fp16_t d; // super-block scale for quantized scales
ggml_fp16_t dmin; // super-block scale for quantized mins
} block_q2_K;
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
// 3-bit quantization
// weight is represented as x = a * q
// 16 blocks of 16 elements each
// Effectively 3.4375 bits per weight
#ifdef GGML_QKK_64
typedef struct {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
uint8_t scales[2];
ggml_fp16_t d; // super-block scale
} block_q3_K;
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
#else
typedef struct {
uint8_t hmask[QK_K/8]; // quants - high bit
uint8_t qs[QK_K/4]; // quants - low 2 bits
uint8_t scales[12]; // scales, quantized with 6 bits
ggml_fp16_t d; // super-block scale
} block_q3_K;
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
#endif
// 4-bit quantization
// 8 blocks of 32 elements each
// weight is represented as x = a * q + b
// Effectively 4.5 bits per weight
#ifdef GGML_QKK_64
typedef struct {
ggml_fp16_t d[2]; // super-block scales/mins
uint8_t scales[2]; // 4-bit block scales/mins
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
#else
typedef struct {
ggml_fp16_t d; // super-block scale for quantized scales
ggml_fp16_t dmin; // super-block scale for quantized mins
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4-bit quants
} block_q4_K;
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
#endif
// 5-bit quantization
// 8 blocks of 32 elements each
// weight is represented as x = a * q + b
// Effectively 5.5 bits per weight
#ifdef GGML_QKK_64
typedef struct {
ggml_fp16_t d; // super-block scale
int8_t scales[QK_K/16]; // 8-bit block scales
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
} block_q5_K;
static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
#else
typedef struct {
ggml_fp16_t d; // super-block scale for quantized scales
ggml_fp16_t dmin; // super-block scale for quantized mins
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
} block_q5_K;
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
#endif
// 6-bit quantization
// weight is represented as x = a * q
// 16 blocks of 16 elements each
// Effectively 6.5625 bits per weight
typedef struct {
uint8_t ql[QK_K/2]; // quants, lower 4 bits
uint8_t qh[QK_K/4]; // quants, upper 2 bits
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
ggml_fp16_t d; // super-block scale
} block_q6_K;
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
// This is only used for intermediate quantization and dot products
typedef struct {
float d; // delta
int8_t qs[QK_K]; // quants
int16_t bsums[QK_K/16]; // sum of quants in groups of 16
} block_q8_K;
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
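The "effectively N bits per weight" figures quoted in the comments follow directly from the block layouts; a minimal illustration for q4_0 (the other formats work the same way):

// q4_0 packs QK4_0 = 32 weights into one fp16 delta plus 16 bytes of nibbles:
//   sizeof(block_q4_0) = 2 + 16 = 18 bytes  ->  18 * 8 / 32 = 4.5 bits per weight
static inline double bits_per_weight_q4_0(void) {
    return 8.0 * sizeof(block_q4_0) / QK4_0;
}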
// Quantization
void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);

void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);

void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl  * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs  * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_s_reference  (const float * GGML_RESTRICT x, block_iq3_s   * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_s_reference  (const float * GGML_RESTRICT x, block_iq2_s   * GGML_RESTRICT y, int64_t k);

void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
// Dequantization
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);

void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
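A sketch of round-tripping a row of floats through q8_0 with the functions above; the helper name is illustrative and k is assumed to be a multiple of QK8_0 (32):

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

static void roundtrip_q8_0(const float * x, int64_t k) {
    block_q8_0 * q = malloc((k / QK8_0) * sizeof(block_q8_0));
    float      * y = malloc(k * sizeof(float));

    quantize_row_q8_0_reference(x, q, k);  // float -> q8_0 blocks
    dequantize_row_q8_0(q, y, k);          // q8_0 blocks -> float

    double max_err = 0.0;
    for (int64_t i = 0; i < k; ++i) {
        max_err = fmax(max_err, fabs((double) x[i] - (double) y[i]));
    }
    printf("q8_0 max abs error: %g\n", max_err);

    free(q);
    free(y);
}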
// Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
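A sketch of quantizing a full nrows x n_per_row matrix with one of the quantize_* functions above; the helper name is illustrative, imatrix may be NULL, ggml_row_size() comes from ggml.h, and n_per_row is assumed to be a multiple of QK_K for Q4_K:

#include <stdlib.h>

static size_t quantize_matrix_q4_K(const float * src, void ** out, int64_t nrows, int64_t n_per_row) {
    // size of one quantized row in bytes for this type
    const size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
    void * dst = malloc(nrows * row_size);

    // pass NULL when no importance matrix is available
    const size_t written = quantize_q4_K(src, dst, nrows, n_per_row, /*imatrix=*/NULL);

    *out = dst;
    return written; // number of bytes actually written
}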
void iq2xs_init_impl(enum ggml_type type);
void iq2xs_free_impl(enum ggml_type type);
void iq3xs_init_impl(int grid_size);
void iq3xs_free_impl(int grid_size);
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,49 @@
//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_SYCL_MAX_DEVICES 48
#define GGML_SYCL_NAME "SYCL"
// backend API
GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
// device buffer
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
// split tensor buffer that splits matrices by rows across multiple devices
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
// TODO: these are temporary
// ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index);
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();
// SYCL doesn't support registering host memory, keep here for reference
// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
// GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
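A brief sketch of initializing the SYCL backend with the declarations above; device 0 is an arbitrary choice for illustration:

ggml_backend_sycl_print_sycl_devices();

if (ggml_backend_sycl_get_device_count() > 0) {
    ggml_backend_t backend = ggml_backend_sycl_init(0);
    // ... build and compute graphs, then ggml_backend_free(backend);
}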
#ifdef __cplusplus
}
#endif

View File

@ -10,17 +10,7 @@ extern "C" {
#define GGML_VK_NAME "Vulkan"
#define GGML_VK_MAX_DEVICES 16

GGML_API void ggml_vk_instance_init(void);
GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
#ifdef GGML_VULKAN_CHECK_RESULTS
void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
#endif
GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
GGML_API void ggml_vk_free_cpu_assist(void);
// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);

View File

@ -311,12 +311,6 @@ static VALUE ruby_whisper_params_get_split_on_word(VALUE self) {
static VALUE ruby_whisper_params_set_split_on_word(VALUE self, VALUE value) {
  BOOL_PARAMS_SETTER(self, split_on_word, value)
}
static VALUE ruby_whisper_params_get_speed_up(VALUE self) {
BOOL_PARAMS_GETTER(self, speed_up)
}
static VALUE ruby_whisper_params_set_speed_up(VALUE self, VALUE value) {
BOOL_PARAMS_SETTER(self, speed_up, value)
}
static VALUE ruby_whisper_params_get_diarize(VALUE self) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
@ -408,8 +402,6 @@ void Init_whisper() {
rb_define_method(cParams, "token_timestamps=", ruby_whisper_params_set_token_timestamps, 1); rb_define_method(cParams, "token_timestamps=", ruby_whisper_params_set_token_timestamps, 1);
rb_define_method(cParams, "split_on_word", ruby_whisper_params_get_split_on_word, 0); rb_define_method(cParams, "split_on_word", ruby_whisper_params_get_split_on_word, 0);
rb_define_method(cParams, "split_on_word=", ruby_whisper_params_set_split_on_word, 1); rb_define_method(cParams, "split_on_word=", ruby_whisper_params_set_split_on_word, 1);
rb_define_method(cParams, "speed_up", ruby_whisper_params_get_speed_up, 0);
rb_define_method(cParams, "speed_up=", ruby_whisper_params_set_speed_up, 1);
rb_define_method(cParams, "diarize", ruby_whisper_params_get_diarize, 0); rb_define_method(cParams, "diarize", ruby_whisper_params_get_diarize, 0);
rb_define_method(cParams, "diarize=", ruby_whisper_params_set_diarize, 1); rb_define_method(cParams, "diarize=", ruby_whisper_params_set_diarize, 1);

View File

@ -117,13 +117,6 @@ class TestWhisper < Test::Unit::TestCase
assert !@params.split_on_word
  end
def test_speed_up
@params.speed_up = true
assert @params.speed_up
@params.speed_up = false
assert !@params.speed_up
end
def test_whisper
    @whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
    params = Whisper::Params.new

View File

@ -0,0 +1,28 @@
Gem::Specification.new do |s|
s.name = "whispercpp"
s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
s.version = '1.3.0'
s.date = '2024-05-14'
s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby}
s.email = 'todd.fisher@gmail.com'
s.extra_rdoc_files = ['LICENSE', 'README.md']
s.files = ["LICENSE", "README.md", "Rakefile", "ext/extconf.rb", "ext/ggml.c", "ext/ruby_whisper.cpp", "ext/whisper.cpp", "ext/dr_wav.h", "ext/ggml.h", "ext/ruby_whisper.h", "ext/whisper.h"]
#### Load-time details
s.require_paths = ['lib','ext']
s.summary = %q{Ruby whisper.cpp bindings}
s.test_files = ["tests/test_whisper.rb"]
s.extensions << 'ext/extconf.rb'
#### Documentation and testing.
s.homepage = 'https://github.com/ggerganov/whisper.cpp'
s.rdoc_options = ['--main', '../../README.md']
s.platform = Gem::Platform::RUBY
s.licenses = ['MIT']
end

View File

@ -1,54 +0,0 @@
# Add new build types
# ReleaseGG - Release with enabled asserts
SET(CMAKE_CXX_FLAGS_RELEASEGG
"-O3"
CACHE STRING "Flags used by the c++ compiler during release builds with enabled asserts."
FORCE )
SET(CMAKE_C_FLAGS_RELEASEGG
"-O3"
CACHE STRING "Flags used by the compiler during release builds with enabled asserts."
FORCE )
SET(CMAKE_EXE_LINKER_FLAGS_RELEASEGG
""
CACHE STRING "Flags used for linking binaries during release builds with enabled asserts."
FORCE )
SET(CMAKE_SHARED_LINKER_FLAGS_RELEASEGG
""
CACHE STRING "Flags used by the shared libraries linker during release builds with enabled asserts."
FORCE )
MARK_AS_ADVANCED(
CMAKE_CXX_FLAGS_RELEASEGG
CMAKE_C_FLAGS_RELEASEGG
CMAKE_EXE_LINKER_FLAGS_RELEASEGG
CMAKE_SHARED_LINKER_FLAGS_RELEASEGG )
# RelWithDebInfoGG - RelWithDebInfo with enabled asserts
SET(CMAKE_CXX_FLAGS_RELWITHDEBINFOGG
"-O2 -g"
CACHE STRING "Flags used by the c++ compiler during release builds with debug symbols and enabled asserts."
FORCE )
SET(CMAKE_C_FLAGS_RELWITHDEBINFOGG
"-O2 -g"
CACHE STRING "Flags used by the compiler during release builds with debug symbols and enabled asserts."
FORCE )
SET(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG
""
CACHE STRING "Flags used for linking binaries during release builds with debug symbols and enabled asserts."
FORCE )
SET(CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG
""
CACHE STRING "Flags used by the shared libraries linker during release builds with debug symbols and enabled asserts."
FORCE )
MARK_AS_ADVANCED(
CMAKE_CXX_FLAGS_RELWITHDEBINFOGG
CMAKE_C_FLAGS_RELWITHDEBINFOGG
CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG
CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG )
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo" "ReleaseGG" "RelWithDebInfoGG")
endif()

cmake/FindFFmpeg.cmake (new file, 163 lines)
View File

@ -0,0 +1,163 @@
# From
# https://github.com/snikulov/cmake-modules/blob/master/FindFFmpeg.cmake
#
# vim: ts=2 sw=2
# - Try to find the required ffmpeg components(default: AVFORMAT, AVUTIL, AVCODEC)
#
# Once done this will define
# FFMPEG_FOUND - System has the all required components.
# FFMPEG_INCLUDE_DIRS - Include directory necessary for using the required components headers.
# FFMPEG_LIBRARIES - Link these to use the required ffmpeg components.
# FFMPEG_DEFINITIONS - Compiler switches required for using the required ffmpeg components.
#
# For each of the components it will additionally set.
# - AVCODEC
# - AVDEVICE
# - AVFORMAT
# - AVFILTER
# - AVUTIL
# - POSTPROC
# - SWSCALE
# the following variables will be defined
# <component>_FOUND - System has <component>
# <component>_INCLUDE_DIRS - Include directory necessary for using the <component> headers
# <component>_LIBRARIES - Link these to use <component>
# <component>_DEFINITIONS - Compiler switches required for using <component>
# <component>_VERSION - The components version
#
# Copyright (c) 2006, Matthias Kretz, <kretz@kde.org>
# Copyright (c) 2008, Alexander Neundorf, <neundorf@kde.org>
# Copyright (c) 2011, Michael Jansen, <kde@michael-jansen.biz>
#
# Redistribution and use is allowed according to the terms of the BSD license.
# For details see the accompanying COPYING-CMAKE-SCRIPTS file.
include(FindPackageHandleStandardArgs)
# The default components were taken from a survey over other FindFFMPEG.cmake files
if (NOT FFmpeg_FIND_COMPONENTS)
set(FFmpeg_FIND_COMPONENTS AVFORMAT AVCODEC AVUTIL SWRESAMPLE)
endif()
#
### Macro: set_component_found
#
# Marks the given component as found if both *_LIBRARIES AND *_INCLUDE_DIRS is present.
#
macro(set_component_found _component )
if (${_component}_LIBRARIES AND ${_component}_INCLUDE_DIRS)
message(DEBUG " - ${_component} found.")
set(${_component}_FOUND TRUE)
else ()
message(DEBUG " - ${_component} not found.")
endif ()
endmacro()
#
### Macro: find_component
#
# Checks for the given component by invoking pkgconfig and then looking up the libraries and
# include directories.
#
macro(find_component _component _pkgconfig _library _header)
if (NOT WIN32)
# use pkg-config to get the directories and then use these values
# in the FIND_PATH() and FIND_LIBRARY() calls
find_package(PkgConfig)
if (PKG_CONFIG_FOUND)
pkg_check_modules(PC_${_component} ${_pkgconfig})
message(STATUS "Pkgconfig found: ${PC_${_component}_INCLUDEDIR}")
message(STATUS "Pkgconfig found: ${PC_${_component}_INCLUDE_DIRS}")
message(STATUS "${PC_${_component}_CFLAGS}")
endif ()
endif (NOT WIN32)
find_path(${_component}_INCLUDE_DIRS ${_header}
HINTS
${PC_${_component}_INCLUDEDIR}
${PC_${_component}_INCLUDE_DIRS}
PATH_SUFFIXES
ffmpeg
)
# CMake's default is to search first for shared libraries and then for static libraries.
# Todo later: add option to prefer static libs over dynamic:
find_library(${_component}_LIBRARIES NAMES ${_library} lib${_library}.a
HINTS
${PC_${_component}_LIBDIR}
${PC_${_component}_LIBRARY_DIRS}
)
set(${_component}_DEFINITIONS ${PC_${_component}_CFLAGS_OTHER} CACHE STRING "The ${_component} CFLAGS.")
set(${_component}_VERSION ${PC_${_component}_VERSION} CACHE STRING "The ${_component} version number.")
set_component_found(${_component})
mark_as_advanced(
${_component}_INCLUDE_DIRS
${_component}_LIBRARIES
${_component}_DEFINITIONS
${_component}_VERSION)
endmacro()
# Check for cached results. If there are skip the costly part.
if (NOT FFMPEG_LIBRARIES)
# Check for all possible component.
find_component(AVCODEC libavcodec avcodec libavcodec/avcodec.h)
find_component(AVFORMAT libavformat avformat libavformat/avformat.h)
find_component(AVDEVICE libavdevice avdevice libavdevice/avdevice.h)
#find_component(AVRESAMPLE libavresample avresample libavresample/avresample.h) # old name for swresample
find_component(AVUTIL libavutil avutil libavutil/avutil.h)
find_component(AVFILTER libavfilter avfilter libavfilter/avfilter.h)
find_component(SWSCALE libswscale swscale libswscale/swscale.h)
find_component(POSTPROC libpostproc postproc libpostproc/postprocess.h)
find_component(SWRESAMPLE libswresample swresample libswresample/swresample.h)
# Check if the required components were found and add their stuff to the FFMPEG_* vars.
foreach (_component ${FFmpeg_FIND_COMPONENTS})
if (${_component}_FOUND)
# message(STATUS "Required component ${_component} present.")
set(FFMPEG_LIBRARIES ${FFMPEG_LIBRARIES} ${${_component}_LIBRARIES})
set(FFMPEG_DEFINITIONS ${FFMPEG_DEFINITIONS} ${${_component}_DEFINITIONS})
list(APPEND FFMPEG_INCLUDE_DIRS ${${_component}_INCLUDE_DIRS})
else ()
# message(STATUS "Required component ${_component} missing.")
endif ()
endforeach ()
# Build the include path with duplicates removed.
if (FFMPEG_INCLUDE_DIRS)
list(REMOVE_DUPLICATES FFMPEG_INCLUDE_DIRS)
endif ()
# cache the vars.
set(FFMPEG_INCLUDE_DIRS ${FFMPEG_INCLUDE_DIRS} CACHE STRING "The FFmpeg include directories." FORCE)
set(FFMPEG_LIBRARIES ${FFMPEG_LIBRARIES} CACHE STRING "The FFmpeg libraries." FORCE)
set(FFMPEG_DEFINITIONS ${FFMPEG_DEFINITIONS} CACHE STRING "The FFmpeg cflags." FORCE)
mark_as_advanced(FFMPEG_INCLUDE_DIRS
FFMPEG_LIBRARIES
FFMPEG_DEFINITIONS)
endif ()
# Now set the noncached _FOUND vars for the components.
# whisper.cpp does not need SWSCALE
foreach (_component AVCODEC AVDEVICE AVFORMAT AVRESAMPLE AVUTIL POSTPROCESS)
set_component_found(${_component})
endforeach ()
# Compile the list of required vars
set(_FFmpeg_REQUIRED_VARS FFMPEG_LIBRARIES FFMPEG_INCLUDE_DIRS)
foreach (_component ${FFmpeg_FIND_COMPONENTS})
list(APPEND _FFmpeg_REQUIRED_VARS ${_component}_LIBRARIES ${_component}_INCLUDE_DIRS)
endforeach ()
# Give a nice error message if some of the required vars are missing.
find_package_handle_standard_args(FFmpeg DEFAULT_MSG ${_FFmpeg_REQUIRED_VARS})

cmake/build-info.cmake (new file, 58 lines)
View File

@ -0,0 +1,58 @@
set(BUILD_NUMBER 0)
set(BUILD_COMMIT "unknown")
set(BUILD_COMPILER "unknown")
set(BUILD_TARGET "unknown")
# Look for git
find_package(Git)
if(NOT Git_FOUND)
find_program(GIT_EXECUTABLE NAMES git git.exe)
if(GIT_EXECUTABLE)
set(Git_FOUND TRUE)
message(STATUS "Found Git: ${GIT_EXECUTABLE}")
else()
message(WARNING "Git not found. Build info will not be accurate.")
endif()
endif()
# Get the commit count and hash
if(Git_FOUND)
execute_process(
COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE HEAD
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE RES
)
if (RES EQUAL 0)
set(BUILD_COMMIT ${HEAD})
endif()
execute_process(
COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE COUNT
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE RES
)
if (RES EQUAL 0)
set(BUILD_NUMBER ${COUNT})
endif()
endif()
if(MSVC)
set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
else()
execute_process(
COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER}
OUTPUT_VARIABLE OUT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
set(BUILD_COMPILER ${OUT})
execute_process(
COMMAND ${CMAKE_C_COMPILER} -dumpmachine
OUTPUT_VARIABLE OUT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
set(BUILD_TARGET ${OUT})
endif()

View File

@ -0,0 +1,65 @@
set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
set(GGML_BLAS @GGML_BLAS@)
set(GGML_CUDA @GGML_CUDA@)
set(GGML_METAL @GGML_METAL@)
set(GGML_HIPBLAS @GGML_HIPBLAS@)
set(GGML_ACCELERATE @GGML_ACCELERATE@)
@PACKAGE_INIT@
set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
# Ensure transient dependencies satisfied
find_package(Threads REQUIRED)
if (APPLE AND GGML_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
endif()
if (GGML_BLAS)
find_package(BLAS REQUIRED)
endif()
if (GGML_CUDA)
find_package(CUDAToolkit REQUIRED)
endif()
if (GGML_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
endif()
if (GGML_HIPBLAS)
find_package(hip REQUIRED)
find_package(hipblas REQUIRED)
find_package(rocblas REQUIRED)
endif()
find_library(llama_LIBRARY llama
REQUIRED
HINTS ${LLAMA_LIB_DIR})
set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
set(_llama_transient_defines "@LLAMA_TRANSIENT_DEFINES@")
add_library(llama UNKNOWN IMPORTED)
set_target_properties(llama
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
IMPORTED_LOCATION "${llama_LIBRARY}"
INTERFACE_COMPILE_FEATURES cxx_std_11
POSITION_INDEPENDENT_CODE ON )
check_required_components(Llama)

cmake/whisper.pc.in (new file, 10 lines)
View File

@ -0,0 +1,10 @@
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=${prefix}
libdir=${exec_prefix}/lib
includedir=${prefix}/include
Name: whisper
Description: Port of OpenAI's Whisper model in C/C++
Version: @PROJECT_VERSION@
Libs: -L${libdir} -lwhisper
Cflags: -I${includedir}

View File

@ -11,7 +11,7 @@ if (WHISPER_SDL2)
    string(STRIP "${SDL2_LIBRARIES}" SDL2_LIBRARIES)
    message(STATUS "SDL2_INCLUDE_DIRS = ${SDL2_INCLUDE_DIRS}")
    message(STATUS "SDL2_LIBRARIES    = ${SDL2_LIBRARIES}")
endif()
if (WHISPER_CLBLAST) if (WHISPER_CLBLAST)
@ -22,19 +22,51 @@ endif()
set(TARGET common) set(TARGET common)
unset(COMMON_EXTRA_LIBS)
if (WHISPER_FFMPEG)
# As of CMake 3.27, there is no official FindFFmpeg support in CMake itself.
# Consequently, we added a FindFFmpeg.cmake script to the cmake subfolder:
# whisper.cpp does not need the full ffmpeg libs, just AVFORMAT AVCODEC AVUTIL SWRESAMPLE
# libswresample performs highly optimized audio resampling, rematrixing and sample format conversion operations
# libavcodec provides a generic encoding/decoding framework and contains multiple decoders and encoders for audio, video and subtitle streams, and several bitstream filters.
# libavformat provides a generic framework for multiplexing and demultiplexing (muxing and demuxing) audio, video and subtitle streams.
find_package(FFmpeg REQUIRED)
if (NOT ${FFMPEG_FOUND})
message(FATAL_ERROR "Cannot find ffmpeg libs/headers")
endif()
message(STATUS "Found ffmpeg libs: ${FFMPEG_LIBRARIES}")
message(STATUS "Found ffmpeg headers in: ${FFMPEG_INCLUDE_DIRS}")
message(STATUS "ffmpeg definitions: ${FFMPEG_DEFINITIONS}")
message(STATUS "Found avformat ${AVFORMAT_VERSION}")
include_directories(${FFMPEG_INCLUDE_DIRS})
add_compile_definitions(WHISPER_FFMPEG)
list(APPEND COMMON_EXTRA_LIBS ${FFMPEG_LIBRARIES})
set(COMMON_SOURCES_FFMPEG ffmpeg-transcode.cpp)
endif()
add_library(${TARGET} STATIC add_library(${TARGET} STATIC
common.h common.h
common.cpp common.cpp
common-ggml.h common-ggml.h
common-ggml.cpp common-ggml.cpp
grammar-parser.h
grammar-parser.cpp grammar-parser.cpp
${COMMON_SOURCES_FFMPEG}
) )
include(DefaultTargetOptions) include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE whisper) target_link_libraries(${TARGET} PRIVATE whisper ${COMMON_EXTRA_LIBS})
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
if (WHISPER_SDL2) if (WHISPER_SDL2)
# common-sdl # common-sdl
@ -48,34 +80,67 @@ if (WHISPER_SDL2)
include(DefaultTargetOptions) include(DefaultTargetOptions)
target_include_directories(${TARGET} PUBLIC ${SDL2_INCLUDE_DIRS}) target_include_directories(${TARGET} PUBLIC ${SDL2_INCLUDE_DIRS})
target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES}) target_link_libraries (${TARGET} PRIVATE ${SDL2_LIBRARIES})
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
endif() endif()
# add json lib
add_library(json_cpp INTERFACE)
target_include_directories(json_cpp INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
# examples # examples
include_directories(${CMAKE_CURRENT_SOURCE_DIR}) include_directories(${CMAKE_CURRENT_SOURCE_DIR})
if (EMSCRIPTEN) if (EMSCRIPTEN)
add_subdirectory(whisper.wasm) add_subdirectory(whisper.wasm)
set_target_properties(libmain PROPERTIES FOLDER "libs")
add_subdirectory(stream.wasm) add_subdirectory(stream.wasm)
set_target_properties(libstream PROPERTIES FOLDER "libs")
add_subdirectory(command.wasm) add_subdirectory(command.wasm)
set_target_properties(libcommand PROPERTIES FOLDER "libs")
add_subdirectory(talk.wasm) add_subdirectory(talk.wasm)
set_target_properties(libtalk PROPERTIES FOLDER "libs")
add_subdirectory(bench.wasm) add_subdirectory(bench.wasm)
set_target_properties(libbench PROPERTIES FOLDER "libs")
elseif(CMAKE_JS_VERSION) elseif(CMAKE_JS_VERSION)
add_subdirectory(addon.node) add_subdirectory(addon.node)
set_target_properties(addon.node PROPERTIES FOLDER "examples")
else() else()
add_subdirectory(main) add_subdirectory(main)
set_target_properties(main PROPERTIES FOLDER "examples")
if (WHISPER_SDL2)
add_subdirectory(stream) add_subdirectory(stream)
set_target_properties(stream PROPERTIES FOLDER "examples")
endif (WHISPER_SDL2)
add_subdirectory(server) add_subdirectory(server)
set_target_properties(server PROPERTIES FOLDER "examples")
if (WHISPER_SDL2)
add_subdirectory(command) add_subdirectory(command)
set_target_properties(command PROPERTIES FOLDER "examples")
endif (WHISPER_SDL2)
add_subdirectory(bench) add_subdirectory(bench)
set_target_properties(bench PROPERTIES FOLDER "examples")
add_subdirectory(quantize) add_subdirectory(quantize)
set_target_properties(quantize PROPERTIES FOLDER "examples")
if (WHISPER_SDL2)
add_subdirectory(talk) add_subdirectory(talk)
set_target_properties(talk PROPERTIES FOLDER "examples")
add_subdirectory(talk-llama) add_subdirectory(talk-llama)
set_target_properties(talk-llama PROPERTIES FOLDER "examples")
add_subdirectory(lsp) add_subdirectory(lsp)
set_target_properties(lsp PROPERTIES FOLDER "examples")
if (GGML_SYCL)
add_subdirectory(sycl)
set_target_properties(sycl PROPERTIES FOLDER "examples")
endif()
endif (WHISPER_SDL2)
endif() endif()
add_subdirectory(wchess) if (WHISPER_SDL2)
add_subdirectory(wchess)
set_target_properties(wchess PROPERTIES FOLDER "examples")
endif (WHISPER_SDL2)

View File

@ -1,4 +1,4 @@
set(TARGET whisper-addon) set(TARGET addon.node)
# Base settings # Base settings
#================================================================== #==================================================================

View File

@ -14,14 +14,14 @@ npm install
Make sure it is in the project root directory and compiled with make-js. Make sure it is in the project root directory and compiled with make-js.
```shell ```shell
npx cmake-js compile -T whisper-addon -B Release npx cmake-js compile -T addon.node -B Release
``` ```
For Electron addon and cmake-js options, you can see [cmake-js](https://github.com/cmake-js/cmake-js) and make very few configuration changes. For Electron addon and cmake-js options, you can see [cmake-js](https://github.com/cmake-js/cmake-js) and make very few configuration changes.
> Such as appointing special cmake path: > Such as appointing special cmake path:
> ```shell > ```shell
> npx cmake-js compile -c 'xxx/cmake' -T whisper-addon -B Release > npx cmake-js compile -c 'xxx/cmake' -T addon.node -B Release
> ``` > ```
## Run ## Run

View File

@ -1,7 +1,7 @@
const path = require("path"); const path = require("path");
const { whisper } = require(path.join( const { whisper } = require(path.join(
__dirname, __dirname,
"../../../build/Release/whisper-addon" "../../../build/Release/addon.node"
)); ));
const { promisify } = require("util"); const { promisify } = require("util");
@ -12,6 +12,12 @@ const whisperParamsMock = {
model: path.join(__dirname, "../../../models/ggml-base.en.bin"), model: path.join(__dirname, "../../../models/ggml-base.en.bin"),
fname_inp: path.join(__dirname, "../../../samples/jfk.wav"), fname_inp: path.join(__dirname, "../../../samples/jfk.wav"),
use_gpu: true, use_gpu: true,
flash_attn: false,
no_prints: true,
comma_in_time: false,
translate: true,
no_timestamps: false,
audio_ctx: 0,
}; };
describe("Run whisper.node", () => { describe("Run whisper.node", () => {

View File

@ -19,12 +19,12 @@ struct whisper_params {
int32_t max_len = 0; int32_t max_len = 0;
int32_t best_of = 5; int32_t best_of = 5;
int32_t beam_size = -1; int32_t beam_size = -1;
int32_t audio_ctx = 0;
float word_thold = 0.01f; float word_thold = 0.01f;
float entropy_thold = 2.4f; float entropy_thold = 2.4f;
float logprob_thold = -1.0f; float logprob_thold = -1.0f;
bool speed_up = false;
bool translate = false; bool translate = false;
bool diarize = false; bool diarize = false;
bool output_txt = false; bool output_txt = false;
@ -36,7 +36,10 @@ struct whisper_params {
bool print_colors = false; bool print_colors = false;
bool print_progress = false; bool print_progress = false;
bool no_timestamps = false; bool no_timestamps = false;
bool no_prints = false;
bool use_gpu = true; bool use_gpu = true;
bool flash_attn = false;
bool comma_in_time = true;
std::string language = "en"; std::string language = "en";
std::string prompt; std::string prompt;
@ -44,6 +47,8 @@ struct whisper_params {
std::vector<std::string> fname_inp = {}; std::vector<std::string> fname_inp = {};
std::vector<std::string> fname_out = {}; std::vector<std::string> fname_out = {};
std::vector<float> pcmf32 = {}; // mono-channel F32 PCM
}; };
struct whisper_print_user_data { struct whisper_print_user_data {
@ -52,27 +57,6 @@ struct whisper_print_user_data {
const std::vector<std::vector<float>> * pcmf32s; const std::vector<std::vector<float>> * pcmf32s;
}; };
// 500 -> 00:05.000
// 6000 -> 01:00.000
std::string to_timestamp(int64_t t, bool comma = false) {
int64_t msec = t * 10;
int64_t hr = msec / (1000 * 60 * 60);
msec = msec - hr * (1000 * 60 * 60);
int64_t min = msec / (1000 * 60);
msec = msec - min * (1000 * 60);
int64_t sec = msec / 1000;
msec = msec - sec * 1000;
char buf[32];
snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
return std::string(buf);
}
int timestamp_to_sample(int64_t t, int n_samples) {
return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
}
void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data) { void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data) {
const auto & params = *((whisper_print_user_data *) user_data)->params; const auto & params = *((whisper_print_user_data *) user_data)->params;
const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s; const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;
@ -104,8 +88,8 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
if (params.diarize && pcmf32s.size() == 2) { if (params.diarize && pcmf32s.size() == 2) {
const int64_t n_samples = pcmf32s[0].size(); const int64_t n_samples = pcmf32s[0].size();
const int64_t is0 = timestamp_to_sample(t0, n_samples); const int64_t is0 = timestamp_to_sample(t0, n_samples, WHISPER_SAMPLE_RATE);
const int64_t is1 = timestamp_to_sample(t1, n_samples); const int64_t is1 = timestamp_to_sample(t1, n_samples, WHISPER_SAMPLE_RATE);
double energy0 = 0.0f; double energy0 = 0.0f;
double energy1 = 0.0f; double energy1 = 0.0f;
@ -141,9 +125,15 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
} }
} }
void cb_log_disable(enum ggml_log_level, const char *, void *) {}
int run(whisper_params &params, std::vector<std::vector<std::string>> &result) { int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
if (params.fname_inp.empty()) { if (params.no_prints) {
fprintf(stderr, "error: no input files specified\n"); whisper_log_set(cb_log_disable, NULL);
}
if (params.fname_inp.empty() && params.pcmf32.empty()) {
fprintf(stderr, "error: no input files or audio buffer specified\n");
return 2; return 2;
} }
@ -156,6 +146,7 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
struct whisper_context_params cparams = whisper_context_default_params(); struct whisper_context_params cparams = whisper_context_default_params();
cparams.use_gpu = params.use_gpu; cparams.use_gpu = params.use_gpu;
cparams.flash_attn = params.flash_attn;
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
if (ctx == nullptr) { if (ctx == nullptr) {
@ -163,6 +154,14 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
return 3; return 3;
} }
// if params.pcmf32 is provided, set params.fname_inp to "buffer"
// this is simpler than further modifications in the code
if (!params.pcmf32.empty()) {
fprintf(stderr, "info: using audio buffer as input\n");
params.fname_inp.clear();
params.fname_inp.emplace_back("buffer");
}
for (int f = 0; f < (int) params.fname_inp.size(); ++f) { for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
const auto fname_inp = params.fname_inp[f]; const auto fname_inp = params.fname_inp[f];
const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f]; const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
@ -170,20 +169,25 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
std::vector<float> pcmf32; // mono-channel F32 PCM std::vector<float> pcmf32; // mono-channel F32 PCM
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) { // read the input audio file if params.pcmf32 is not provided
fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str()); if (params.pcmf32.empty()) {
continue; if (!::read_wav(fname_inp, pcmf32, pcmf32s, params.diarize)) {
fprintf(stderr, "error: failed to read WAV file '%s'\n", fname_inp.c_str());
continue;
}
} else {
pcmf32 = params.pcmf32;
} }
// print system information // print system information
{ if (!params.no_prints) {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info()); params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
} }
// print some info about the processing // print some info about the processing
{ if (!params.no_prints) {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
if (!whisper_is_multilingual(ctx)) { if (!whisper_is_multilingual(ctx)) {
if (params.language != "en" || params.translate) { if (params.language != "en" || params.translate) {
@ -192,12 +196,13 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__); fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
} }
} }
fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d ...\n", fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d, audio_ctx = %d ...\n",
__func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE, __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
params.n_threads, params.n_processors, params.n_threads, params.n_processors,
params.language.c_str(), params.language.c_str(),
params.translate ? "translate" : "transcribe", params.translate ? "translate" : "transcribe",
params.no_timestamps ? 0 : 1); params.no_timestamps ? 0 : 1,
params.audio_ctx);
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
@ -224,14 +229,15 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
wparams.entropy_thold = params.entropy_thold; wparams.entropy_thold = params.entropy_thold;
wparams.logprob_thold = params.logprob_thold; wparams.logprob_thold = params.logprob_thold;
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len; wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.greedy.best_of = params.best_of; wparams.greedy.best_of = params.best_of;
wparams.beam_search.beam_size = params.beam_size; wparams.beam_search.beam_size = params.beam_size;
wparams.initial_prompt = params.prompt.c_str(); wparams.initial_prompt = params.prompt.c_str();
wparams.no_timestamps = params.no_timestamps;
whisper_print_user_data user_data = { &params, &pcmf32s }; whisper_print_user_data user_data = { &params, &pcmf32s };
// this callback is called on each new segment // this callback is called on each new segment
@ -267,8 +273,8 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
const int64_t t0 = whisper_full_get_segment_t0(ctx, i); const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i); const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
result[i].emplace_back(to_timestamp(t0, true)); result[i].emplace_back(to_timestamp(t0, params.comma_in_time));
result[i].emplace_back(to_timestamp(t1, true)); result[i].emplace_back(to_timestamp(t1, params.comma_in_time));
result[i].emplace_back(text); result[i].emplace_back(text);
} }
@ -319,11 +325,33 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
std::string model = whisper_params.Get("model").As<Napi::String>(); std::string model = whisper_params.Get("model").As<Napi::String>();
std::string input = whisper_params.Get("fname_inp").As<Napi::String>(); std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
bool use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>(); bool use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>();
bool flash_attn = whisper_params.Get("flash_attn").As<Napi::Boolean>();
bool no_prints = whisper_params.Get("no_prints").As<Napi::Boolean>();
bool no_timestamps = whisper_params.Get("no_timestamps").As<Napi::Boolean>();
int32_t audio_ctx = whisper_params.Get("audio_ctx").As<Napi::Number>();
bool comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
Napi::Value pcmf32Value = whisper_params.Get("pcmf32");
std::vector<float> pcmf32_vec;
if (pcmf32Value.IsTypedArray()) {
Napi::Float32Array pcmf32 = pcmf32Value.As<Napi::Float32Array>();
size_t length = pcmf32.ElementLength();
pcmf32_vec.reserve(length);
for (size_t i = 0; i < length; i++) {
pcmf32_vec.push_back(pcmf32[i]);
}
}
params.language = language; params.language = language;
params.model = model; params.model = model;
params.fname_inp.emplace_back(input); params.fname_inp.emplace_back(input);
params.use_gpu = use_gpu; params.use_gpu = use_gpu;
params.flash_attn = flash_attn;
params.no_prints = no_prints;
params.no_timestamps = no_timestamps;
params.audio_ctx = audio_ctx;
params.pcmf32 = pcmf32_vec;
params.comma_in_time = comma_in_time;
Napi::Function callback = info[1].As<Napi::Function>(); Napi::Function callback = info[1].As<Napi::Function>();
Worker* worker = new Worker(callback, params); Worker* worker = new Worker(callback, params);

View File

@ -1,7 +1,7 @@
const path = require("path"); const path = require("path");
const { whisper } = require(path.join( const { whisper } = require(path.join(
__dirname, __dirname,
"../../build/Release/whisper-addon" "../../build/Release/addon.node"
)); ));
const { promisify } = require("util"); const { promisify } = require("util");
@ -10,15 +10,27 @@ const whisperAsync = promisify(whisper);
const whisperParams = { const whisperParams = {
language: "en", language: "en",
model: path.join(__dirname, "../../models/ggml-base.en.bin"), model: path.join(__dirname, "../../models/ggml-base.en.bin"),
fname_inp: "../../samples/jfk.wav", fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
use_gpu: true, use_gpu: true,
flash_attn: false,
no_prints: true,
comma_in_time: false,
translate: true,
no_timestamps: false,
audio_ctx: 0,
}; };
const arguments = process.argv.slice(2); const arguments = process.argv.slice(2);
const params = Object.fromEntries( const params = Object.fromEntries(
arguments.reduce((pre, item) => { arguments.reduce((pre, item) => {
if (item.startsWith("--")) { if (item.startsWith("--")) {
return [...pre, item.slice(2).split("=")]; const [key, value] = item.slice(2).split("=");
if (key === "audio_ctx") {
whisperParams[key] = parseInt(value);
} else {
whisperParams[key] = value;
}
return pre;
} }
return pre; return pre;
}, []) }, [])
@ -33,5 +45,6 @@ for (const key in params) {
console.log("whisperParams =", whisperParams); console.log("whisperParams =", whisperParams);
whisperAsync(whisperParams).then((result) => { whisperAsync(whisperParams).then((result) => {
console.log(`Result from whisper: ${result}`); console.log();
console.log(result);
}); });

View File

@ -1,5 +1,5 @@
{ {
"name": "whisper-addon", "name": "addon.node",
"version": "0.0.0", "version": "0.0.0",
"description": "", "description": "",
"main": "index.js", "main": "index.js",

View File

@ -8,16 +8,17 @@
// command-line parameters // command-line parameters
struct whisper_params { struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t what = 0; // what to benchmark: 0 - whisper ecoder, 1 - memcpy, 2 - ggml_mul_mat int32_t what = 0; // what to benchmark: 0 - whisper encoder, 1 - memcpy, 2 - ggml_mul_mat
std::string model = "models/ggml-base.en.bin"; std::string model = "models/ggml-base.en.bin";
bool use_gpu = true; bool use_gpu = true;
bool flash_attn = false;
}; };
void whisper_print_usage(int argc, char ** argv, const whisper_params & params); void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) { for (int i = 1; i < argc; i++) {
std::string arg = argv[i]; std::string arg = argv[i];
@ -25,10 +26,11 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
whisper_print_usage(argc, argv, params); whisper_print_usage(argc, argv, params);
exit(0); exit(0);
} }
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); } else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-w" || arg == "--what") { params.what = atoi(argv[++i]); } else if (arg == "-w" || arg == "--what") { params.what = atoi(argv[++i]); }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; }
else { else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params); whisper_print_usage(argc, argv, params);
@ -49,17 +51,20 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str()); fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what); fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what);
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true"); fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -fa, --flash-attn [%-7s] enable flash attention\n", params.flash_attn ? "true" : "false");
fprintf(stderr, " %-7s 0 - whisper\n", ""); fprintf(stderr, " %-7s 0 - whisper\n", "");
fprintf(stderr, " %-7s 1 - memcpy\n", ""); fprintf(stderr, " %-7s 1 - memcpy\n", "");
fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", ""); fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", "");
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
int whisper_bench_full(const whisper_params & params) { static int whisper_bench_full(const whisper_params & params) {
// whisper init // whisper init
struct whisper_context_params cparams = whisper_context_default_params(); struct whisper_context_params cparams = whisper_context_default_params();
cparams.use_gpu = params.use_gpu;
cparams.use_gpu = params.use_gpu;
cparams.flash_attn = params.flash_attn;
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);

View File

@ -37,9 +37,13 @@ https://user-images.githubusercontent.com/1991296/207435352-8fc4ed3f-bde5-4555-9
The `command` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:
```bash ```bash
# Install SDL2 on Linux # Install SDL2
# On Debian based linux distributions:
sudo apt-get install libsdl2-dev sudo apt-get install libsdl2-dev
# On Fedora Linux:
sudo dnf install SDL2 SDL2-devel
# Install SDL2 on Mac OS # Install SDL2 on Mac OS
brew install sdl2 brew install sdl2

View File

@ -22,11 +22,6 @@
#include <vector> #include <vector>
#include <map> #include <map>
bool file_exists(const std::string & fname) {
std::ifstream f(fname.c_str());
return f.good();
}
// command-line parameters // command-line parameters
struct whisper_params { struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
@ -43,12 +38,12 @@ struct whisper_params {
grammar_parser::parse_state grammar_parsed; grammar_parser::parse_state grammar_parsed;
bool speed_up = false;
bool translate = false; bool translate = false;
bool print_special = false; bool print_special = false;
bool print_energy = false; bool print_energy = false;
bool no_timestamps = true; bool no_timestamps = true;
bool use_gpu = true; bool use_gpu = true;
bool flash_attn = false;
std::string language = "en"; std::string language = "en";
std::string model = "models/ggml-base.en.bin"; std::string model = "models/ggml-base.en.bin";
@ -57,11 +52,14 @@ struct whisper_params {
std::string prompt; std::string prompt;
std::string context; std::string context;
std::string grammar; std::string grammar;
// A regular expression that matches tokens to suppress
std::string suppress_regex;
}; };
void whisper_print_usage(int argc, char ** argv, const whisper_params & params); void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) { for (int i = 1; i < argc; i++) {
std::string arg = argv[i]; std::string arg = argv[i];
@ -77,11 +75,11 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); } else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); } else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); } else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; } else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; } else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; } else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; } else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
@ -90,6 +88,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-ctx" || arg == "--context") { params.context = argv[++i]; } else if (arg == "-ctx" || arg == "--context") { params.context = argv[++i]; }
else if ( arg == "--grammar") { params.grammar = argv[++i]; } else if ( arg == "--grammar") { params.grammar = argv[++i]; }
else if ( arg == "--grammar-penalty") { params.grammar_penalty = std::stof(argv[++i]); } else if ( arg == "--grammar-penalty") { params.grammar_penalty = std::stof(argv[++i]); }
else if ( arg == "--suppress-regex") { params.suppress_regex = argv[++i]; }
else { else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params); whisper_print_usage(argc, argv, params);
@ -114,11 +113,11 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx); fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold); fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold); fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false"); fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false"); fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true"); fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str()); fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str()); fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str()); fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
@ -127,10 +126,11 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -ctx, --context [%-7s] sample text to help the transcription\n", params.context.c_str()); fprintf(stderr, " -ctx, --context [%-7s] sample text to help the transcription\n", params.context.c_str());
fprintf(stderr, " --grammar GRAMMAR [%-7s] GBNF grammar to guide decoding\n", params.grammar.c_str()); fprintf(stderr, " --grammar GRAMMAR [%-7s] GBNF grammar to guide decoding\n", params.grammar.c_str());
fprintf(stderr, " --grammar-penalty N [%-7.1f] scales down logits of nongrammar tokens\n", params.grammar_penalty); fprintf(stderr, " --grammar-penalty N [%-7.1f] scales down logits of nongrammar tokens\n", params.grammar_penalty);
fprintf(stderr, " --suppress-regex REGEX [%-7s] regular expression matching tokens to suppress\n", params.suppress_regex.c_str());
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
std::string transcribe( static std::string transcribe(
whisper_context * ctx, whisper_context * ctx,
const whisper_params & params, const whisper_params & params,
const std::vector<float> & pcmf32, const std::vector<float> & pcmf32,
@ -162,7 +162,6 @@ std::string transcribe(
wparams.n_threads = params.n_threads; wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx; wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.temperature = 0.4f; wparams.temperature = 0.4f;
wparams.temperature_inc = 1.0f; wparams.temperature_inc = 1.0f;
@ -172,6 +171,8 @@ std::string transcribe(
wparams.initial_prompt = params.context.data(); wparams.initial_prompt = params.context.data();
wparams.suppress_regex = params.suppress_regex.c_str();
const auto & grammar_parsed = params.grammar_parsed; const auto & grammar_parsed = params.grammar_parsed;
auto grammar_rules = grammar_parsed.c_rules(); auto grammar_rules = grammar_parsed.c_rules();
@ -215,7 +216,7 @@ std::string transcribe(
return result; return result;
} }
std::vector<std::string> read_allowed_commands(const std::string & fname) { static std::vector<std::string> read_allowed_commands(const std::string & fname) {
std::vector<std::string> allowed_commands; std::vector<std::string> allowed_commands;
std::ifstream ifs(fname); std::ifstream ifs(fname);
@ -237,7 +238,7 @@ std::vector<std::string> read_allowed_commands(const std::string & fname) {
return allowed_commands; return allowed_commands;
} }
std::vector<std::string> get_words(const std::string &txt) { static std::vector<std::string> get_words(const std::string &txt) {
std::vector<std::string> words; std::vector<std::string> words;
std::istringstream iss(txt); std::istringstream iss(txt);
@ -251,7 +252,7 @@ std::vector<std::string> get_words(const std::string &txt) {
// command-list mode // command-list mode
// guide the transcription to match the most likely command from a provided list // guide the transcription to match the most likely command from a provided list
int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) { static int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "%s: guided mode\n", __func__); fprintf(stderr, "%s: guided mode\n", __func__);
@ -366,7 +367,6 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
wparams.n_threads = params.n_threads; wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx; wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.prompt_tokens = k_tokens.data(); wparams.prompt_tokens = k_tokens.data();
wparams.prompt_n_tokens = k_tokens.size(); wparams.prompt_n_tokens = k_tokens.size();
@ -463,7 +463,7 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
// always-prompt mode // always-prompt mode
// transcribe the voice into text after valid prompt // transcribe the voice into text after valid prompt
int always_prompt_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) { static int always_prompt_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
bool is_running = true; bool is_running = true;
bool ask_prompt = true; bool ask_prompt = true;
@ -543,7 +543,7 @@ int always_prompt_transcription(struct whisper_context * ctx, audio_async & audi
// general-purpose mode // general-purpose mode
// freely transcribe the voice into text // freely transcribe the voice into text
int process_general_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) { static int process_general_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
bool is_running = true; bool is_running = true;
bool have_prompt = false; bool have_prompt = false;
bool ask_prompt = true; bool ask_prompt = true;
@ -694,7 +694,9 @@ int main(int argc, char ** argv) {
// whisper init // whisper init
struct whisper_context_params cparams = whisper_context_default_params(); struct whisper_context_params cparams = whisper_context_default_params();
cparams.use_gpu = params.use_gpu;
cparams.use_gpu = params.use_gpu;
cparams.flash_attn = params.flash_attn;
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
@ -736,7 +738,7 @@ int main(int argc, char ** argv) {
if (!params.grammar.empty()) { if (!params.grammar.empty()) {
auto & grammar = params.grammar_parsed; auto & grammar = params.grammar_parsed;
if (file_exists(params.grammar.c_str())) { if (is_file_exist(params.grammar.c_str())) {
// read grammar from file // read grammar from file
std::ifstream ifs(params.grammar.c_str()); std::ifstream ifs(params.grammar.c_str());
const std::string txt = std::string((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>()); const std::string txt = std::string((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>());

View File

@ -64,7 +64,14 @@ bool ggml_common_quantize_0(
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
case GGML_FTYPE_MOSTLY_IQ2_XXS: case GGML_FTYPE_MOSTLY_IQ2_XXS:
case GGML_FTYPE_MOSTLY_IQ2_XS: case GGML_FTYPE_MOSTLY_IQ2_XS:
case GGML_FTYPE_MOSTLY_IQ2_S:
case GGML_FTYPE_MOSTLY_IQ3_XXS: case GGML_FTYPE_MOSTLY_IQ3_XXS:
case GGML_FTYPE_MOSTLY_IQ3_S:
case GGML_FTYPE_MOSTLY_IQ1_S:
case GGML_FTYPE_MOSTLY_IQ4_NL:
case GGML_FTYPE_MOSTLY_IQ4_XS:
case GGML_FTYPE_MOSTLY_IQ1_M:
case GGML_FTYPE_MOSTLY_BF16:
{ {
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype); fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
return false; return false;
@ -85,8 +92,6 @@ bool ggml_common_quantize_0(
std::vector<ggml_fp16_t> data_f16; std::vector<ggml_fp16_t> data_f16;
std::vector<float> data_f32; std::vector<float> data_f32;
std::vector<int64_t> hist_all(1 << 4, 0);
while (true) { while (true) {
int32_t n_dims; int32_t n_dims;
int32_t length; int32_t length;
@ -171,8 +176,6 @@ bool ggml_common_quantize_0(
work.resize(nelements); // for quantization work.resize(nelements); // for quantization
size_t cur_size = 0; size_t cur_size = 0;
std::vector<int64_t> hist_cur(1 << 4, 0);
switch ((ggml_type) ttype) { switch ((ggml_type) ttype) {
case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_1:
@ -185,18 +188,27 @@ bool ggml_common_quantize_0(
case GGML_TYPE_Q5_K: case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K: case GGML_TYPE_Q6_K:
{ {
cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], hist_cur.data(), nullptr); cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], nullptr);
} break; } break;
case GGML_TYPE_F32: case GGML_TYPE_F32:
case GGML_TYPE_F16: case GGML_TYPE_F16:
case GGML_TYPE_I8: case GGML_TYPE_I8:
case GGML_TYPE_I16: case GGML_TYPE_I16:
case GGML_TYPE_I32: case GGML_TYPE_I32:
case GGML_TYPE_I64:
case GGML_TYPE_F64:
case GGML_TYPE_Q8_1: case GGML_TYPE_Q8_1:
case GGML_TYPE_Q8_K: case GGML_TYPE_Q8_K:
case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_BF16:
case GGML_TYPE_COUNT: case GGML_TYPE_COUNT:
{ {
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
@ -207,15 +219,7 @@ bool ggml_common_quantize_0(
fout.write(reinterpret_cast<char *>(work.data()), cur_size); fout.write(reinterpret_cast<char *>(work.data()), cur_size);
total_size_new += cur_size; total_size_new += cur_size;
printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0); printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
for (int i = 0; i < (int) hist_cur.size(); ++i) {
hist_all[i] += hist_cur[i];
}
for (int i = 0; i < (int) hist_cur.size(); ++i) {
printf("%5.3f ", hist_cur[i] / (float)nelements);
}
printf("\n");
} else { } else {
printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0); printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size()); fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
@ -228,18 +232,5 @@ bool ggml_common_quantize_0(
printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype)); printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
{
int64_t sum_all = 0;
for (int i = 0; i < (int) hist_all.size(); ++i) {
sum_all += hist_all[i];
}
printf("%s: hist: ", __func__);
for (int i = 0; i < (int) hist_all.size(); ++i) {
printf("%5.3f ", hist_all[i] / (float)sum_all);
}
printf("\n");
}
return true; return true;
} }

View File

@ -219,7 +219,7 @@ bool sdl_poll_events() {
case SDL_QUIT: case SDL_QUIT:
{ {
return false; return false;
} break; }
default: default:
break; break;
} }

View File

@ -19,8 +19,18 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
#ifdef _WIN32
#include <fcntl.h>
#include <io.h>
#endif
#ifdef WHISPER_FFMPEG
// implemented in ffmpeg-transcode.cpp; only linked into the common lib when whisper is built with FFmpeg support
extern int ffmpeg_decode_audio(const std::string & ifname, std::vector<uint8_t> & wav_data);
#endif
// Function to check if the next argument exists // Function to check if the next argument exists
std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) { static std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
if (i + 1 < argc && argv[i + 1][0] != '-') { if (i + 1 < argc && argv[i + 1][0] != '-') {
return argv[++i]; return argv[++i];
} else { } else {
@ -336,7 +346,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
return tokens; return tokens;
} }
std::vector<gpt_vocab::id> parse_tokens_from_string(const std::string& input, char delimiter) { static std::vector<gpt_vocab::id> parse_tokens_from_string(const std::string& input, char delimiter) {
std::vector<gpt_vocab::id> output; std::vector<gpt_vocab::id> output;
std::stringstream ss(input); std::stringstream ss(input);
std::string token; std::string token;
@ -348,7 +358,7 @@ std::vector<gpt_vocab::id> parse_tokens_from_string(const std::string& input, ch
return output; return output;
} }
std::map<std::string, std::vector<gpt_vocab::id>> extract_tests_from_file(const std::string & fpath_test){ static std::map<std::string, std::vector<gpt_vocab::id>> extract_tests_from_file(const std::string & fpath_test){
if (fpath_test.empty()){ if (fpath_test.empty()){
fprintf(stderr, "%s : No test file found.\n", __func__); fprintf(stderr, "%s : No test file found.\n", __func__);
return std::map<std::string, std::vector<gpt_vocab::id>>(); return std::map<std::string, std::vector<gpt_vocab::id>>();
@ -632,10 +642,14 @@ bool is_wav_buffer(const std::string buf) {
bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) { bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
drwav wav; drwav wav;
std::vector<uint8_t> wav_data; // used for pipe input from stdin std::vector<uint8_t> wav_data; // used for pipe input from stdin or ffmpeg decoding output
if (fname == "-") { if (fname == "-") {
{ {
#ifdef _WIN32
_setmode(_fileno(stdin), _O_BINARY);
#endif
uint8_t buf[1024]; uint8_t buf[1024];
while (true) while (true)
{ {
@ -661,27 +675,42 @@ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector
} }
} }
else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) { else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
#if defined(WHISPER_FFMPEG)
if (ffmpeg_decode_audio(fname, wav_data) != 0) {
fprintf(stderr, "error: failed to ffmpeg decode '%s' \n", fname.c_str());
return false;
}
if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
fprintf(stderr, "error: failed to read wav data as wav \n");
return false;
}
#else
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str()); fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
return false; return false;
#endif
} }
if (wav.channels != 1 && wav.channels != 2) { if (wav.channels != 1 && wav.channels != 2) {
fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str()); fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, fname.c_str());
drwav_uninit(&wav);
return false; return false;
} }
if (stereo && wav.channels != 2) { if (stereo && wav.channels != 2) {
fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str()); fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", __func__, fname.c_str());
drwav_uninit(&wav);
return false; return false;
} }
if (wav.sampleRate != COMMON_SAMPLE_RATE) { if (wav.sampleRate != COMMON_SAMPLE_RATE) {
fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000); fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, fname.c_str(), COMMON_SAMPLE_RATE/1000);
drwav_uninit(&wav);
return false; return false;
} }
if (wav.bitsPerSample != 16) { if (wav.bitsPerSample != 16) {
fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str()); fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, fname.c_str());
drwav_uninit(&wav);
return false; return false;
} }
@ -836,3 +865,48 @@ void sam_print_usage(int /*argc*/, char ** argv, const sam_params & params) {
fprintf(stderr, " output file (default: %s)\n", params.fname_out.c_str()); fprintf(stderr, " output file (default: %s)\n", params.fname_out.c_str());
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
// 500 -> 00:00:05.000
// 6000 -> 00:01:00.000
std::string to_timestamp(int64_t t, bool comma) {
int64_t msec = t * 10;
int64_t hr = msec / (1000 * 60 * 60);
msec = msec - hr * (1000 * 60 * 60);
int64_t min = msec / (1000 * 60);
msec = msec - min * (1000 * 60);
int64_t sec = msec / 1000;
msec = msec - sec * 1000;
char buf[32];
snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
return std::string(buf);
}
int timestamp_to_sample(int64_t t, int n_samples, int whisper_sample_rate) {
return std::max(0, std::min((int) n_samples - 1, (int) ((t*whisper_sample_rate)/100)));
}
bool is_file_exist(const char *fileName)
{
std::ifstream infile(fileName);
return infile.good();
}
bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id)
{
std::ofstream speak_file(path.c_str());
if (speak_file.fail()) {
fprintf(stderr, "%s: failed to open speak_file\n", __func__);
return false;
} else {
speak_file.write(text.c_str(), text.size());
speak_file.close();
int ret = system((command + " " + std::to_string(voice_id) + " " + path).c_str());
if (ret != 0) {
fprintf(stderr, "%s: failed to speak\n", __func__);
return false;
}
}
return true;
}

View File

@ -21,7 +21,7 @@ struct gpt_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t n_predict = 200; // new tokens to predict int32_t n_predict = 200; // new tokens to predict
int32_t n_parallel = 1; // number of parallel streams int32_t n_parallel = 1; // number of parallel streams
int32_t n_batch = 8; // batch size for prompt processing int32_t n_batch = 32; // batch size for prompt processing
int32_t n_ctx = 2048; // context size (this is the KV cache max size) int32_t n_ctx = 2048; // context size (this is the KV cache max size)
int32_t n_gpu_layers = 0; // number of layers to offlload to the GPU int32_t n_gpu_layers = 0; // number of layers to offlload to the GPU
@ -185,7 +185,7 @@ private:
// It is assumed that PCM data is normalized to a range from -1 to 1 // It is assumed that PCM data is normalized to a range from -1 to 1
bool write_audio(const float * data, size_t length) { bool write_audio(const float * data, size_t length) {
for (size_t i = 0; i < length; ++i) { for (size_t i = 0; i < length; ++i) {
const int16_t intSample = data[i] * 32767; const int16_t intSample = int16_t(data[i] * 32767);
file.write(reinterpret_cast<const char *>(&intSample), sizeof(int16_t)); file.write(reinterpret_cast<const char *>(&intSample), sizeof(int16_t));
dataSize += sizeof(int16_t); dataSize += sizeof(int16_t);
} }
@ -281,3 +281,31 @@ struct sam_params {
bool sam_params_parse(int argc, char ** argv, sam_params & params); bool sam_params_parse(int argc, char ** argv, sam_params & params);
void sam_print_usage(int argc, char ** argv, const sam_params & params); void sam_print_usage(int argc, char ** argv, const sam_params & params);
//
// Terminal utils
//
// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
// Lowest is red, middle is yellow, highest is green.
const std::vector<std::string> k_colors = {
"\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
"\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
};
//
// Other utils
//
// convert a timestamp in 10 ms units to a string, e.g. 6000 -> 00:01:00.000
std::string to_timestamp(int64_t t, bool comma = false);
// given a timestamp get the sample
int timestamp_to_sample(int64_t t, int n_samples, int whisper_sample_rate);
// check if file exists using ifstream
bool is_file_exist(const char *fileName);
// write text to file, and call system("command voice_id file")
bool speak_with_file(const std::string & command, const std::string & text, const std::string & path, int voice_id);
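As a rough sketch, here is how the helpers declared above fit together (paths and values are illustrative; assumes the translation unit is linked against the `common` library):

```cpp
#include <cstdio>
#include "common.h"

int main() {
    // timestamps are expressed in units of 10 ms; the comma variant is handy for SRT output
    printf("%s\n", to_timestamp(6000).c_str());        // "00:01:00.000"
    printf("%s\n", to_timestamp(6000, true).c_str());  // "00:01:00,000"

    // map a timestamp to a sample index in a 30-second, 16 kHz mono buffer
    const int n_samples = 30*16000;
    printf("sample = %d\n", timestamp_to_sample(500, n_samples, 16000)); // 80000

    // "samples/jfk.wav" is just an example path
    if (!is_file_exist("samples/jfk.wav")) {
        fprintf(stderr, "sample file not found\n");
    }

    return 0;
}
```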

View File

@ -0,0 +1,350 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* transcode.c - convert audio file to WAVE
*
* Copyright (C) 2019 Andrew Clayton <andrew@digital-domain.net>
* Copyright (C) 2024 William Tambellini <william.tambellini@gmail.com>
*/
// Just for a convenient C++ API
#include <vector>
#include <string>
// C
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
extern "C" {
#include <libavutil/opt.h>
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libswresample/swresample.h>
}
typedef uint64_t u64;
typedef int64_t s64;
typedef uint32_t u32;
typedef int32_t s32;
typedef uint16_t u16;
typedef int16_t s16;
typedef uint8_t u8;
typedef int8_t s8;
#define WAVE_SAMPLE_RATE 16000
#define AVIO_CTX_BUF_SZ 4096
static const char* ffmpegLog = getenv("FFMPEG_LOG");
// Todo: add __FILE__ __LINE__
#define LOG(...) \
do { if (ffmpegLog) fprintf(stderr, __VA_ARGS__); } while(0) // C99
/*
* WAVE file header based on definition from
* https://gist.github.com/Jon-Schneider/8b7c53d27a7a13346a643dac9c19d34f
*
* We must ensure this structure doesn't have any holes or
* padding so we can just map it straight to the WAVE data.
*/
struct wave_hdr {
/* RIFF Header: "RIFF" */
char riff_header[4];
/* size of audio data + sizeof(struct wave_hdr) - 8 */
int wav_size;
/* "WAVE" */
char wav_header[4];
/* Format Header */
/* "fmt " (includes trailing space) */
char fmt_header[4];
/* Should be 16 for PCM */
int fmt_chunk_size;
/* Should be 1 for PCM. 3 for IEEE Float */
s16 audio_format;
s16 num_channels;
int sample_rate;
/*
* Number of bytes per second
* sample_rate * num_channels * bit_depth/8
*/
int byte_rate;
/* num_channels * bytes per sample */
s16 sample_alignment;
/* bits per sample */
s16 bit_depth;
/* Data Header */
/* "data" */
char data_header[4];
/*
* size of audio
* number of samples * num_channels * bit_depth/8
*/
int data_bytes;
} __attribute__((__packed__));
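// optional sanity check: the comment above requires the struct to have no holes or padding,
// so it should map exactly onto the standard 44-byte RIFF/WAVE header
static_assert(sizeof(struct wave_hdr) == 44, "wave_hdr must be packed to exactly 44 bytes");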
struct audio_buffer {
u8 *ptr;
int size; /* size left in the buffer */
};
static void set_wave_hdr(wave_hdr& wh, size_t size) {
memcpy(&wh.riff_header, "RIFF", 4);
wh.wav_size = size + sizeof(struct wave_hdr) - 8;
memcpy(&wh.wav_header, "WAVE", 4);
memcpy(&wh.fmt_header, "fmt ", 4);
wh.fmt_chunk_size = 16;
wh.audio_format = 1;
wh.num_channels = 1;
wh.sample_rate = WAVE_SAMPLE_RATE;
wh.sample_alignment = 2;
wh.bit_depth = 16;
wh.byte_rate = wh.sample_rate * wh.sample_alignment;
memcpy(&wh.data_header, "data", 4);
wh.data_bytes = size;
}
static void write_wave_hdr(int fd, size_t size) {
struct wave_hdr wh;
set_wave_hdr(wh, size);
write(fd, &wh, sizeof(struct wave_hdr));
}
static int map_file(int fd, u8 **ptr, size_t *size)
{
struct stat sb;
fstat(fd, &sb);
*size = sb.st_size;
*ptr = (u8*)mmap(NULL, *size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
if (*ptr == MAP_FAILED) {
perror("mmap");
return -1;
}
return 0;
}
static int read_packet(void *opaque, u8 *buf, int buf_size)
{
struct audio_buffer *audio_buf = (audio_buffer*)opaque;
buf_size = FFMIN(buf_size, audio_buf->size);
/* copy internal buffer data to buf */
memcpy(buf, audio_buf->ptr, buf_size);
audio_buf->ptr += buf_size;
audio_buf->size -= buf_size;
return buf_size;
}
static void convert_frame(struct SwrContext *swr, AVCodecContext *codec,
AVFrame *frame, s16 **data, int *size, bool flush)
{
int nr_samples;
s64 delay;
u8 *buffer;
delay = swr_get_delay(swr, codec->sample_rate);
nr_samples = av_rescale_rnd(delay + frame->nb_samples,
WAVE_SAMPLE_RATE, codec->sample_rate,
AV_ROUND_UP);
av_samples_alloc(&buffer, NULL, 1, nr_samples, AV_SAMPLE_FMT_S16, 0);
/*
* !flush is used to check if we are flushing any remaining
* conversion buffers...
*/
nr_samples = swr_convert(swr, &buffer, nr_samples,
!flush ? (const u8 **)frame->data : NULL,
!flush ? frame->nb_samples : 0);
*data = (s16*)realloc(*data, (*size + nr_samples) * sizeof(s16));
memcpy(*data + *size, buffer, nr_samples * sizeof(s16));
*size += nr_samples;
av_freep(&buffer);
}
static bool is_audio_stream(const AVStream *stream)
{
if (stream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO)
return true;
return false;
}
// Return non zero on error, 0 on success
// audio_buffer: input memory
// data: decoded output audio data (wav file)
// size: size of output data
static int decode_audio(struct audio_buffer *audio_buf, s16 **data, int *size)
{
LOG("decode_audio: input size: %d\n", audio_buf->size);
AVFormatContext *fmt_ctx;
AVIOContext *avio_ctx;
AVStream *stream;
AVCodecContext *codec;
AVPacket packet;
AVFrame *frame;
struct SwrContext *swr;
u8 *avio_ctx_buffer;
unsigned int i;
int stream_index = -1;
int err;
const size_t errbuffsize = 1024;
char errbuff[errbuffsize];
av_register_all(); // from avformat. Still a must-have call for ffmpeg v3! (can be skipped for later versions)
fmt_ctx = avformat_alloc_context();
avio_ctx_buffer = (u8*)av_malloc(AVIO_CTX_BUF_SZ);
LOG("Creating an avio context: AVIO_CTX_BUF_SZ=%d\n", AVIO_CTX_BUF_SZ);
avio_ctx = avio_alloc_context(avio_ctx_buffer, AVIO_CTX_BUF_SZ, 0, audio_buf, &read_packet, NULL, NULL);
fmt_ctx->pb = avio_ctx;
// open the input stream and read header
err = avformat_open_input(&fmt_ctx, NULL, NULL, NULL);
if (err) {
LOG("Could not read audio buffer: %d: %s\n", err, av_make_error_string(errbuff, errbuffsize, err));
return err;
}
err = avformat_find_stream_info(fmt_ctx, NULL);
if (err < 0) {
LOG("Could not retrieve stream info from audio buffer: %d\n", err);
return err;
}
for (i = 0; i < fmt_ctx->nb_streams; i++) {
if (is_audio_stream(fmt_ctx->streams[i])) {
stream_index = i;
break;
}
}
if (stream_index == -1) {
LOG("Could not retrieve audio stream from buffer\n");
return -1;
}
stream = fmt_ctx->streams[stream_index];
codec = avcodec_alloc_context3(
avcodec_find_decoder(stream->codecpar->codec_id));
avcodec_parameters_to_context(codec, stream->codecpar);
err = avcodec_open2(codec, avcodec_find_decoder(codec->codec_id),
NULL);
if (err) {
LOG("Failed to open decoder for stream #%d in audio buffer\n", stream_index);
return err;
}
/* prepare resampler */
swr = swr_alloc();
av_opt_set_int(swr, "in_channel_count", codec->channels, 0);
av_opt_set_int(swr, "out_channel_count", 1, 0);
av_opt_set_int(swr, "in_channel_layout", codec->channel_layout, 0);
av_opt_set_int(swr, "out_channel_layout", AV_CH_LAYOUT_MONO, 0);
av_opt_set_int(swr, "in_sample_rate", codec->sample_rate, 0);
av_opt_set_int(swr, "out_sample_rate", WAVE_SAMPLE_RATE, 0);
av_opt_set_sample_fmt(swr, "in_sample_fmt", codec->sample_fmt, 0);
av_opt_set_sample_fmt(swr, "out_sample_fmt", AV_SAMPLE_FMT_S16, 0);
swr_init(swr);
if (!swr_is_initialized(swr)) {
LOG("Resampler has not been properly initialized\n");
return -1;
}
av_init_packet(&packet);
frame = av_frame_alloc();
if (!frame) {
LOG("Error allocating the frame\n");
return -1;
}
/* iterate through frames */
*data = NULL;
*size = 0;
while (av_read_frame(fmt_ctx, &packet) >= 0) {
avcodec_send_packet(codec, &packet);
err = avcodec_receive_frame(codec, frame);
if (err == AVERROR(EAGAIN))
continue;
convert_frame(swr, codec, frame, data, size, false);
}
/* Flush any remaining conversion buffers... */
convert_frame(swr, codec, frame, data, size, true);
av_frame_free(&frame);
swr_free(&swr);
//avio_context_free(); // todo?
avcodec_close(codec);
avformat_close_input(&fmt_ctx);
avformat_free_context(fmt_ctx);
if (avio_ctx) {
av_freep(&avio_ctx->buffer);
av_freep(&avio_ctx);
}
return 0;
}
// in mem decoding/conversion/resampling:
// ifname: input file path
// owav_data: in-memory WAV file. Can be forwarded as-is to whisper/drwav
// return 0 on success
int ffmpeg_decode_audio(const std::string &ifname, std::vector<uint8_t>& owav_data) {
LOG("ffmpeg_decode_audio: %s\n", ifname.c_str());
int ifd = open(ifname.c_str(), O_RDONLY);
if (ifd == -1) {
fprintf(stderr, "Couldn't open input file %s\n", ifname.c_str());
return -1;
}
u8 *ibuf = NULL;
size_t ibuf_size;
int err = map_file(ifd, &ibuf, &ibuf_size);
if (err) {
LOG("Couldn't map input file %s\n", ifname.c_str());
return err;
}
LOG("Mapped input file: %x size: %d\n", ibuf, ibuf_size);
struct audio_buffer inaudio_buf;
inaudio_buf.ptr = ibuf;
inaudio_buf.size = ibuf_size;
s16 *odata = NULL;
int osize = 0;
err = decode_audio(&inaudio_buf, &odata, &osize);
LOG("decode_audio returned %d\n", err);
if (err != 0) {
LOG("decode_audio failed\n");
return err;
}
LOG("decode_audio output size: %d\n", osize);
wave_hdr wh;
const size_t outdatasize = osize * sizeof(s16);
set_wave_hdr(wh, outdatasize);
owav_data.resize(sizeof(wave_hdr) + outdatasize);
// header:
memcpy(owav_data.data(), &wh, sizeof(wave_hdr));
// the data:
memcpy(owav_data.data() + sizeof(wave_hdr), odata, outdatasize);
// free the decode buffer and release the input mapping before returning
free(odata);
munmap(ibuf, ibuf_size);
close(ifd);
return 0;
}
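A minimal usage sketch (editorial addition, not part of the diff above; the file names are illustrative and it assumes the declaration of ffmpeg_decode_audio() is visible in scope): the function returns a complete RIFF/WAVE image in memory, so the result can be written to disk unchanged or handed to any WAV reader.
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>
int main() {
    std::vector<uint8_t> wav;
    if (ffmpeg_decode_audio("input.mp3", wav) != 0) { // any container/codec FFmpeg can decode
        fprintf(stderr, "decode failed\n");
        return 1;
    }
    // wav now starts with the 44-byte header built by set_wave_hdr() and is followed by
    // 16 kHz mono s16 samples; e.g. dump it to a .wav file:
    FILE * f = fopen("out.wav", "wb");
    if (!f) {
        fprintf(stderr, "failed to open out.wav\n");
        return 1;
    }
    fwrite(wav.data(), 1, wav.size(), f);
    fclose(f);
    return 0;
}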

View File

@ -9,7 +9,7 @@
namespace grammar_parser { namespace grammar_parser {
// NOTE: assumes valid utf8 (but checks for overrun) // NOTE: assumes valid utf8 (but checks for overrun)
// copied from whisper.cpp // copied from whisper.cpp
std::pair<uint32_t, const char *> decode_utf8(const char * src) { static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
uint8_t first_byte = static_cast<uint8_t>(*src); uint8_t first_byte = static_cast<uint8_t>(*src);
uint8_t highbits = first_byte >> 4; uint8_t highbits = first_byte >> 4;
@ -24,19 +24,19 @@ namespace grammar_parser {
return std::make_pair(value, pos); return std::make_pair(value, pos);
} }
uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) { static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size()); uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id)); auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
return result.first->second; return result.first->second;
} }
uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) { static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size()); uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id; state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
return next_id; return next_id;
} }
void add_rule( static void add_rule(
parse_state & state, parse_state & state,
uint32_t rule_id, uint32_t rule_id,
const std::vector<whisper_grammar_element> & rule) { const std::vector<whisper_grammar_element> & rule) {
@ -46,11 +46,11 @@ namespace grammar_parser {
state.rules[rule_id] = rule; state.rules[rule_id] = rule;
} }
bool is_word_char(char c) { static bool is_word_char(char c) {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9'); return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
} }
std::pair<uint32_t, const char *> parse_hex(const char * src, int size) { static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
const char * pos = src; const char * pos = src;
const char * end = src + size; const char * end = src + size;
uint32_t value = 0; uint32_t value = 0;
@ -73,7 +73,7 @@ namespace grammar_parser {
return std::make_pair(value, pos); return std::make_pair(value, pos);
} }
const char * parse_space(const char * src, bool newline_ok) { static const char * parse_space(const char * src, bool newline_ok) {
const char * pos = src; const char * pos = src;
while (*pos == ' ' || *pos == '\t' || *pos == '#' || while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
(newline_ok && (*pos == '\r' || *pos == '\n'))) { (newline_ok && (*pos == '\r' || *pos == '\n'))) {
@ -88,7 +88,7 @@ namespace grammar_parser {
return pos; return pos;
} }
const char * parse_name(const char * src) { static const char * parse_name(const char * src) {
const char * pos = src; const char * pos = src;
while (is_word_char(*pos)) { while (is_word_char(*pos)) {
pos++; pos++;
@ -99,7 +99,7 @@ namespace grammar_parser {
return pos; return pos;
} }
std::pair<uint32_t, const char *> parse_char(const char * src) { static std::pair<uint32_t, const char *> parse_char(const char * src) {
if (*src == '\\') { if (*src == '\\') {
switch (src[1]) { switch (src[1]) {
case 'x': return parse_hex(src + 2, 2); case 'x': return parse_hex(src + 2, 2);
@ -122,14 +122,14 @@ namespace grammar_parser {
throw std::runtime_error("unexpected end of input"); throw std::runtime_error("unexpected end of input");
} }
const char * parse_alternates( static const char * parse_alternates(
parse_state & state, parse_state & state,
const char * src, const char * src,
const std::string & rule_name, const std::string & rule_name,
uint32_t rule_id, uint32_t rule_id,
bool is_nested); bool is_nested);
const char * parse_sequence( static const char * parse_sequence(
parse_state & state, parse_state & state,
const char * src, const char * src,
const std::string & rule_name, const std::string & rule_name,
@ -190,7 +190,7 @@ namespace grammar_parser {
pos = parse_space(pos + 1, is_nested); pos = parse_space(pos + 1, is_nested);
} else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator } else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
if (last_sym_start == out_elements.size()) { if (last_sym_start == out_elements.size()) {
throw std::runtime_error(std::string("expecting preceeding item to */+/? at ") + pos); throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
} }
// apply transformation to previous symbol (last_sym_start to end) according to // apply transformation to previous symbol (last_sym_start to end) according to
@ -229,7 +229,7 @@ namespace grammar_parser {
return pos; return pos;
} }
const char * parse_alternates( static const char * parse_alternates(
parse_state & state, parse_state & state,
const char * src, const char * src,
const std::string & rule_name, const std::string & rule_name,
@ -247,7 +247,7 @@ namespace grammar_parser {
return pos; return pos;
} }
const char * parse_rule(parse_state & state, const char * src) { static const char * parse_rule(parse_state & state, const char * src) {
const char * name_end = parse_name(src); const char * name_end = parse_name(src);
const char * pos = parse_space(name_end, false); const char * pos = parse_space(name_end, false);
size_t name_len = name_end - src; size_t name_len = name_end - src;
@ -285,7 +285,7 @@ namespace grammar_parser {
} }
} }
void print_grammar_char(FILE * file, uint32_t c) { static void print_grammar_char(FILE * file, uint32_t c) {
if (0x20 <= c && c <= 0x7f) { if (0x20 <= c && c <= 0x7f) {
fprintf(file, "%c", static_cast<char>(c)); fprintf(file, "%c", static_cast<char>(c));
} else { } else {
@ -294,7 +294,7 @@ namespace grammar_parser {
} }
} }
bool is_char_element(whisper_grammar_element elem) { static bool is_char_element(whisper_grammar_element elem) {
switch (elem.type) { switch (elem.type) {
case WHISPER_GRETYPE_CHAR: return true; case WHISPER_GRETYPE_CHAR: return true;
case WHISPER_GRETYPE_CHAR_NOT: return true; case WHISPER_GRETYPE_CHAR_NOT: return true;
@ -304,7 +304,7 @@ namespace grammar_parser {
} }
} }
void print_rule_binary(FILE * file, const std::vector<whisper_grammar_element> & rule) { static void print_rule_binary(FILE * file, const std::vector<whisper_grammar_element> & rule) {
for (auto elem : rule) { for (auto elem : rule) {
switch (elem.type) { switch (elem.type) {
case WHISPER_GRETYPE_END: fprintf(file, "END"); break; case WHISPER_GRETYPE_END: fprintf(file, "END"); break;
@ -334,7 +334,7 @@ namespace grammar_parser {
fprintf(file, "\n"); fprintf(file, "\n");
} }
void print_rule( static void print_rule(
FILE * file, FILE * file,
uint32_t rule_id, uint32_t rule_id,
const std::vector<whisper_grammar_element> & rule, const std::vector<whisper_grammar_element> & rule,
@ -413,7 +413,7 @@ namespace grammar_parser {
} }
} }
std::vector<const whisper_grammar_element *> parse_state::c_rules() const{ std::vector<const whisper_grammar_element *> parse_state::c_rules() const {
std::vector<const whisper_grammar_element *> ret; std::vector<const whisper_grammar_element *> ret;
for (const auto & rule : rules) { for (const auto & rule : rules) {
ret.push_back(rule.data()); ret.push_back(rule.data());

View File

@ -34,9 +34,6 @@ async function fetchRemote(url, cbProgress, cbPrint) {
url, url,
{ {
method: 'GET', method: 'GET',
headers: {
'Content-Type': 'application/octet-stream',
},
} }
); );

View File

@ -5,5 +5,5 @@ if (WHISPER_SDL2)
include(DefaultTargetOptions) include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common json_cpp common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
endif () endif ()

View File

@ -26,11 +26,11 @@ struct whisper_params {
float vad_thold = 0.6f; float vad_thold = 0.6f;
float freq_thold = 100.0f; float freq_thold = 100.0f;
bool speed_up = false;
bool translate = false; bool translate = false;
bool print_special = false; bool print_special = false;
bool print_energy = false; bool print_energy = false;
bool use_gpu = true; bool use_gpu = true;
bool flash_attn = false;
std::string language = "en"; std::string language = "en";
std::string model = "models/ggml-base.en.bin"; std::string model = "models/ggml-base.en.bin";
@ -53,7 +53,7 @@ struct commandset {
void whisper_print_usage(int argc, char ** argv, const whisper_params & params); void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) { for (int i = 1; i < argc; i++) {
std::string arg = argv[i]; std::string arg = argv[i];
@ -69,11 +69,11 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); } else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); } else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); } else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; } else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; } else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; } else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else { else {
@ -100,16 +100,16 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx); fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold); fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold); fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false"); fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false"); fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true"); fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str()); fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str()); fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
uint64_t wait_for_vad(audio_async & audio, json jparams, const whisper_params & params, uint64_t maxlength_ms, std::vector<float> & pcmf32) { static uint64_t wait_for_vad(audio_async & audio, json jparams, const whisper_params & params, uint64_t maxlength_ms, std::vector<float> & pcmf32) {
using namespace std::chrono; using namespace std::chrono;
uint64_t time_now = time_point_cast<milliseconds>(system_clock::now()).time_since_epoch().count(); uint64_t time_now = time_point_cast<milliseconds>(system_clock::now()).time_since_epoch().count();
uint64_t start_time = time_now; uint64_t start_time = time_now;
@ -153,7 +153,7 @@ uint64_t wait_for_vad(audio_async & audio, json jparams, const whisper_params &
return time_now; return time_now;
} }
json unguided_transcription(struct whisper_context * ctx, audio_async &audio, json jparams, const whisper_params &params) { static json unguided_transcription(struct whisper_context * ctx, audio_async &audio, json jparams, const whisper_params &params) {
std::vector<whisper_token> prompt_tokens; std::vector<whisper_token> prompt_tokens;
std::vector<float> pcmf32; std::vector<float> pcmf32;
uint64_t unprocessed_audio_timestamp = wait_for_vad(audio, jparams, params, 10000U, pcmf32); uint64_t unprocessed_audio_timestamp = wait_for_vad(audio, jparams, params, 10000U, pcmf32);
@ -181,7 +181,6 @@ json unguided_transcription(struct whisper_context * ctx, audio_async &audio, js
wparams.n_threads = params.n_threads; wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx; wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.suppress_non_speech_tokens = true; wparams.suppress_non_speech_tokens = true;
// run the transformer and a single decoding pass // run the transformer and a single decoding pass
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) { if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
@ -200,7 +199,7 @@ json unguided_transcription(struct whisper_context * ctx, audio_async &audio, js
// command-list mode // command-list mode
// guide the transcription to match the most likely command from a provided list // guide the transcription to match the most likely command from a provided list
json guided_transcription(struct whisper_context * ctx, audio_async &audio, const whisper_params &params, json jparams, std::vector<struct commandset> commandset_list) { static json guided_transcription(struct whisper_context * ctx, audio_async &audio, const whisper_params &params, json jparams, std::vector<struct commandset> commandset_list) {
struct commandset cs = commandset_list[jparams.value("commandset_index", commandset_list.size()-1)]; struct commandset cs = commandset_list[jparams.value("commandset_index", commandset_list.size()-1)];
std::vector<float> pcmf32; std::vector<float> pcmf32;
uint64_t unprocessed_audio_timestamp = wait_for_vad(audio, jparams, params, 2000U, pcmf32); uint64_t unprocessed_audio_timestamp = wait_for_vad(audio, jparams, params, 2000U, pcmf32);
@ -220,7 +219,6 @@ json guided_transcription(struct whisper_context * ctx, audio_async &audio, cons
wparams.n_threads = params.n_threads; wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx; wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
// TODO: Do some time testing. Does an overly long prompt slow down processing? // TODO: Do some time testing. Does an overly long prompt slow down processing?
// Set up command sets/precompute prompts // Set up command sets/precompute prompts
@ -287,7 +285,7 @@ json guided_transcription(struct whisper_context * ctx, audio_async &audio, cons
} }
} }
json register_commandset(struct whisper_context * ctx, json jparams, std::vector<struct commandset> &commandset_list) { static json register_commandset(struct whisper_context * ctx, json jparams, std::vector<struct commandset> &commandset_list) {
// TODO: check for token collision // TODO: check for token collision
struct commandset cs; struct commandset cs;
@ -327,7 +325,8 @@ json register_commandset(struct whisper_context * ctx, json jparams, std::vector
commandset_list.push_back(cs); commandset_list.push_back(cs);
return json{{"index",index}}; return json{{"index",index}};
} }
json seek(struct whisper_context * /*ctx*/, audio_async & /*audio*/, json /*params*/) {
static json seek(struct whisper_context * /*ctx*/, audio_async & /*audio*/, json /*params*/) {
// whisper_state has the pertinent offsets, but there also seem to be a large // whisper_state has the pertinent offsets, but there also seem to be a large
// number of scratch buffers that would prevent rewinding context in a manner similar to llama // number of scratch buffers that would prevent rewinding context in a manner similar to llama
// I'll give this a another pass once everything else is implemented, // I'll give this a another pass once everything else is implemented,
@ -337,7 +336,8 @@ json seek(struct whisper_context * /*ctx*/, audio_async & /*audio*/, json /*para
{"message", "Seeking is not yet supported."} {"message", "Seeking is not yet supported."}
}; };
} }
json parse_job(const json &body, struct whisper_context * ctx, audio_async &audio, const whisper_params &params, std::vector<struct commandset> &commandset_list) {
static json parse_job(const json &body, struct whisper_context * ctx, audio_async &audio, const whisper_params &params, std::vector<struct commandset> &commandset_list) {
// See: https://www.jsonrpc.org/specification // See: https://www.jsonrpc.org/specification
json id = body.at("id"); json id = body.at("id");
try { try {
@ -377,7 +377,7 @@ json parse_job(const json &body, struct whisper_context * ctx, audio_async &audi
} }
} }
void process_loop(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) { static void process_loop(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
std::deque<json> jobqueue; std::deque<json> jobqueue;
std::vector<struct commandset> commandset_list; std::vector<struct commandset> commandset_list;
while (true) { while (true) {
@ -436,7 +436,10 @@ int main(int argc, char ** argv) {
// whisper init // whisper init
struct whisper_context_params cparams = whisper_context_default_params(); struct whisper_context_params cparams = whisper_context_default_params();
cparams.use_gpu = params.use_gpu;
cparams.use_gpu = params.use_gpu;
cparams.flash_attn = params.flash_attn;
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
// init audio // init audio

View File

@ -3,4 +3,4 @@ add_executable(${TARGET} main.cpp)
include(DefaultTargetOptions) include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common whisper ${FFMPEG_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})

View File

@ -1,10 +1,12 @@
#include "common.h" #include "common.h"
#include "whisper.h" #include "whisper.h"
#include "grammar-parser.h"
#include <cmath> #include <cmath>
#include <fstream> #include <fstream>
#include <cstdio> #include <cstdio>
#include <regex>
#include <string> #include <string>
#include <thread> #include <thread>
#include <vector> #include <vector>
@ -14,36 +16,8 @@
#pragma warning(disable: 4244 4267) // possible loss of data #pragma warning(disable: 4244 4267) // possible loss of data
#endif #endif
// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
// Lowest is red, middle is yellow, highest is green.
const std::vector<std::string> k_colors = {
"\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
"\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
};
// 500 -> 00:05.000
// 6000 -> 01:00.000
std::string to_timestamp(int64_t t, bool comma = false) {
int64_t msec = t * 10;
int64_t hr = msec / (1000 * 60 * 60);
msec = msec - hr * (1000 * 60 * 60);
int64_t min = msec / (1000 * 60);
msec = msec - min * (1000 * 60);
int64_t sec = msec / 1000;
msec = msec - sec * 1000;
char buf[32];
snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
return std::string(buf);
}
int timestamp_to_sample(int64_t t, int n_samples) {
return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
}
// helper function to replace substrings // helper function to replace substrings
void replace_all(std::string & s, const std::string & search, const std::string & replace) { static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
for (size_t pos = 0; ; pos += replace.length()) { for (size_t pos = 0; ; pos += replace.length()) {
pos = s.find(search, pos); pos = s.find(search, pos);
if (pos == std::string::npos) break; if (pos == std::string::npos) break;
@ -54,23 +28,25 @@ void replace_all(std::string & s, const std::string & search, const std::string
// command-line parameters // command-line parameters
struct whisper_params { struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t n_processors = 1; int32_t n_processors = 1;
int32_t offset_t_ms = 0; int32_t offset_t_ms = 0;
int32_t offset_n = 0; int32_t offset_n = 0;
int32_t duration_ms = 0; int32_t duration_ms = 0;
int32_t progress_step = 5; int32_t progress_step = 5;
int32_t max_context = -1; int32_t max_context = -1;
int32_t max_len = 0; int32_t max_len = 0;
int32_t best_of = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of; int32_t best_of = whisper_full_default_params(WHISPER_SAMPLING_GREEDY).greedy.best_of;
int32_t beam_size = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size; int32_t beam_size = whisper_full_default_params(WHISPER_SAMPLING_BEAM_SEARCH).beam_search.beam_size;
int32_t audio_ctx = 0; int32_t audio_ctx = 0;
float word_thold = 0.01f; float word_thold = 0.01f;
float entropy_thold = 2.40f; float entropy_thold = 2.40f;
float logprob_thold = -1.00f; float logprob_thold = -1.00f;
float grammar_penalty = 100.0f;
float temperature = 0.0f;
float temperature_inc = 0.2f;
bool speed_up = false;
bool debug_mode = false; bool debug_mode = false;
bool translate = false; bool translate = false;
bool detect_language = false; bool detect_language = false;
@ -93,24 +69,42 @@ struct whisper_params {
bool no_timestamps = false; bool no_timestamps = false;
bool log_score = false; bool log_score = false;
bool use_gpu = true; bool use_gpu = true;
bool flash_attn = false;
std::string language = "en"; std::string language = "en";
std::string prompt; std::string prompt;
std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf"; std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
std::string model = "models/ggml-base.en.bin"; std::string model = "models/ggml-base.en.bin";
std::string grammar;
std::string grammar_rule;
// [TDRZ] speaker turn string // [TDRZ] speaker turn string
std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line
// A regular expression that matches tokens to suppress
std::string suppress_regex;
std::string openvino_encode_device = "CPU"; std::string openvino_encode_device = "CPU";
std::string dtw = "";
std::vector<std::string> fname_inp = {}; std::vector<std::string> fname_inp = {};
std::vector<std::string> fname_out = {}; std::vector<std::string> fname_out = {};
grammar_parser::parse_state grammar_parsed;
}; };
void whisper_print_usage(int argc, char ** argv, const whisper_params & params); static void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { static char * whisper_param_turn_lowercase(char * in){
int string_len = strlen(in);
for (int i = 0; i < string_len; i++){
*(in+i) = tolower((unsigned char)*(in+i));
}
return in;
}
static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) { for (int i = 1; i < argc; i++) {
std::string arg = argv[i]; std::string arg = argv[i];
@ -137,11 +131,12 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); } else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); } else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); }
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); } else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-context") { params.audio_ctx = std::stoi(argv[++i]); } else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); } else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); } else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); } else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
// else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; } else if (arg == "-tp" || arg == "--temperature") { params.temperature = std::stof(argv[++i]); }
else if (arg == "-tpi" || arg == "--temperature-inc") { params.temperature_inc = std::stof(argv[++i]); }
else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; } else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; } else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; } else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
@ -163,14 +158,20 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; } else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; } else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; } else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; } else if (arg == "-l" || arg == "--language") { params.language = whisper_param_turn_lowercase(argv[++i]); }
else if (arg == "-dl" || arg == "--detect-language") { params.detect_language = true; } else if (arg == "-dl" || arg == "--detect-language") { params.detect_language = true; }
else if ( arg == "--prompt") { params.prompt = argv[++i]; } else if ( arg == "--prompt") { params.prompt = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); } else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; } else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
else if (arg == "-dtw" || arg == "--dtw") { params.dtw = argv[++i]; }
else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; } else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; }
else if ( arg == "--suppress-regex") { params.suppress_regex = argv[++i]; }
else if ( arg == "--grammar") { params.grammar = argv[++i]; }
else if ( arg == "--grammar-rule") { params.grammar_rule = argv[++i]; }
else if ( arg == "--grammar-penalty") { params.grammar_penalty = std::stof(argv[++i]); }
else { else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params); whisper_print_usage(argc, argv, params);
@ -181,7 +182,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
return true; return true;
} }
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) { static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]); fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
fprintf(stderr, "\n"); fprintf(stderr, "\n");
@ -201,7 +202,8 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold); fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold); fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold); fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
// fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false"); fprintf(stderr, " -tp, --temperature N [%-7.2f] The sampling temperature, between 0 and 1\n", params.temperature);
fprintf(stderr, " -tpi, --temperature-inc N [%-7.2f] The increment of temperature, between 0 and 1\n",params.temperature_inc);
fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false"); fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false"); fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
@ -224,12 +226,18 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "true" : "false"); fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "true" : "false");
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str()); fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
fprintf(stderr, " -dl, --detect-language [%-7s] exit after automatically detecting language\n", params.detect_language ? "true" : "false"); fprintf(stderr, " -dl, --detect-language [%-7s] exit after automatically detecting language\n", params.detect_language ? "true" : "false");
fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str()); fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt (max n_text_ctx/2 tokens)\n", params.prompt.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str()); fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", ""); fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str()); fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
fprintf(stderr, " -dtw MODEL --dtw MODEL [%-7s] compute token-level timestamps\n", params.dtw.c_str());
fprintf(stderr, " -ls, --log-score [%-7s] log best decoder scores of tokens\n", params.log_score?"true":"false"); fprintf(stderr, " -ls, --log-score [%-7s] log best decoder scores of tokens\n", params.log_score?"true":"false");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true"); fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
fprintf(stderr, " --suppress-regex REGEX [%-7s] regular expression matching tokens to suppress\n", params.suppress_regex.c_str());
fprintf(stderr, " --grammar GRAMMAR [%-7s] GBNF grammar to guide decoding\n", params.grammar.c_str());
fprintf(stderr, " --grammar-rule RULE [%-7s] top-level GBNF grammar rule name\n", params.grammar_rule.c_str());
fprintf(stderr, " --grammar-penalty N [%-7.1f] scales down logits of nongrammar tokens\n", params.grammar_penalty);
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
@ -240,12 +248,12 @@ struct whisper_print_user_data {
int progress_prev; int progress_prev;
}; };
std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) { static std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
std::string speaker = ""; std::string speaker = "";
const int64_t n_samples = pcmf32s[0].size(); const int64_t n_samples = pcmf32s[0].size();
const int64_t is0 = timestamp_to_sample(t0, n_samples); const int64_t is0 = timestamp_to_sample(t0, n_samples, WHISPER_SAMPLE_RATE);
const int64_t is1 = timestamp_to_sample(t1, n_samples); const int64_t is1 = timestamp_to_sample(t1, n_samples, WHISPER_SAMPLE_RATE);
double energy0 = 0.0f; double energy0 = 0.0f;
double energy1 = 0.0f; double energy1 = 0.0f;
@ -272,7 +280,8 @@ std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s
return speaker; return speaker;
} }
void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
static void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step; int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
int * progress_prev = &(((whisper_print_user_data *) user_data)->progress_prev); int * progress_prev = &(((whisper_print_user_data *) user_data)->progress_prev);
if (progress >= *progress_prev + progress_step) { if (progress >= *progress_prev + progress_step) {
@ -281,7 +290,7 @@ void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct wh
} }
} }
void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) { static void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
const auto & params = *((whisper_print_user_data *) user_data)->params; const auto & params = *((whisper_print_user_data *) user_data)->params;
const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s; const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;
@ -350,7 +359,7 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
} }
} }
bool output_txt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) { static bool output_txt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
std::ofstream fout(fname); std::ofstream fout(fname);
if (!fout.is_open()) { if (!fout.is_open()) {
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname); fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
@ -377,7 +386,7 @@ bool output_txt(struct whisper_context * ctx, const char * fname, const whisper_
return true; return true;
} }
bool output_vtt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) { static bool output_vtt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
std::ofstream fout(fname); std::ofstream fout(fname);
if (!fout.is_open()) { if (!fout.is_open()) {
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname); fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
@ -409,7 +418,7 @@ bool output_vtt(struct whisper_context * ctx, const char * fname, const whisper_
return true; return true;
} }
bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) { static bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
std::ofstream fout(fname); std::ofstream fout(fname);
if (!fout.is_open()) { if (!fout.is_open()) {
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname); fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
@ -438,7 +447,7 @@ bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_
return true; return true;
} }
char *escape_double_quotes_and_backslashes(const char *str) { static char * escape_double_quotes_and_backslashes(const char * str) {
if (str == NULL) { if (str == NULL) {
return NULL; return NULL;
} }
@ -451,7 +460,7 @@ char *escape_double_quotes_and_backslashes(const char *str) {
} }
} }
char *escaped = (char *)calloc(escaped_length, 1); // pre-zeroed char * escaped = (char *)calloc(escaped_length, 1); // pre-zeroed
if (escaped == NULL) { if (escaped == NULL) {
return NULL; return NULL;
} }
@ -469,7 +478,39 @@ char *escape_double_quotes_and_backslashes(const char *str) {
return escaped; return escaped;
} }
bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) { // double quote should be escaped by another double quote. (rfc4180)
static char * escape_double_quotes_in_csv(const char * str) {
if (str == NULL) {
return NULL;
}
size_t escaped_length = strlen(str) + 1;
for (size_t i = 0; str[i] != '\0'; i++) {
if (str[i] == '"') {
escaped_length++;
}
}
char *escaped = (char *)calloc(escaped_length, 1); // pre-zeroed
if (escaped == NULL) {
return NULL;
}
size_t pos = 0;
for (size_t i = 0; str[i] != '\0'; i++) {
if (str[i] == '"') {
escaped[pos++] = '"';
}
escaped[pos++] = str[i];
}
// no need to set zero due to calloc() being used prior
return escaped;
}
static bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
std::ofstream fout(fname); std::ofstream fout(fname);
if (!fout.is_open()) { if (!fout.is_open()) {
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname); fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
@ -490,7 +531,7 @@ bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_
const char * text = whisper_full_get_segment_text(ctx, i); const char * text = whisper_full_get_segment_text(ctx, i);
const int64_t t0 = whisper_full_get_segment_t0(ctx, i); const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i); const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
char * text_escaped = escape_double_quotes_and_backslashes(text); char * text_escaped = escape_double_quotes_in_csv(text);
//need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds. //need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds.
fout << 10 * t0 << "," << 10 * t1 << ","; fout << 10 * t0 << "," << 10 * t1 << ",";
@ -504,7 +545,7 @@ bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_
return true; return true;
} }
bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) { static bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) {
std::ofstream fout(fname); std::ofstream fout(fname);
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname); fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
@ -523,7 +564,7 @@ bool output_score(struct whisper_context * ctx, const char * fname, const whispe
return true; return true;
} }
bool output_json( static bool output_json(
struct whisper_context * ctx, struct whisper_context * ctx,
const char * fname, const char * fname,
const whisper_params & params, const whisper_params & params,
@ -669,7 +710,8 @@ bool output_json(
times_o(token.t0, token.t1, false); times_o(token.t0, token.t1, false);
} }
value_i("id", token.id, false); value_i("id", token.id, false);
value_f("p", token.p, true); value_f("p", token.p, false);
value_f("t_dtw", token.t_dtw, true);
end_obj(j == (n - 1)); end_obj(j == (n - 1));
} }
end_arr(!params.diarize && !params.tinydiarize); end_arr(!params.diarize && !params.tinydiarize);
@ -693,7 +735,7 @@ bool output_json(
// karaoke video generation // karaoke video generation
// outputs a bash script that uses ffmpeg to generate a video with the subtitles // outputs a bash script that uses ffmpeg to generate a video with the subtitles
// TODO: font parameter adjustments // TODO: font parameter adjustments
bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec, std::vector<std::vector<float>> pcmf32s) { static bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec, std::vector<std::vector<float>> pcmf32s) {
std::ofstream fout(fname); std::ofstream fout(fname);
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname); fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
@ -818,7 +860,7 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
return true; return true;
} }
bool output_lrc(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) { static bool output_lrc(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
std::ofstream fout(fname); std::ofstream fout(fname);
if (!fout.is_open()) { if (!fout.is_open()) {
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname); fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
@ -859,16 +901,58 @@ bool output_lrc(struct whisper_context * ctx, const char * fname, const whisper_
} }
void cb_log_disable(enum ggml_log_level , const char * , void * ) { } static void cb_log_disable(enum ggml_log_level , const char * , void * ) { }
int main(int argc, char ** argv) { int main(int argc, char ** argv) {
whisper_params params; whisper_params params;
// If the only argument starts with "@", read arguments line-by-line
// from the given file.
std::vector<std::string> vec_args;
if (argc == 2 && argv != nullptr && argv[1] != nullptr && argv[1][0] == '@') {
// Save the name of the executable.
vec_args.push_back(argv[0]);
// Open the response file.
char const * rspfile = argv[1] + 1; // skip the leading '@'
std::ifstream fin(rspfile);
if (fin.is_open() == false) {
fprintf(stderr, "error: response file '%s' not found\n", rspfile);
return 1;
}
// Read the entire response file.
std::string line;
while (std::getline(fin, line)) {
vec_args.push_back(line);
}
// Use the contents of the response file as the command-line arguments.
argc = static_cast<int>(vec_args.size());
argv = static_cast<char **>(alloca(argc * sizeof (char *)));
for (int i = 0; i < argc; ++i) {
argv[i] = const_cast<char *>(vec_args[i].c_str());
}
}
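// Editorial example (not part of the original patch): a response file args.txt holding one
// argument per line,
//   -m
//   models/ggml-base.en.bin
//   -f
//   samples/jfk.wav
// lets the tool be started as "./main @args.txt".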
if (whisper_params_parse(argc, argv, params) == false) { if (whisper_params_parse(argc, argv, params) == false) {
whisper_print_usage(argc, argv, params); whisper_print_usage(argc, argv, params);
return 1; return 1;
} }
// remove non-existent files
for (auto it = params.fname_inp.begin(); it != params.fname_inp.end();) {
const auto fname_inp = it->c_str();
if (*it != "-" && !is_file_exist(fname_inp)) {
fprintf(stderr, "error: input file not found '%s'\n", fname_inp);
it = params.fname_inp.erase(it);
continue;
}
it++;
}
if (params.fname_inp.empty()) { if (params.fname_inp.empty()) {
fprintf(stderr, "error: no input files specified\n"); fprintf(stderr, "error: no input files specified\n");
whisper_print_usage(argc, argv, params); whisper_print_usage(argc, argv, params);
@ -894,7 +978,31 @@ int main(int argc, char ** argv) {
// whisper init // whisper init
struct whisper_context_params cparams = whisper_context_default_params(); struct whisper_context_params cparams = whisper_context_default_params();
cparams.use_gpu = params.use_gpu;
cparams.use_gpu = params.use_gpu;
cparams.flash_attn = params.flash_attn;
if (!params.dtw.empty()) {
cparams.dtw_token_timestamps = true;
cparams.dtw_aheads_preset = WHISPER_AHEADS_NONE;
if (params.dtw == "tiny") cparams.dtw_aheads_preset = WHISPER_AHEADS_TINY;
if (params.dtw == "tiny.en") cparams.dtw_aheads_preset = WHISPER_AHEADS_TINY_EN;
if (params.dtw == "base") cparams.dtw_aheads_preset = WHISPER_AHEADS_BASE;
if (params.dtw == "base.en") cparams.dtw_aheads_preset = WHISPER_AHEADS_BASE_EN;
if (params.dtw == "small") cparams.dtw_aheads_preset = WHISPER_AHEADS_SMALL;
if (params.dtw == "small.en") cparams.dtw_aheads_preset = WHISPER_AHEADS_SMALL_EN;
if (params.dtw == "medium") cparams.dtw_aheads_preset = WHISPER_AHEADS_MEDIUM;
if (params.dtw == "medium.en") cparams.dtw_aheads_preset = WHISPER_AHEADS_MEDIUM_EN;
if (params.dtw == "large.v1") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V1;
if (params.dtw == "large.v2") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V2;
if (params.dtw == "large.v3") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3;
if (cparams.dtw_aheads_preset == WHISPER_AHEADS_NONE) {
fprintf(stderr, "error: unknown DTW preset '%s'\n", params.dtw.c_str());
return 3;
}
}
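// Editorial example (not part of the original patch): token-level timestamps are enabled with a
// preset matching the loaded model, e.g. "./main -m models/ggml-base.en.bin -dtw base.en ...";
// the per-token t_dtw values then appear in the JSON output (see the output_json change earlier
// in this diff).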
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
@ -906,6 +1014,29 @@ int main(int argc, char ** argv) {
// initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured // initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr); whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);
if (!params.grammar.empty()) {
auto & grammar = params.grammar_parsed;
if (is_file_exist(params.grammar.c_str())) {
// read grammar from file
std::ifstream ifs(params.grammar.c_str());
const std::string txt = std::string((std::istreambuf_iterator<char>(ifs)), std::istreambuf_iterator<char>());
grammar = grammar_parser::parse(txt.c_str());
} else {
// read grammar from string
grammar = grammar_parser::parse(params.grammar.c_str());
}
// will be empty (default) if there are parse errors
if (grammar.rules.empty()) {
fprintf(stderr, "error: failed to parse grammar \"%s\"\n", params.grammar.c_str());
return 4;
} else {
fprintf(stderr, "%s: grammar:\n", __func__);
grammar_parser::print_grammar(stderr, grammar);
fprintf(stderr, "\n");
}
}
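// Editorial example (not part of the original patch): a minimal GBNF grammar that constrains
// decoding to a fixed set of answers, passed to --grammar either inline or as a file path:
//   root ::= " Yes." | " No."
// used together with --grammar-rule root (and optionally --grammar-penalty).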
for (int f = 0; f < (int) params.fname_inp.size(); ++f) { for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
const auto fname_inp = params.fname_inp[f]; const auto fname_inp = params.fname_inp[f];
const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f]; const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
@ -952,7 +1083,8 @@ int main(int argc, char ** argv) {
{ {
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY; const bool use_grammar = (!params.grammar_parsed.rules.empty() && !params.grammar_rule.empty());
wparams.strategy = (params.beam_size > 1 || use_grammar) ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
wparams.print_realtime = false; wparams.print_realtime = false;
wparams.print_progress = params.print_progress; wparams.print_progress = params.print_progress;
@ -972,17 +1104,20 @@ int main(int argc, char ** argv) {
wparams.split_on_word = params.split_on_word; wparams.split_on_word = params.split_on_word;
wparams.audio_ctx = params.audio_ctx; wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.debug_mode = params.debug_mode; wparams.debug_mode = params.debug_mode;
wparams.tdrz_enable = params.tinydiarize; // [TDRZ] wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
wparams.suppress_regex = params.suppress_regex.empty() ? nullptr : params.suppress_regex.c_str();
wparams.initial_prompt = params.prompt.c_str(); wparams.initial_prompt = params.prompt.c_str();
wparams.greedy.best_of = params.best_of; wparams.greedy.best_of = params.best_of;
wparams.beam_search.beam_size = params.beam_size; wparams.beam_search.beam_size = params.beam_size;
wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc; wparams.temperature_inc = params.no_fallback ? 0.0f : params.temperature_inc;
wparams.temperature = params.temperature;
wparams.entropy_thold = params.entropy_thold; wparams.entropy_thold = params.entropy_thold;
wparams.logprob_thold = params.logprob_thold; wparams.logprob_thold = params.logprob_thold;
@ -990,6 +1125,20 @@ int main(int argc, char ** argv) {
whisper_print_user_data user_data = { &params, &pcmf32s, 0 }; whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
const auto & grammar_parsed = params.grammar_parsed;
auto grammar_rules = grammar_parsed.c_rules();
if (use_grammar) {
if (grammar_parsed.symbol_ids.find(params.grammar_rule) == grammar_parsed.symbol_ids.end()) {
fprintf(stderr, "%s: warning: grammar rule '%s' not found - skipping grammar sampling\n", __func__, params.grammar_rule.c_str());
} else {
wparams.grammar_rules = grammar_rules.data();
wparams.n_grammar_rules = grammar_rules.size();
wparams.i_start_rule = grammar_parsed.symbol_ids.at(params.grammar_rule);
wparams.grammar_penalty = params.grammar_penalty;
}
}
// this callback is called on each new segment // this callback is called on each new segment
if (!wparams.print_realtime) { if (!wparams.print_realtime) {
wparams.new_segment_callback = whisper_print_segment_callback; wparams.new_segment_callback = whisper_print_segment_callback;
@ -1086,7 +1235,9 @@ int main(int argc, char ** argv) {
} }
} }
whisper_print_timings(ctx); if (!params.no_prints) {
whisper_print_timings(ctx);
}
whisper_free(ctx); whisper_free(ctx);
return 0; return 0;

View File

@ -36,7 +36,7 @@ struct whisper_filters {
}; };
// quantize a model // quantize a model
bool whisper_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) { static bool whisper_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
gpt_vocab vocab; gpt_vocab vocab;
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str()); printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());

View File

@ -1,9 +1,9 @@
set(TARGET server) set(TARGET server)
add_executable(${TARGET} server.cpp httplib.h json.hpp) add_executable(${TARGET} server.cpp httplib.h)
include(DefaultTargetOptions) include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT}) target_link_libraries(${TARGET} PRIVATE common json_cpp whisper ${CMAKE_THREAD_LIBS_INIT})
if (WIN32) if (WIN32)
target_link_libraries(${TARGET} PRIVATE ws2_32) target_link_libraries(${TARGET} PRIVATE ws2_32)

File diff suppressed because it is too large

View File

@ -22,13 +22,6 @@ using json = nlohmann::ordered_json;
namespace { namespace {
// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
// Lowest is red, middle is yellow, highest is green.
const std::vector<std::string> k_colors = {
"\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
"\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
};
// output formats // output formats
const std::string json_format = "json"; const std::string json_format = "json";
const std::string text_format = "text"; const std::string text_format = "text";
@ -68,7 +61,6 @@ struct whisper_params {
float temperature = 0.00f; float temperature = 0.00f;
float temperature_inc = 0.20f; float temperature_inc = 0.20f;
bool speed_up = false;
bool debug_mode = false; bool debug_mode = false;
bool translate = false; bool translate = false;
bool detect_language = false; bool detect_language = false;
@ -82,6 +74,7 @@ struct whisper_params {
bool print_progress = false; bool print_progress = false;
bool no_timestamps = false; bool no_timestamps = false;
bool use_gpu = true; bool use_gpu = true;
bool flash_attn = false;
std::string language = "en"; std::string language = "en";
std::string prompt = ""; std::string prompt = "";
@ -94,35 +87,10 @@ struct whisper_params {
std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line std::string tdrz_speaker_turn = " [SPEAKER_TURN]"; // TODO: set from command line
std::string openvino_encode_device = "CPU"; std::string openvino_encode_device = "CPU";
std::string dtw = "";
}; };
// 500 -> 00:05.000
// 6000 -> 01:00.000
std::string to_timestamp(int64_t t, bool comma = false) {
int64_t msec = t * 10;
int64_t hr = msec / (1000 * 60 * 60);
msec = msec - hr * (1000 * 60 * 60);
int64_t min = msec / (1000 * 60);
msec = msec - min * (1000 * 60);
int64_t sec = msec / 1000;
msec = msec - sec * 1000;
char buf[32];
snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
return std::string(buf);
}
int timestamp_to_sample(int64_t t, int n_samples) {
return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
}
bool is_file_exist(const char *fileName)
{
std::ifstream infile(fileName);
return infile.good();
}
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params, const server_params& sparams) { void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params, const server_params& sparams) {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options] \n", argv[0]); fprintf(stderr, "usage: %s [options] \n", argv[0]);
@ -143,7 +111,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold); fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold); fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold); fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
// fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false"); fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false"); fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
@ -160,6 +127,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str()); fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str()); fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
// server params // server params
fprintf(stderr, " -dtw MODEL --dtw MODEL [%-7s] compute token-level timestamps\n", params.dtw.c_str());
fprintf(stderr, " --host HOST, [%-7s] Hostname/ip-adress for the server\n", sparams.hostname.c_str()); fprintf(stderr, " --host HOST, [%-7s] Hostname/ip-adress for the server\n", sparams.hostname.c_str());
fprintf(stderr, " --port PORT, [%-7d] Port number for the server\n", sparams.port); fprintf(stderr, " --port PORT, [%-7d] Port number for the server\n", sparams.port);
fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str()); fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str());
@ -185,11 +153,10 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); } else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); } else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); }
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); } else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-context") { params.audio_ctx = std::stoi(argv[++i]); } else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); } else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); } else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); } else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
// else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; } else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; } else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; } else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
@ -207,7 +174,9 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
else if ( arg == "--prompt") { params.prompt = argv[++i]; } else if ( arg == "--prompt") { params.prompt = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; } else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
else if (arg == "-dtw" || arg == "--dtw") { params.dtw = argv[++i]; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; }
// server params // server params
else if ( arg == "--port") { sparams.port = std::stoi(argv[++i]); } else if ( arg == "--port") { sparams.port = std::stoi(argv[++i]); }
else if ( arg == "--host") { sparams.hostname = argv[++i]; } else if ( arg == "--host") { sparams.hostname = argv[++i]; }
@ -274,8 +243,8 @@ std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s
std::string speaker = ""; std::string speaker = "";
const int64_t n_samples = pcmf32s[0].size(); const int64_t n_samples = pcmf32s[0].size();
const int64_t is0 = timestamp_to_sample(t0, n_samples); const int64_t is0 = timestamp_to_sample(t0, n_samples, WHISPER_SAMPLE_RATE);
const int64_t is1 = timestamp_to_sample(t1, n_samples); const int64_t is1 = timestamp_to_sample(t1, n_samples, WHISPER_SAMPLE_RATE);
double energy0 = 0.0f; double energy0 = 0.0f;
double energy1 = 0.0f; double energy1 = 0.0f;
@ -532,7 +501,53 @@ int main(int argc, char ** argv) {
} }
// whisper init // whisper init
struct whisper_context_params cparams = whisper_context_default_params(); struct whisper_context_params cparams = whisper_context_default_params();
cparams.use_gpu = params.use_gpu;
cparams.use_gpu = params.use_gpu;
cparams.flash_attn = params.flash_attn;
if (!params.dtw.empty()) {
cparams.dtw_token_timestamps = true;
cparams.dtw_aheads_preset = WHISPER_AHEADS_NONE;
if (params.dtw == "tiny") {
cparams.dtw_aheads_preset = WHISPER_AHEADS_TINY;
}
if (params.dtw == "tiny.en") {
cparams.dtw_aheads_preset = WHISPER_AHEADS_TINY_EN;
}
if (params.dtw == "base") {
cparams.dtw_aheads_preset = WHISPER_AHEADS_BASE;
}
if (params.dtw == "base.en") {
cparams.dtw_aheads_preset = WHISPER_AHEADS_BASE_EN;
}
if (params.dtw == "small") {
cparams.dtw_aheads_preset = WHISPER_AHEADS_SMALL;
}
if (params.dtw == "small.en") {
cparams.dtw_aheads_preset = WHISPER_AHEADS_SMALL_EN;
}
if (params.dtw == "medium") {
cparams.dtw_aheads_preset = WHISPER_AHEADS_MEDIUM;
}
if (params.dtw == "medium.en") {
cparams.dtw_aheads_preset = WHISPER_AHEADS_MEDIUM_EN;
}
if (params.dtw == "large.v1") {
cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V1;
}
if (params.dtw == "large.v2") {
cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V2;
}
if (params.dtw == "large.v3") {
cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3;
}
if (cparams.dtw_aheads_preset == WHISPER_AHEADS_NONE) {
fprintf(stderr, "error: unknown DTW preset '%s'\n", params.dtw.c_str());
return 3;
}
}
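The cascade of string comparisons above could also be written as a single table lookup; the sketch below is not part of the patch and assumes the whisper_alignment_heads_preset enum from whisper.h:

```cpp
// Table-driven alternative to the --dtw preset cascade (sketch only).
#include <map>
#include <string>

static enum whisper_alignment_heads_preset dtw_preset_from_name(const std::string & name) {
    static const std::map<std::string, enum whisper_alignment_heads_preset> presets = {
        { "tiny",     WHISPER_AHEADS_TINY     }, { "tiny.en",   WHISPER_AHEADS_TINY_EN   },
        { "base",     WHISPER_AHEADS_BASE     }, { "base.en",   WHISPER_AHEADS_BASE_EN   },
        { "small",    WHISPER_AHEADS_SMALL    }, { "small.en",  WHISPER_AHEADS_SMALL_EN  },
        { "medium",   WHISPER_AHEADS_MEDIUM   }, { "medium.en", WHISPER_AHEADS_MEDIUM_EN },
        { "large.v1", WHISPER_AHEADS_LARGE_V1 }, { "large.v2",  WHISPER_AHEADS_LARGE_V2  },
        { "large.v3", WHISPER_AHEADS_LARGE_V3 },
    };
    const auto it = presets.find(name);
    return it != presets.end() ? it->second : WHISPER_AHEADS_NONE; // unknown -> error path above
}
```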
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
@ -629,7 +644,7 @@ int main(int argc, char ** argv) {
return false; return false;
}); });
svr.Options(sparams.request_path + "/inference", [&](const Request &req, Response &res){ svr.Options(sparams.request_path + "/inference", [&](const Request &, Response &){
}); });
svr.Post(sparams.request_path + "/inference", [&](const Request &req, Response &res){ svr.Post(sparams.request_path + "/inference", [&](const Request &req, Response &res){
@ -750,7 +765,6 @@ int main(int argc, char ** argv) {
wparams.split_on_word = params.split_on_word; wparams.split_on_word = params.split_on_word;
wparams.audio_ctx = params.audio_ctx; wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.debug_mode = params.debug_mode; wparams.debug_mode = params.debug_mode;
wparams.tdrz_enable = params.tinydiarize; // [TDRZ] wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
@ -818,7 +832,7 @@ int main(int argc, char ** argv) {
if (params.response_format == text_format) if (params.response_format == text_format)
{ {
std::string results = output_str(ctx, params, pcmf32s); std::string results = output_str(ctx, params, pcmf32s);
res.set_content(results.c_str(), "text/html"); res.set_content(results.c_str(), "text/html; charset=utf-8");
} }
else if (params.response_format == srt_format) else if (params.response_format == srt_format)
{ {
@ -899,6 +913,7 @@ int main(int argc, char ** argv) {
if (!params.no_timestamps) { if (!params.no_timestamps) {
word["start"] = token.t0 * 0.01; word["start"] = token.t0 * 0.01;
word["end"] = token.t1 * 0.01; word["end"] = token.t1 * 0.01;
word["t_dtw"] = token.t_dtw;
} }
word["probability"] = token.p; word["probability"] = token.p;
total_logprob += token.plog; total_logprob += token.plog;
@ -928,7 +943,7 @@ int main(int argc, char ** argv) {
"application/json"); "application/json");
} }
// reset params to thier defaults // reset params to their defaults
params = default_params; params = default_params;
}); });
svr.Post(sparams.request_path + "/load", [&](const Request &req, Response &res){ svr.Post(sparams.request_path + "/load", [&](const Request &req, Response &res){

View File

@ -103,11 +103,11 @@ void stream_main(size_t index) {
{ {
const int n_segments = whisper_full_n_segments(ctx); const int n_segments = whisper_full_n_segments(ctx);
for (int i = n_segments - 1; i < n_segments; ++i) { if (n_segments > 0) {
const char * text = whisper_full_get_segment_text(ctx, i); const char * text = whisper_full_get_segment_text(ctx, n_segments - 1);
const int64_t t0 = whisper_full_get_segment_t0(ctx, i); const int64_t t0 = whisper_full_get_segment_t0(ctx, n_segments - 1);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i); const int64_t t1 = whisper_full_get_segment_t1(ctx, n_segments - 1);
printf("transcribed: %s\n", text); printf("transcribed: %s\n", text);

View File

@ -30,9 +30,13 @@ a transcription block that is suitable for parsing.
The `stream` tool depends on SDL2 library to capture audio from the microphone. You can build it like this: The `stream` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
```bash ```bash
# Install SDL2 on Linux # Install SDL2
# On Debian based linux distributions:
sudo apt-get install libsdl2-dev sudo apt-get install libsdl2-dev
# On Fedora Linux:
sudo dnf install SDL2 SDL2-devel
# Install SDL2 on Mac OS # Install SDL2 on Mac OS
brew install sdl2 brew install sdl2

View File

@ -14,20 +14,6 @@
#include <fstream> #include <fstream>
// 500 -> 00:05.000
// 6000 -> 01:00.000
std::string to_timestamp(int64_t t) {
int64_t sec = t/100;
int64_t msec = t - sec*100;
int64_t min = sec/60;
sec = sec - min*60;
char buf[32];
snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int) min, (int) sec, (int) msec);
return std::string(buf);
}
// command-line parameters // command-line parameters
struct whisper_params { struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
@ -41,7 +27,6 @@ struct whisper_params {
float vad_thold = 0.6f; float vad_thold = 0.6f;
float freq_thold = 100.0f; float freq_thold = 100.0f;
bool speed_up = false;
bool translate = false; bool translate = false;
bool no_fallback = false; bool no_fallback = false;
bool print_special = false; bool print_special = false;
@ -50,6 +35,7 @@ struct whisper_params {
bool tinydiarize = false; bool tinydiarize = false;
bool save_audio = false; // save audio to wav file bool save_audio = false; // save audio to wav file
bool use_gpu = true; bool use_gpu = true;
bool flash_attn = false;
std::string language = "en"; std::string language = "en";
std::string model = "models/ggml-base.en.bin"; std::string model = "models/ggml-base.en.bin";
@ -58,7 +44,7 @@ struct whisper_params {
void whisper_print_usage(int argc, char ** argv, const whisper_params & params); void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) { for (int i = 1; i < argc; i++) {
std::string arg = argv[i]; std::string arg = argv[i];
@ -75,7 +61,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); } else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); } else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); } else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; } else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; } else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
@ -86,6 +71,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; } else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; }
else if (arg == "-sa" || arg == "--save-audio") { params.save_audio = true; } else if (arg == "-sa" || arg == "--save-audio") { params.save_audio = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; }
else { else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@ -112,7 +98,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx); fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold); fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold); fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false"); fprintf(stderr, " -nf, --no-fallback [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false"); fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
@ -123,6 +108,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false"); fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false");
fprintf(stderr, " -sa, --save-audio [%-7s] save the recorded audio to a file\n", params.save_audio ? "true" : "false"); fprintf(stderr, " -sa, --save-audio [%-7s] save the recorded audio to a file\n", params.save_audio ? "true" : "false");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU inference\n", params.use_gpu ? "false" : "true"); fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU inference\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention during inference\n", params.flash_attn ? "true" : "false");
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
@ -167,7 +153,9 @@ int main(int argc, char ** argv) {
} }
struct whisper_context_params cparams = whisper_context_default_params(); struct whisper_context_params cparams = whisper_context_default_params();
cparams.use_gpu = params.use_gpu;
cparams.use_gpu = params.use_gpu;
cparams.flash_attn = params.flash_attn;
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams); struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
@ -323,7 +311,6 @@ int main(int argc, char ** argv) {
wparams.n_threads = params.n_threads; wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx; wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.tdrz_enable = params.tinydiarize; // [TDRZ] wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
@ -372,7 +359,7 @@ int main(int argc, char ** argv) {
const int64_t t0 = whisper_full_get_segment_t0(ctx, i); const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i); const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
std::string output = "[" + to_timestamp(t0) + " --> " + to_timestamp(t1) + "] " + text; std::string output = "[" + to_timestamp(t0, false) + " --> " + to_timestamp(t1, false) + "] " + text;
if (whisper_full_get_segment_speaker_turn_next(ctx, i)) { if (whisper_full_get_segment_speaker_turn_next(ctx, i)) {
output += " [SPEAKER_TURN]"; output += " [SPEAKER_TURN]";

View File

@ -0,0 +1,9 @@
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
set(TARGET ls-sycl-device)
add_executable(${TARGET} ls-sycl-device.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

examples/sycl/README.md (new file, 47 lines)
View File

@ -0,0 +1,47 @@
# llama.cpp/example/sycl
This example program provides the tools for llama.cpp for SYCL on Intel GPU.
## Tool
|Tool Name| Function|Status|
|-|-|-|
|ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, etc.|Support|
### ls-sycl-device
List all SYCL devices with ID, compute capability, max work group size, etc.
1. Build the llama.cpp for SYCL for all targets.
2. Enable oneAPI running environment
```
source /opt/intel/oneapi/setvars.sh
```
3. Execute
```
./build/bin/ls-sycl-device
```
Check the ID in startup log, like:
```
found 4 SYCL devices:
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```
|Attribute|Note|
|-|-|
|compute capability 1.3|Level-zero runtime, recommended|
|compute capability 3.0|OpenCL runtime, slower than Level-zero in most cases|

examples/sycl/build.sh (new file, 19 lines)
View File

@ -0,0 +1,19 @@
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
mkdir -p build
cd build
source /opt/intel/oneapi/setvars.sh
#for FP16
#cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DWHISPER_SYCL_F16=ON # faster for long-prompt inference
#for FP32
cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build example/main only
#cmake --build . --config Release --target main
#build all binary
cmake --build . --config Release -v

View File

@ -0,0 +1,11 @@
/*MIT license
Copyright (C) 2024 Intel Corporation
SPDX-License-Identifier: MIT
*/
#include "ggml-sycl.h"
int main(int argc, char ** argv) {
ggml_backend_sycl_print_sycl_devices();
return 0;
}

View File

@ -0,0 +1,17 @@
#!/bin/bash
# MIT license
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: MIT
INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
source /opt/intel/oneapi/setvars.sh
if [ $# -gt 0 ]; then
export GGML_SYCL_DEVICE=$1
else
export GGML_SYCL_DEVICE=0
fi
echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
#export GGML_SYCL_DEBUG=1
./build/bin/main -m models/ggml-base.en.bin -f samples/jfk.wav

View File

@ -1 +1,2 @@
audio.mp3 audio.mp3
to_speak.txt

View File

@ -1,7 +1,7 @@
if (WHISPER_SDL2) if (WHISPER_SDL2)
# talk-llama # talk-llama
set(TARGET talk-llama) set(TARGET talk-llama)
add_executable(${TARGET} talk-llama.cpp llama.cpp) add_executable(${TARGET} talk-llama.cpp llama.cpp unicode.cpp unicode-data.cpp)
target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS}) target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
if (WHISPER_CLBLAST) if (WHISPER_CLBLAST)

View File

@ -15,9 +15,13 @@ https://github.com/ggerganov/whisper.cpp/assets/1991296/d97a3788-bf2a-4756-9a43-
The `talk-llama` tool depends on SDL2 library to capture audio from the microphone. You can build it like this: The `talk-llama` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
```bash ```bash
# Install SDL2 on Linux # Install SDL2
# On Debian based linux distributions:
sudo apt-get install libsdl2-dev sudo apt-get install libsdl2-dev
# On Fedora Linux:
sudo dnf install SDL2 SDL2-devel
# Install SDL2 on Mac OS # Install SDL2 on Mac OS
brew install sdl2 brew install sdl2

View File

@ -1,20 +1,80 @@
import sys import sys
import importlib.util import argparse
import textwrap
if importlib.util.find_spec("elevenlabs") is None: parser = argparse.ArgumentParser(add_help=False,
print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'") formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("-q", "--quick", action="store_true",
help="skip checking the required library")
modes = parser.add_argument_group("action")
modes.add_argument("inputfile", metavar="TEXTFILE",
nargs='?', type=argparse.FileType(), default=sys.stdin,
help="read the text file (default: stdin)")
modes.add_argument("-l", "--list", action="store_true",
help="show the list of voices and exit")
modes.add_argument("-h", "--help", action="help",
help="show this help and exit")
selopts = parser.add_argument_group("voice selection")
selmodes = selopts.add_mutually_exclusive_group()
selmodes.add_argument("-n", "--name",
default="Arnold",
help="get a voice object by name (default: Arnold)")
selmodes.add_argument("-v", "--voice", type=int, metavar="NUMBER",
help="get a voice object by number (see --list)")
selopts.add_argument("-f", "--filter", action="append", metavar="KEY=VAL",
default=["use case=narration"],
help=textwrap.dedent('''\
filter voices by labels (default: "use case=narration")
this option can be used multiple times
filtering will be disabled if the first -f has no "=" (e.g. -f "any")
'''))
outmodes = parser.add_argument_group("output")
outgroup = outmodes.add_mutually_exclusive_group()
outgroup.add_argument("-s", "--save", metavar="FILE",
default="audio.mp3",
help="save the TTS to a file (default: audio.mp3)")
outgroup.add_argument("-p", "--play", action="store_true",
help="play the TTS with ffplay")
args = parser.parse_args()
if not args.quick:
import importlib.util
if importlib.util.find_spec("elevenlabs") is None:
print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'")
sys.exit()
from elevenlabs import voices, generate, play, save
if args.filter and "=" in args.filter[0]:
voicelist = voices()
for f in args.filter:
label, value = f.split("=")
voicelist = filter(lambda x: x.labels.get(label) == value, voicelist)
voicelist = list(voicelist)
else:
voicelist = list(voices())
if args.list:
for i, v in enumerate(voicelist):
print(str(i) + ": " + v.name + " " + str(v.labels))
sys.exit() sys.exit()
from elevenlabs import generate, play, save if args.voice:
voice = voicelist[args.voice % len(voicelist)]
else:
voice = args.name
# if -n should consult -f, use the following
#voice = next(x for x in voicelist if x.name == args.name)
# Get a Voice object, by name or UUID
voice = "Arnold" #Possible Voices: Adam Antoni Arnold Bella Domi Elli Josh
# Generate the TTS
audio = generate( audio = generate(
text=str(sys.argv[2:]), text=str(args.inputfile.read()),
voice=voice voice=voice
) )
if args.play:
# Save the TTS to a file play(audio)
save(audio, "audio.mp3") else:
save(audio, args.save)

File diff suppressed because it is too large

View File

@ -37,9 +37,13 @@
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
#define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 4 #define LLAMA_SESSION_VERSION 6
#define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ
#define LLAMA_STATE_SEQ_VERSION 1
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
@ -59,12 +63,42 @@ extern "C" {
typedef int32_t llama_seq_id; typedef int32_t llama_seq_id;
enum llama_vocab_type { enum llama_vocab_type {
LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab
LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
LLAMA_VOCAB_TYPE_WPM = 2, // WordPiece LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
}; };
enum llama_token_type { // pre-tokenization types
enum llama_vocab_pre_type {
LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
};
// note: these values should be synchronized with ggml_rope
// TODO: maybe move this enum to ggml.h (ggml_rope_type)
enum llama_rope_type {
LLAMA_ROPE_TYPE_NONE = -1,
LLAMA_ROPE_TYPE_NORM = 0,
LLAMA_ROPE_TYPE_NEOX = 2,
LLAMA_ROPE_TYPE_GLM = 4,
};
enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
LLAMA_TOKEN_TYPE_UNDEFINED = 0, LLAMA_TOKEN_TYPE_UNDEFINED = 0,
LLAMA_TOKEN_TYPE_NORMAL = 1, LLAMA_TOKEN_TYPE_NORMAL = 1,
LLAMA_TOKEN_TYPE_UNKNOWN = 2, LLAMA_TOKEN_TYPE_UNKNOWN = 2,
@ -74,6 +108,20 @@ extern "C" {
LLAMA_TOKEN_TYPE_BYTE = 6, LLAMA_TOKEN_TYPE_BYTE = 6,
}; };
enum llama_token_attr {
LLAMA_TOKEN_ATTR_UNDEFINED = 0,
LLAMA_TOKEN_ATTR_UNKNOWN = 1 << 0,
LLAMA_TOKEN_ATTR_UNUSED = 1 << 1,
LLAMA_TOKEN_ATTR_NORMAL = 1 << 2,
LLAMA_TOKEN_ATTR_CONTROL = 1 << 3, // SPECIAL?
LLAMA_TOKEN_ATTR_USER_DEFINED = 1 << 4,
LLAMA_TOKEN_ATTR_BYTE = 1 << 5,
LLAMA_TOKEN_ATTR_NORMALIZED = 1 << 6,
LLAMA_TOKEN_ATTR_LSTRIP = 1 << 7,
LLAMA_TOKEN_ATTR_RSTRIP = 1 << 8,
LLAMA_TOKEN_ATTR_SINGLE_WORD = 1 << 9,
};
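Since llama_token_attr is a bit mask rather than a plain enumeration, a single token can carry several attributes at once; a small sketch of testing them (the accessor that yields a token's attributes is not shown in this hunk):

```cpp
// Sketch: test bit-flag token attributes with bitwise AND; `attr` is assumed
// to come from whatever accessor the library exposes for token attributes.
static bool token_needs_lstrip(enum llama_token_attr attr) {
    return (attr & LLAMA_TOKEN_ATTR_LSTRIP) != 0;
}

static bool token_is_control(enum llama_token_attr attr) {
    return (attr & LLAMA_TOKEN_ATTR_CONTROL) != 0;
}
```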
// model file types // model file types
enum llama_ftype { enum llama_ftype {
LLAMA_FTYPE_ALL_F32 = 0, LLAMA_FTYPE_ALL_F32 = 0,
@ -98,24 +146,41 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ3_XS = 22, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ3_S = 26, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ3_M = 27, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
}; };
enum llama_rope_scaling_type { enum llama_rope_scaling_type {
LLAMA_ROPE_SCALING_UNSPECIFIED = -1, LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED = -1,
LLAMA_ROPE_SCALING_NONE = 0, LLAMA_ROPE_SCALING_TYPE_NONE = 0,
LLAMA_ROPE_SCALING_LINEAR = 1, LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
LLAMA_ROPE_SCALING_YARN = 2, LLAMA_ROPE_SCALING_TYPE_YARN = 2,
LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN,
};
enum llama_pooling_type {
LLAMA_POOLING_TYPE_UNSPECIFIED = -1,
LLAMA_POOLING_TYPE_NONE = 0,
LLAMA_POOLING_TYPE_MEAN = 1,
LLAMA_POOLING_TYPE_CLS = 2,
LLAMA_POOLING_TYPE_LAST = 3,
}; };
enum llama_split_mode { enum llama_split_mode {
LLAMA_SPLIT_NONE = 0, // single GPU LLAMA_SPLIT_MODE_NONE = 0, // single GPU
LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
LLAMA_SPLIT_ROW = 2, // split rows across GPUs LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
}; };
typedef struct llama_token_data { typedef struct llama_token_data {
@ -130,7 +195,7 @@ extern "C" {
bool sorted; bool sorted;
} llama_token_data_array; } llama_token_data_array;
typedef bool (*llama_progress_callback)(float progress, void *ctx); typedef bool (*llama_progress_callback)(float progress, void * user_data);
// Input data for llama_decode // Input data for llama_decode
// A llama_batch object can contain input about one or many sequences // A llama_batch object can contain input about one or many sequences
@ -140,7 +205,7 @@ extern "C" {
// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL) // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
// - pos : the positions of the respective token in the sequence // - pos : the positions of the respective token in the sequence
// - seq_id : the sequence to which the respective token belongs // - seq_id : the sequence to which the respective token belongs
// - logits : if zero, the logits for the respective token will not be output // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
// //
typedef struct llama_batch { typedef struct llama_batch {
int32_t n_tokens; int32_t n_tokens;
@ -150,7 +215,7 @@ extern "C" {
llama_pos * pos; llama_pos * pos;
int32_t * n_seq_id; int32_t * n_seq_id;
llama_seq_id ** seq_id; llama_seq_id ** seq_id;
int8_t * logits; int8_t * logits; // TODO: rename this to "output"
// NOTE: helpers for smooth API transition - can be deprecated in the future // NOTE: helpers for smooth API transition - can be deprecated in the future
// for future-proof code, use the above fields instead and ignore everything below // for future-proof code, use the above fields instead and ignore everything below
@ -163,18 +228,22 @@ extern "C" {
} llama_batch; } llama_batch;
enum llama_model_kv_override_type { enum llama_model_kv_override_type {
LLAMA_KV_OVERRIDE_INT, LLAMA_KV_OVERRIDE_TYPE_INT,
LLAMA_KV_OVERRIDE_FLOAT, LLAMA_KV_OVERRIDE_TYPE_FLOAT,
LLAMA_KV_OVERRIDE_BOOL, LLAMA_KV_OVERRIDE_TYPE_BOOL,
LLAMA_KV_OVERRIDE_TYPE_STR,
}; };
struct llama_model_kv_override { struct llama_model_kv_override {
char key[128];
enum llama_model_kv_override_type tag; enum llama_model_kv_override_type tag;
char key[128];
union { union {
int64_t int_value; int64_t val_i64;
double float_value; double val_f64;
bool bool_value; bool val_bool;
char val_str[128];
}; };
}; };
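A small sketch of filling in the reshaped override struct with the renamed union members; the key and value used here are purely illustrative:

```cpp
// Sketch: a string-typed model KV override using the renamed val_* fields.
#include <cstdio>

static llama_model_kv_override make_str_override(const char * key, const char * value) {
    llama_model_kv_override ov{};
    ov.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
    std::snprintf(ov.key,     sizeof(ov.key),     "%s", key);
    std::snprintf(ov.val_str, sizeof(ov.val_str), "%s", value);
    return ov;
}

// e.g. make_str_override("some.metadata.key", "some-value")  // key/value purely illustrative
```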
@ -191,6 +260,9 @@ extern "C" {
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
const float * tensor_split; const float * tensor_split;
// comma separated list of RPC servers to use for offloading
const char * rpc_servers;
// Called with a progress value between 0.0 and 1.0. Pass NULL to disable. // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
// If the provided progress_callback returns true, model loading continues. // If the provided progress_callback returns true, model loading continues.
// If it returns false, model loading is immediately aborted. // If it returns false, model loading is immediately aborted.
@ -203,18 +275,25 @@ extern "C" {
const struct llama_model_kv_override * kv_overrides; const struct llama_model_kv_override * kv_overrides;
// Keep the booleans together to avoid misalignment during copy-by-value. // Keep the booleans together to avoid misalignment during copy-by-value.
bool vocab_only; // only load the vocabulary, no weights bool vocab_only; // only load the vocabulary, no weights
bool use_mmap; // use mmap if possible bool use_mmap; // use mmap if possible
bool use_mlock; // force system to keep model in RAM bool use_mlock; // force system to keep model in RAM
bool check_tensors; // validate model tensor data
}; };
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
// https://github.com/ggerganov/llama.cpp/pull/7544
struct llama_context_params { struct llama_context_params {
uint32_t seed; // RNG seed, -1 for random uint32_t seed; // RNG seed, -1 for random
uint32_t n_ctx; // text context, 0 = from model uint32_t n_ctx; // text context, 0 = from model
uint32_t n_batch; // prompt processing maximum batch size uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
uint32_t n_ubatch; // physical maximum batch size
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
uint32_t n_threads; // number of threads to use for generation uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing uint32_t n_threads_batch; // number of threads to use for batch processing
int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
// ref: https://github.com/ggerganov/llama.cpp/pull/2054 // ref: https://github.com/ggerganov/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency, 0 = from model float rope_freq_base; // RoPE base frequency, 0 = from model
@ -224,29 +303,40 @@ extern "C" {
float yarn_beta_fast; // YaRN low correction dim float yarn_beta_fast; // YaRN low correction dim
float yarn_beta_slow; // YaRN high correction dim float yarn_beta_slow; // YaRN high correction dim
uint32_t yarn_orig_ctx; // YaRN original context size uint32_t yarn_orig_ctx; // YaRN original context size
float defrag_thold; // defragment the KV cache if holes/size > thold, < 0 disabled (default)
ggml_backend_sched_eval_callback cb_eval; ggml_backend_sched_eval_callback cb_eval;
void * cb_eval_user_data; void * cb_eval_user_data;
enum ggml_type type_k; // data type for K cache enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
enum ggml_type type_v; // data type for V cache enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
// Keep the booleans together to avoid misalignment during copy-by-value. // Keep the booleans together to avoid misalignment during copy-by-value.
bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true) bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) bool embeddings; // if true, extract embeddings (together with logits)
bool embedding; // embedding mode only
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
// Abort callback
// if it returns true, execution of llama_decode() will be aborted
// currently works only with CPU execution
ggml_abort_callback abort_callback;
void * abort_callback_data;
}; };
// model quantization parameters // model quantization parameters
typedef struct llama_model_quantize_params { typedef struct llama_model_quantize_params {
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype enum llama_ftype ftype; // quantize to this llama_ftype
bool allow_requantize; // allow quantizing non-f32/f16 tensors enum ggml_type output_tensor_type; // output tensor type
bool quantize_output_tensor; // quantize output.weight enum ggml_type token_embedding_type; // itoken embeddings tensor type
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool pure; // disable k-quant mixtures and quantize all tensors to the same type bool quantize_output_tensor; // quantize output.weight
void * imatrix; // pointer to importance matrix data bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
bool pure; // quantize all tensors to the default type
bool keep_split; // quantize to the same number of shards
void * imatrix; // pointer to importance matrix data
void * kv_overrides; // pointer to vector containing overrides
} llama_model_quantize_params; } llama_model_quantize_params;
// grammar types // grammar types
@ -276,6 +366,9 @@ extern "C" {
// modifies a preceding LLAMA_GRETYPE_CHAR or // modifies a preceding LLAMA_GRETYPE_CHAR or
// LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA]) // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
LLAMA_GRETYPE_CHAR_ALT = 6, LLAMA_GRETYPE_CHAR_ALT = 6,
// any character (.)
LLAMA_GRETYPE_CHAR_ANY = 7,
}; };
typedef struct llama_grammar_element { typedef struct llama_grammar_element {
@ -297,6 +390,12 @@ extern "C" {
int32_t n_eval; int32_t n_eval;
}; };
// used in chat template
typedef struct llama_chat_message {
const char * role;
const char * content;
} llama_chat_message;
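A hedged usage sketch for the new struct; it assumes llama_chat_apply_template() from elsewhere in llama.h (not shown in this hunk) and an already loaded `model`:

```cpp
// Sketch: render a two-message chat with the model's built-in template;
// passing nullptr as the template name falls back to the model's own template.
#include <algorithm>
#include <string>
#include <vector>

static std::string render_chat(const struct llama_model * model) {
    std::vector<llama_chat_message> chat = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!" },
    };

    std::vector<char> buf(4096);
    const int32_t n = llama_chat_apply_template(model, nullptr /* model's template */,
                                                chat.data(), chat.size(),
                                                /*add_ass=*/true, buf.data(), (int32_t) buf.size());

    return n > 0 ? std::string(buf.data(), std::min<int32_t>(n, (int32_t) buf.size())) : std::string();
}
```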
// Helpers for getting default parameters // Helpers for getting default parameters
LLAMA_API struct llama_model_params llama_model_default_params(void); LLAMA_API struct llama_model_params llama_model_default_params(void);
LLAMA_API struct llama_context_params llama_context_default_params(void); LLAMA_API struct llama_context_params llama_context_default_params(void);
@ -305,7 +404,10 @@ extern "C" {
// Initialize the llama + ggml backend // Initialize the llama + ggml backend
// If numa is true, use NUMA optimizations // If numa is true, use NUMA optimizations
// Call once at the start of the program // Call once at the start of the program
LLAMA_API void llama_backend_init(bool numa); LLAMA_API void llama_backend_init(void);
//optional:
LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
// Call once at the end of the program - currently only used for MPI // Call once at the end of the program - currently only used for MPI
LLAMA_API void llama_backend_free(void); LLAMA_API void llama_backend_free(void);
@ -331,19 +433,22 @@ extern "C" {
LLAMA_API bool llama_supports_mlock (void); LLAMA_API bool llama_supports_mlock (void);
LLAMA_API bool llama_supports_gpu_offload(void); LLAMA_API bool llama_supports_gpu_offload(void);
LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");
LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx); LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model); LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
LLAMA_API int32_t llama_n_vocab (const struct llama_model * model); LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model); LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
LLAMA_API int32_t llama_n_embd (const struct llama_model * model); LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
// Get the model's RoPE frequency scaling factor // Get the model's RoPE frequency scaling factor
LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model); LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
@ -389,20 +494,26 @@ extern "C" {
// The model needs to be reloaded before applying a new adapter, otherwise the adapter // The model needs to be reloaded before applying a new adapter, otherwise the adapter
// will be applied on top of the previous one // will be applied on top of the previous one
// Returns 0 on success // Returns 0 on success
LLAMA_API DEPRECATED(int32_t llama_apply_lora_from_file(
struct llama_context * ctx,
const char * path_lora,
float scale,
const char * path_base_model,
int32_t n_threads),
"use llama_model_apply_lora_from_file instead");
LLAMA_API int32_t llama_model_apply_lora_from_file( LLAMA_API int32_t llama_model_apply_lora_from_file(
const struct llama_model * model, const struct llama_model * model,
const char * path_lora, const char * path_lora,
float scale, float scale,
const char * path_base_model, const char * path_base_model,
int32_t n_threads); int32_t n_threads);
// Apply a loaded control vector to a llama_context, or if data is NULL, clear
// the currently loaded vector.
// n_embd should be the size of a single layer's control, and data should point
// to an n_embd x n_layers buffer starting from layer 1.
// il_start and il_end are the layer range the vector should apply to (both inclusive)
// See llama_control_vector_load in common to load a control vector.
LLAMA_API int32_t llama_control_vector_apply(
struct llama_context * lctx,
const float * data,
size_t len,
int32_t n_embd,
int32_t il_start,
int32_t il_end);
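A sketch of the call, with the buffer laid out as described in the comment above (n_embd values per layer, starting from layer 1); the all-zero data and the layer range are illustrative:

```cpp
// Sketch: apply an (illustrative, all-zero) control vector to layers 1..n_layers.
// Passing data == NULL instead clears the currently loaded vector.
#include <vector>

static int apply_zero_control_vector(struct llama_context * lctx, int32_t n_embd, int32_t n_layers) {
    std::vector<float> cvec((size_t) n_embd * n_layers, 0.0f); // n_embd floats per layer, layers 1..n_layers
    return llama_control_vector_apply(lctx, cvec.data(), cvec.size(), n_embd,
                                      /*il_start=*/1, /*il_end=*/n_layers);
}
```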
// //
// KV cache // KV cache
@ -423,7 +534,7 @@ extern "C" {
// Maximum number of sequences that can exist in a cell. It's not an error // Maximum number of sequences that can exist in a cell. It's not an error
// if there are more sequences in a cell than this value, however they will // if there are more sequences in a cell than this value, however they will
// not be visible in the view cells_sequences. // not be visible in the view cells_sequences.
int32_t n_max_seq; int32_t n_seq_max;
// Number of tokens in the cache. For example, if there are two populated // Number of tokens in the cache. For example, if there are two populated
// cells, the first with 1 sequence id in it and the second with 2 sequence // cells, the first with 1 sequence id in it and the second with 2 sequence
@ -443,12 +554,12 @@ extern "C" {
// Information for an individual cell. // Information for an individual cell.
struct llama_kv_cache_view_cell * cells; struct llama_kv_cache_view_cell * cells;
// The sequences for each cell. There will be n_max_seq items per cell. // The sequences for each cell. There will be n_seq_max items per cell.
llama_seq_id * cells_sequences; llama_seq_id * cells_sequences;
}; };
// Create an empty KV cache view. (use only for debugging purposes) // Create an empty KV cache view. (use only for debugging purposes)
LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq); LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_seq_max);
// Free a KV cache view. (use only for debugging purposes) // Free a KV cache view. (use only for debugging purposes)
LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view); LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
@ -463,15 +574,16 @@ extern "C" {
// Returns the number of used KV cells (i.e. have at least one sequence assigned to them) // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx); LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
// Clear the KV cache // Clear the KV cache - both cell info is erased and KV data is zeroed
LLAMA_API void llama_kv_cache_clear( LLAMA_API void llama_kv_cache_clear(
struct llama_context * ctx); struct llama_context * ctx);
// Removes all tokens that belong to the specified sequence and have positions in [p0, p1) // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
// Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails
// seq_id < 0 : match any sequence // seq_id < 0 : match any sequence
// p0 < 0 : [0, p1] // p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf) // p1 < 0 : [p0, inf)
LLAMA_API void llama_kv_cache_seq_rm( LLAMA_API bool llama_kv_cache_seq_rm(
struct llama_context * ctx, struct llama_context * ctx,
llama_seq_id seq_id, llama_seq_id seq_id,
llama_pos p0, llama_pos p0,
@ -494,10 +606,12 @@ extern "C" {
llama_seq_id seq_id); llama_seq_id seq_id);
// Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1) // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
// If the KV cache is RoPEd, the KV data is updated accordingly // If the KV cache is RoPEd, the KV data is updated accordingly:
// - lazily on next llama_decode()
// - explicitly with llama_kv_cache_update()
// p0 < 0 : [0, p1] // p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf) // p1 < 0 : [p0, inf)
LLAMA_API void llama_kv_cache_seq_shift( LLAMA_API void llama_kv_cache_seq_add(
struct llama_context * ctx, struct llama_context * ctx,
llama_seq_id seq_id, llama_seq_id seq_id,
llama_pos p0, llama_pos p0,
@ -505,7 +619,9 @@ extern "C" {
llama_pos delta); llama_pos delta);
// Integer division of the positions by factor of `d > 1` // Integer division of the positions by factor of `d > 1`
// If the KV cache is RoPEd, the KV data is updated accordingly // If the KV cache is RoPEd, the KV data is updated accordingly:
// - lazily on next llama_decode()
// - explicitly with llama_kv_cache_update()
// p0 < 0 : [0, p1] // p0 < 0 : [0, p1]
// p1 < 0 : [p0, inf) // p1 < 0 : [p0, inf)
LLAMA_API void llama_kv_cache_seq_div( LLAMA_API void llama_kv_cache_seq_div(
@ -515,66 +631,117 @@ extern "C" {
llama_pos p1, llama_pos p1,
int d); int d);
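As an illustration of the renamed helpers, the usual context-shift pattern drops the oldest tokens of a sequence and slides the remainder back; `n_keep` and `n_discard` below are placeholders:

```cpp
// Sketch: keep the first n_keep tokens of sequence 0, drop the next n_discard,
// and shift the remaining tokens back by n_discard positions.
static void context_shift(struct llama_context * ctx, llama_pos n_keep, llama_pos n_discard) {
    llama_kv_cache_seq_rm (ctx, 0, n_keep,             n_keep + n_discard);
    llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, -1, -n_discard);
}
```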
// Returns the largest position present in the KV cache for the specified sequence
LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
struct llama_context * ctx,
llama_seq_id seq_id);
// Defragment the KV cache
// This will be applied:
// - lazily on next llama_decode()
// - explicitly with llama_kv_cache_update()
LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
// //
// State / sessions // State / sessions
// //
// Returns the maximum size in bytes of the state (rng, logits, embedding // Returns the maximum size in bytes of the state (rng, logits, embedding
// and kv_cache) - will often be smaller after compacting tokens // and kv_cache) - will often be smaller after compacting tokens
LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx); LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx);
LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx),
"use llama_state_get_size instead");
// Copies the state to the specified destination address. // Copies the state to the specified destination address.
// Destination needs to have allocated enough memory. // Destination needs to have allocated enough memory.
// Returns the number of bytes copied // Returns the number of bytes copied
LLAMA_API size_t llama_copy_state_data( LLAMA_API size_t llama_state_get_data(
struct llama_context * ctx, struct llama_context * ctx,
uint8_t * dst); uint8_t * dst);
LLAMA_API DEPRECATED(size_t llama_copy_state_data(
struct llama_context * ctx,
uint8_t * dst),
"use llama_state_get_data instead");
// Set the state reading from the specified address // Set the state reading from the specified address
// Returns the number of bytes read // Returns the number of bytes read
LLAMA_API size_t llama_set_state_data( LLAMA_API size_t llama_state_set_data(
struct llama_context * ctx, struct llama_context * ctx,
uint8_t * src); const uint8_t * src);
LLAMA_API DEPRECATED(size_t llama_set_state_data(
struct llama_context * ctx,
const uint8_t * src),
"use llama_state_set_data instead");
// Save/load session file // Save/load session file
LLAMA_API bool llama_load_session_file( LLAMA_API bool llama_state_load_file(
struct llama_context * ctx, struct llama_context * ctx,
const char * path_session, const char * path_session,
llama_token * tokens_out, llama_token * tokens_out,
size_t n_token_capacity, size_t n_token_capacity,
size_t * n_token_count_out); size_t * n_token_count_out);
LLAMA_API DEPRECATED(bool llama_load_session_file(
struct llama_context * ctx,
const char * path_session,
llama_token * tokens_out,
size_t n_token_capacity,
size_t * n_token_count_out),
"use llama_state_load_file instead");
LLAMA_API bool llama_save_session_file( LLAMA_API bool llama_state_save_file(
struct llama_context * ctx, struct llama_context * ctx,
const char * path_session, const char * path_session,
const llama_token * tokens, const llama_token * tokens,
size_t n_token_count); size_t n_token_count);
LLAMA_API DEPRECATED(bool llama_save_session_file(
struct llama_context * ctx,
const char * path_session,
const llama_token * tokens,
size_t n_token_count),
"use llama_state_save_file instead");
// Get the exact size needed to copy the KV cache of a single sequence
LLAMA_API size_t llama_state_seq_get_size(
struct llama_context * ctx,
llama_seq_id seq_id);
// Copy the KV cache of a single sequence into the specified buffer
LLAMA_API size_t llama_state_seq_get_data(
struct llama_context * ctx,
uint8_t * dst,
llama_seq_id seq_id);
// Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
// Returns:
// - Positive: Ok
// - Zero: Failed to load
LLAMA_API size_t llama_state_seq_set_data(
struct llama_context * ctx,
const uint8_t * src,
llama_seq_id dest_seq_id);
LLAMA_API size_t llama_state_seq_save_file(
struct llama_context * ctx,
const char * filepath,
llama_seq_id seq_id,
const llama_token * tokens,
size_t n_token_count);
LLAMA_API size_t llama_state_seq_load_file(
struct llama_context * ctx,
const char * filepath,
llama_seq_id dest_seq_id,
llama_token * tokens_out,
size_t n_token_capacity,
size_t * n_token_count_out);
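// Example (illustrative sketch, not from the diff): round-tripping the KV cache of one sequence
// through a buffer with the new llama_state_seq_* API. The helper name and sequence ids are
// hypothetical, and <vector>/<cstdint> are assumed to be available.
static bool example_copy_seq_state(struct llama_context * ctx, llama_seq_id src, llama_seq_id dst) {
    std::vector<uint8_t> buf(llama_state_seq_get_size(ctx, src));
    llama_state_seq_get_data(ctx, buf.data(), src);            // save sequence `src`
    return llama_state_seq_set_data(ctx, buf.data(), dst) > 0; // restore into `dst`; 0 means failure
}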
// //
// Decoding // Decoding
// //
// Run the llama inference to obtain the logits and probabilities for the next token(s).
// tokens + n_tokens is the provided batch of new tokens to process
// n_past is the number of tokens to use from previous eval calls
// Returns 0 on success
// DEPRECATED: use llama_decode() instead
LLAMA_API DEPRECATED(int llama_eval(
struct llama_context * ctx,
llama_token * tokens,
int32_t n_tokens,
int32_t n_past),
"use llama_decode() instead");
// Same as llama_eval, but use float matrix input directly.
// DEPRECATED: use llama_decode() instead
LLAMA_API DEPRECATED(int llama_eval_embd(
struct llama_context * ctx,
float * embd,
int32_t n_tokens,
int32_t n_past),
"use llama_decode() instead");
// Return batch for single sequence of tokens starting at pos_0 // Return batch for single sequence of tokens starting at pos_0
// //
// NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
@ -613,21 +780,61 @@ extern "C" {
// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens) // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch); LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
// Token logits obtained from the last call to llama_eval() // Get the number of threads used for generation of a single token.
// The logits for the last token are stored in the last row LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
// Logits for which llama_batch.logits[i] == 0 are undefined
// Rows: n_tokens provided with llama_batch // Get the number of threads used for prompt and batch processing (multiple tokens).
LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
// Set whether the model is in embeddings mode or not
// If true, embeddings will be returned but logits will not
LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
// Set whether to use causal attention or not
// If set to true, the model will only attend to the past tokens
LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
// Set abort callback
LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data);
// Wait until all computations are finished
// This is automatically done when using one of the functions below to obtain the computation results
// and it is not necessary to call it explicitly in most cases
LLAMA_API void llama_synchronize(struct llama_context * ctx);
// Token logits obtained from the last call to llama_decode()
// The logits for which llama_batch.logits[i] != 0 are stored contiguously
// in the order they have appeared in the batch.
// Rows: number of tokens for which llama_batch.logits[i] != 0
// Cols: n_vocab // Cols: n_vocab
LLAMA_API float * llama_get_logits(struct llama_context * ctx); LLAMA_API float * llama_get_logits(struct llama_context * ctx);
// Logits for the ith token. Equivalent to: // Logits for the ith token. For positive indices, equivalent to:
// llama_get_logits(ctx) + i*n_vocab // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
// Negative indices can be used to access logits in reverse order, -1 is the last logit.
// returns NULL for invalid ids.
LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i); LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
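// Example (sketch, not part of the diff): greedy pick from the logits of the last output token
// using the negative-index accessor; the helper name is hypothetical and n_vocab comes from the model.
static llama_token example_greedy_last(struct llama_context * ctx, int32_t n_vocab) {
    const float * logits = llama_get_logits_ith(ctx, -1); // -1 == last output, NULL if invalid
    if (logits == NULL) {
        return -1; // no logits were requested for the last token
    }
    llama_token best = 0;
    for (llama_token id = 1; id < n_vocab; ++id) {
        if (logits[id] > logits[best]) {
            best = id;
        }
    }
    return best;
}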
// Get the embeddings for the input // Get all output token embeddings.
// shape: [n_embd] (1-dimensional) // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model,
// the embeddings for which llama_batch.logits[i] != 0 are stored contiguously
// in the order they have appeared in the batch.
// shape: [n_outputs*n_embd]
// Otherwise, returns NULL.
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
// Get the embeddings for the ith token. For positive indices, equivalent to:
// llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
// Negative indices can be used to access embeddings in reverse order, -1 is the last embedding.
// shape: [n_embd] (1-dimensional)
// returns NULL for invalid ids.
LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
// Get the embeddings for a sequence id
// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
// shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
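// Example (sketch, not part of the diff): retrieving embeddings after llama_decode() on an
// embeddings-enabled context; which accessor applies depends on the pooling type. Hypothetical helper.
static const float * example_get_embedding(struct llama_context * ctx, llama_seq_id seq_id) {
    const float * emb = llama_get_embeddings_seq(ctx, seq_id); // pooled, shape [n_embd]
    if (emb == NULL) {
        emb = llama_get_embeddings_ith(ctx, -1);               // per-token fallback, last output
    }
    return emb;
}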
// //
// Vocab // Vocab
// //
@ -636,11 +843,19 @@ extern "C" {
LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token); LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token);
LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token); LLAMA_API enum llama_token_attr llama_token_get_attr(const struct llama_model * model, llama_token token);
// Check if the token is supposed to end generation (end-of-generation, e.g. EOS, EOT, etc.)
LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
// Identify if the token id is a control token or a renderable token
LLAMA_API bool llama_token_is_control(const struct llama_model * model, llama_token token);
// Special tokens // Special tokens
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
// Returns -1 if unknown, 1 for true or 0 for false. // Returns -1 if unknown, 1 for true or 0 for false.
@ -649,7 +864,7 @@ extern "C" {
// Returns -1 if unknown, 1 for true or 0 for false. // Returns -1 if unknown, 1 for true or 0 for false.
LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model); LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
// codellama infill tokens // Codellama infill tokens
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
@ -661,27 +876,48 @@ extern "C" {
/// @details Convert the provided text into tokens. /// @details Convert the provided text into tokens.
/// @param tokens The tokens pointer must be large enough to hold the resulting tokens. /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
/// @return Returns the number of tokens on success, no more than n_max_tokens /// @return Returns the number of tokens on success, no more than n_tokens_max
/// @return Returns a negative number on failure - the number of tokens that would have been returned /// @return Returns a negative number on failure - the number of tokens that would have been returned
/// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
/// Does not insert a leading space. /// as plaintext. Does not insert a leading space.
LLAMA_API int32_t llama_tokenize( LLAMA_API int32_t llama_tokenize(
const struct llama_model * model, const struct llama_model * model,
const char * text, const char * text,
int32_t text_len, int32_t text_len,
llama_token * tokens, llama_token * tokens,
int32_t n_max_tokens, int32_t n_tokens_max,
bool add_bos, bool add_special,
bool special); bool parse_special);
// Token Id -> Piece. // Token Id -> Piece.
// Uses the vocabulary in the provided context. // Uses the vocabulary in the provided context.
// Does not write null terminator to the buffer. // Does not write null terminator to the buffer.
// User code is responsible for removing the leading whitespace of the first non-BOS token when decoding multiple tokens. // User code is responsible for removing the leading whitespace of the first non-BOS token when decoding multiple tokens.
// @param special If true, special tokens are rendered in the output.
LLAMA_API int32_t llama_token_to_piece( LLAMA_API int32_t llama_token_to_piece(
const struct llama_model * model, const struct llama_model * model,
llama_token token, llama_token token,
char * buf, char * buf,
int32_t length,
bool special);
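// Example (sketch, not from the diff): tokenizing with the renamed parameters and converting the
// first token back to text. The helper name and buffer sizes are illustrative; assumes <vector>
// and <cstring> are available.
static int32_t example_tokenize(const struct llama_model * model, const char * text) {
    std::vector<llama_token> toks(256);
    int32_t n = llama_tokenize(model, text, (int32_t) strlen(text), toks.data(), (int32_t) toks.size(),
                               /*add_special=*/true, /*parse_special=*/false);
    if (n < 0) {
        return n; // buffer too small: -n is the required token count
    }
    char piece[64];
    llama_token_to_piece(model, toks[0], piece, sizeof(piece), /*special=*/false);
    return n;
}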
/// Apply chat template. Inspired by hf apply_chat_template() on python.
/// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
/// NOTE: This function does not use a jinja parser. It only supports a pre-defined list of templates. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
/// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model's default chat template will be used instead.
/// @param chat Pointer to a list of multiple llama_chat_message
/// @param n_msg Number of llama_chat_message in this chat
/// @param add_ass Whether to end the prompt with the token(s) that indicate the start of an assistant message.
/// @param buf A buffer to hold the output formatted prompt. The recommended alloc size is 2 * (total number of characters of all messages)
/// @param length The size of the allocated buffer
/// @return The total number of bytes of the formatted prompt. If it is larger than the size of the buffer, you may need to re-alloc it and then re-apply the template.
LLAMA_API int32_t llama_chat_apply_template(
const struct llama_model * model,
const char * tmpl,
const struct llama_chat_message * chat,
size_t n_msg,
bool add_ass,
char * buf,
int32_t length); int32_t length);
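// Example (sketch, not part of the diff): applying the model's built-in template to two placeholder
// messages and growing the buffer when the formatted prompt does not fit. Hypothetical helper;
// assumes <string> and <vector> are available.
static std::string example_apply_template(const struct llama_model * model) {
    llama_chat_message msgs[2] = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello!"                       },
    };
    std::vector<char> buf(1024);
    int32_t n = llama_chat_apply_template(model, nullptr, msgs, 2, /*add_ass=*/true, buf.data(), (int32_t) buf.size());
    if (n > (int32_t) buf.size()) {
        buf.resize(n);
        n = llama_chat_apply_template(model, nullptr, msgs, 2, true, buf.data(), (int32_t) buf.size());
    }
    return std::string(buf.data(), n);
}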
// //
@ -725,13 +961,6 @@ extern "C" {
float * logits_guidance, float * logits_guidance,
float scale); float scale);
LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
struct llama_context * ctx,
llama_token_data_array * candidates,
struct llama_context * guidance_ctx,
float scale),
"use llama_sample_apply_guidance() instead");
/// @details Sorts candidate tokens by their logits in descending order and calculates probabilities based on logits. /// @details Sorts candidate tokens by their logits in descending order and calculates probabilities based on logits.
LLAMA_API void llama_sample_softmax( LLAMA_API void llama_sample_softmax(
struct llama_context * ctx, struct llama_context * ctx,
@ -785,12 +1014,6 @@ extern "C" {
llama_token_data_array * candidates, llama_token_data_array * candidates,
float temp); float temp);
LLAMA_API DEPRECATED(void llama_sample_temperature(
struct llama_context * ctx,
llama_token_data_array * candidates,
float temp),
"use llama_sample_temp instead");
/// @details Apply constraints from grammar /// @details Apply constraints from grammar
LLAMA_API void llama_sample_grammar( LLAMA_API void llama_sample_grammar(
struct llama_context * ctx, struct llama_context * ctx,
@ -829,7 +1052,7 @@ extern "C" {
struct llama_context * ctx, struct llama_context * ctx,
llama_token_data_array * candidates); llama_token_data_array * candidates);
/// @details Randomly selects a token from the candidates based on their probabilities. /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
LLAMA_API llama_token llama_sample_token( LLAMA_API llama_token llama_sample_token(
struct llama_context * ctx, struct llama_context * ctx,
llama_token_data_array * candidates); llama_token_data_array * candidates);
@ -841,48 +1064,18 @@ extern "C" {
llama_token token); llama_token token);
// //
// Beam search // Model split
// //
struct llama_beam_view { /// @details Build a split GGUF final path for this chunk.
const llama_token * tokens; /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf"
// Returns the split_path length.
LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count);
size_t n_tokens; /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match.
float p; // Cumulative beam probability (renormalized relative to all beams) /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0"
bool eob; // Callback should set this to true when a beam is at end-of-beam. // Returns the split_prefix length.
}; LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
// Passed to beam_search_callback function.
// Whenever 0 < common_prefix_length, this number of tokens should be copied from any of the beams
// (e.g. beams[0]) as they will be removed (shifted) from all beams in all subsequent callbacks.
// These pointers are valid only during the synchronous callback, so should not be saved.
struct llama_beams_state {
struct llama_beam_view * beam_views;
size_t n_beams; // Number of elements in beam_views[].
size_t common_prefix_length; // Current max length of prefix tokens shared by all beams.
bool last_call; // True iff this is the last callback invocation.
};
// Type of pointer to the beam_search_callback function.
// void* callback_data is any custom data passed to llama_beam_search, that is subsequently
// passed back to beam_search_callback. This avoids having to use global variables in the callback.
typedef void (*llama_beam_search_callback_fn_t)(void * callback_data, struct llama_beams_state);
/// @details Deterministically returns entire sentence constructed by a beam search.
/// @param ctx Pointer to the llama_context.
/// @param callback Invoked for each iteration of the beam_search loop, passing in beams_state.
/// @param callback_data A pointer that is simply passed back to callback.
/// @param n_beams Number of beams to use.
/// @param n_past Number of tokens already evaluated.
/// @param n_predict Maximum number of tokens to predict. EOS may occur earlier.
LLAMA_API void llama_beam_search(
struct llama_context * ctx,
llama_beam_search_callback_fn_t callback,
void * callback_data,
size_t n_beams,
int32_t n_past,
int32_t n_predict);
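// Example (sketch) mirroring the llama_split_path / llama_split_prefix comments above;
// the helper name and paths are illustrative only.
static void example_split_paths(void) {
    char split_path[512];
    llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4);
    // split_path == "/models/ggml-model-q4_0-00002-of-00004.gguf"

    char split_prefix[512];
    if (llama_split_prefix(split_prefix, sizeof(split_prefix), split_path, 2, 4) > 0) {
        // split_prefix == "/models/ggml-model-q4_0"
    }
}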
// Performance information // Performance information
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
@ -906,15 +1099,49 @@ extern "C" {
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only // Internal API to be implemented by llama.cpp and used by tests/benchmarks only
#ifdef LLAMA_API_INTERNAL #ifdef LLAMA_API_INTERNAL
#include <vector> #include <random>
#include <string> #include <string>
#include <vector>
struct ggml_tensor; struct ggml_tensor;
struct llama_partial_utf8 {
uint32_t value; // bit value so far (unshifted)
int n_remain; // num bytes remaining; -1 indicates invalid sequence
};
struct llama_grammar {
const std::vector<std::vector<llama_grammar_element>> rules;
std::vector<std::vector<const llama_grammar_element *>> stacks;
// buffer for partially generated UTF-8 sequence from accepted tokens
llama_partial_utf8 partial_utf8;
};
struct llama_grammar_candidate {
size_t index;
const uint32_t * code_points;
llama_partial_utf8 partial_utf8;
};
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map( const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
struct llama_context * ctx struct llama_context * ctx
); );
void llama_grammar_accept(
const std::vector<std::vector<llama_grammar_element>> & rules,
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
const uint32_t chr,
std::vector<std::vector<const llama_grammar_element *>> & new_stacks);
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
const std::string & src,
llama_partial_utf8 partial_start);
// Randomly selects a token from the candidates based on their probabilities using the given std::mt19937.
// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng);
#endif // LLAMA_API_INTERNAL #endif // LLAMA_API_INTERNAL
#endif // LLAMA_H #endif // LLAMA_H


@ -1,32 +1,40 @@
#!/bin/bash #!/bin/bash
# Usage: # Usage:
# speak.sh <voice_id> <text-to-speak> # speak <voice_id> <textfile>
# espeak function installed() { command -v $1 >/dev/null 2>&1; }
# Mac OS: brew install espeak
# Linux: apt-get install espeak
#
#espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 "$2"
# piper if installed espeak; then
# espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 -f $2
# https://github.com/rhasspy/piper
# elif installed piper && installed aplay; then
# Tested with Linux: cat $2 | piper --model ~/en_US-lessac-medium.onnx --output-raw | aplay -q -r 22050 -f S16_LE -t raw -
#
#echo "$2" | piper --model ~/en_US-lessac-medium.onnx --output-raw | aplay -q -r 22050 -f S16_LE -t raw -
# for Mac # for Mac
say "$2" elif installed say; then
say -f $2
# Eleven Labs # Eleven Labs
# To use it, install the elevenlabs module from pip (pip install elevenlabs) elif installed python3 && \
# It's possible to use the API for free with limited number of characters. To increase this limit register to https://beta.elevenlabs.io to get an api key and paste it after 'ELEVEN_API_KEY=' python3 -c 'import importlib.util; exit(not importlib.util.find_spec("elevenlabs"))' && \
#Keep the line commented to use the free version whitout api key installed ffplay; then
# # It's possible to use the API for free with limited number of characters.
#export ELEVEN_API_KEY=your_api_key # To increase this limit register to https://beta.elevenlabs.io to get an api key
#wd=$(dirname $0) # and paste it after 'ELEVEN_API_KEY='
#script=$wd/eleven-labs.py # Keep the line commented to use the free version without api key
#python3 $script $1 "$2" >/dev/null 2>&1 #export ELEVEN_API_KEY=your_api_key
#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1 wd=$(dirname $0)
script=$wd/eleven-labs.py
python3 $script -q -p -v $1 $2 >/dev/null 2>&1
# Uncomment to keep the audio file
#python3 $script -q -s ./audio.mp3 -v $1 $2 >/dev/null 2>&1
#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1
else
echo 'Install espeak ("brew install espeak" or "apt-get install espeak"),'
echo 'piper ("pip install piper-tts" or https://github.com/rhasspy/piper) with aplay,'
echo 'or elevenlabs ("pip install elevenlabs") with ffplay.'
echo '(export ELEVEN_API_KEY if you have an api key from https://beta.elevenlabs.io)'
fi


@ -1 +1 @@
@powershell -ExecutionPolicy Bypass -F examples\talk\speak.ps1 %1 %2 @powershell -ExecutionPolicy Bypass -F examples\talk-llama\speak.ps1 %1 %2


@ -1,12 +1,14 @@
# Set-ExecutionPolicy -ExecutionPolicy Bypass -Scope CurrentUser # Set-ExecutionPolicy -ExecutionPolicy Bypass -Scope CurrentUser
param( param(
# voice options are David or Zira [Parameter(Mandatory=$true)][int]$voicenum,
[Parameter(Mandatory=$true)][string]$voice, [Parameter(Mandatory=$true)][string]$textfile
[Parameter(Mandatory=$true)][string]$text
) )
Add-Type -AssemblyName System.Speech; Add-Type -AssemblyName System.Speech;
$speak = New-Object System.Speech.Synthesis.SpeechSynthesizer; $speak = New-Object System.Speech.Synthesis.SpeechSynthesizer;
$speak.SelectVoice("Microsoft $voice Desktop"); $voiceoptions = $speak.GetInstalledVoices("en-US");
$voice = $voiceoptions[$voicenum % $voiceoptions.count];
$speak.SelectVoice($voice.VoiceInfo.Name);
$speak.Rate="0"; $speak.Rate="0";
$text = Get-Content -Path $textfile;
$speak.Speak($text); $speak.Speak($text);


@ -16,7 +16,7 @@
#include <regex> #include <regex>
#include <sstream> #include <sstream>
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) { static std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
auto * model = llama_get_model(ctx); auto * model = llama_get_model(ctx);
// upper limit for the number of tokens // upper limit for the number of tokens
@ -33,12 +33,12 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
return result; return result;
} }
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
std::vector<char> result(8, 0); std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
if (n_tokens < 0) { if (n_tokens < 0) {
result.resize(-n_tokens); result.resize(-n_tokens);
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
GGML_ASSERT(check == -n_tokens); GGML_ASSERT(check == -n_tokens);
} else { } else {
result.resize(n_tokens); result.resize(n_tokens);
@ -59,13 +59,13 @@ struct whisper_params {
float vad_thold = 0.6f; float vad_thold = 0.6f;
float freq_thold = 100.0f; float freq_thold = 100.0f;
bool speed_up = false;
bool translate = false; bool translate = false;
bool print_special = false; bool print_special = false;
bool print_energy = false; bool print_energy = false;
bool no_timestamps = true; bool no_timestamps = true;
bool verbose_prompt = false; bool verbose_prompt = false;
bool use_gpu = true; bool use_gpu = true;
bool flash_attn = false;
std::string person = "Georgi"; std::string person = "Georgi";
std::string bot_name = "LLaMA"; std::string bot_name = "LLaMA";
@ -75,6 +75,7 @@ struct whisper_params {
std::string model_wsp = "models/ggml-base.en.bin"; std::string model_wsp = "models/ggml-base.en.bin";
std::string model_llama = "models/ggml-llama-7B.bin"; std::string model_llama = "models/ggml-llama-7B.bin";
std::string speak = "./examples/talk-llama/speak"; std::string speak = "./examples/talk-llama/speak";
std::string speak_file = "./examples/talk-llama/to_speak.txt";
std::string prompt = ""; std::string prompt = "";
std::string fname_out; std::string fname_out;
std::string path_session = ""; // path to file for saving/loading model eval state std::string path_session = ""; // path to file for saving/loading model eval state
@ -82,7 +83,7 @@ struct whisper_params {
void whisper_print_usage(int argc, char ** argv, const whisper_params & params); void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) { for (int i = 1; i < argc; i++) {
std::string arg = argv[i]; std::string arg = argv[i];
@ -98,12 +99,12 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-ngl" || arg == "--n-gpu-layers") { params.n_gpu_layers = std::stoi(argv[++i]); } else if (arg == "-ngl" || arg == "--n-gpu-layers") { params.n_gpu_layers = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); } else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); } else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; } else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; } else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; } else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "-vp" || arg == "--verbose-prompt") { params.verbose_prompt = true; } else if (arg == "-vp" || arg == "--verbose-prompt") { params.verbose_prompt = true; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; } else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; }
else if (arg == "-p" || arg == "--person") { params.person = argv[++i]; } else if (arg == "-p" || arg == "--person") { params.person = argv[++i]; }
else if (arg == "-bn" || arg == "--bot-name") { params.bot_name = argv[++i]; } else if (arg == "-bn" || arg == "--bot-name") { params.bot_name = argv[++i]; }
else if (arg == "--session") { params.path_session = argv[++i]; } else if (arg == "--session") { params.path_session = argv[++i]; }
@ -113,6 +114,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; } else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; } else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; }
else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; } else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
else if (arg == "-sf" || arg == "--speak-file") { params.speak_file = argv[++i]; }
else if (arg == "--prompt-file") { else if (arg == "--prompt-file") {
std::ifstream file(argv[++i]); std::ifstream file(argv[++i]);
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt)); std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
@ -121,7 +123,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
} }
} }
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; } else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else { else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params); whisper_print_usage(argc, argv, params);
@ -146,12 +147,12 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -ngl N, --n-gpu-layers N [%-7d] number of layers to store in VRAM\n", params.n_gpu_layers); fprintf(stderr, " -ngl N, --n-gpu-layers N [%-7d] number of layers to store in VRAM\n", params.n_gpu_layers);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold); fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold); fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false"); fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false"); fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -vp, --verbose-prompt [%-7s] print prompt at start\n", params.verbose_prompt ? "true" : "false"); fprintf(stderr, " -vp, --verbose-prompt [%-7s] print prompt at start\n", params.verbose_prompt ? "true" : "false");
fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true"); fprintf(stderr, " -ng, --no-gpu [%-7s] disable GPU\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
fprintf(stderr, " -p NAME, --person NAME [%-7s] person name (for prompt selection)\n", params.person.c_str()); fprintf(stderr, " -p NAME, --person NAME [%-7s] person name (for prompt selection)\n", params.person.c_str());
fprintf(stderr, " -bn NAME, --bot-name NAME [%-7s] bot name (to display)\n", params.bot_name.c_str()); fprintf(stderr, " -bn NAME, --bot-name NAME [%-7s] bot name (to display)\n", params.bot_name.c_str());
fprintf(stderr, " -w TEXT, --wake-command T [%-7s] wake-up command to listen for\n", params.wake_cmd.c_str()); fprintf(stderr, " -w TEXT, --wake-command T [%-7s] wake-up command to listen for\n", params.wake_cmd.c_str());
@ -160,13 +161,14 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str()); fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str()); fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str());
fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str()); fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
fprintf(stderr, " -sf FILE, --speak-file [%-7s] file to pass to TTS\n", params.speak_file.c_str());
fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", ""); fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", "");
fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n"); fprintf(stderr, " --session FNAME file to cache model state in (may be large!) (default: none)\n");
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str()); fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
std::string transcribe( static std::string transcribe(
whisper_context * ctx, whisper_context * ctx,
const whisper_params & params, const whisper_params & params,
const std::vector<float> & pcmf32, const std::vector<float> & pcmf32,
@ -200,7 +202,6 @@ std::string transcribe(
wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size(); wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
wparams.audio_ctx = params.audio_ctx; wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) { if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
return ""; return "";
@ -234,7 +235,7 @@ std::string transcribe(
return result; return result;
} }
std::vector<std::string> get_words(const std::string &txt) { static std::vector<std::string> get_words(const std::string &txt) {
std::vector<std::string> words; std::vector<std::string> words;
std::istringstream iss(txt); std::istringstream iss(txt);
@ -282,13 +283,19 @@ int main(int argc, char ** argv) {
// whisper init // whisper init
struct whisper_context_params cparams = whisper_context_default_params(); struct whisper_context_params cparams = whisper_context_default_params();
cparams.use_gpu = params.use_gpu;
cparams.use_gpu = params.use_gpu;
cparams.flash_attn = params.flash_attn;
struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams); struct whisper_context * ctx_wsp = whisper_init_from_file_with_params(params.model_wsp.c_str(), cparams);
if (!ctx_wsp) {
fprintf(stderr, "No whisper.cpp model specified. Please provide using -mw <modelfile>\n");
return 1;
}
// llama init // llama init
llama_backend_init(true); llama_backend_init();
auto lmparams = llama_model_default_params(); auto lmparams = llama_model_default_params();
if (!params.use_gpu) { if (!params.use_gpu) {
@ -298,6 +305,10 @@ int main(int argc, char ** argv) {
} }
struct llama_model * model_llama = llama_load_model_from_file(params.model_llama.c_str(), lmparams); struct llama_model * model_llama = llama_load_model_from_file(params.model_llama.c_str(), lmparams);
if (!model_llama) {
fprintf(stderr, "No llama.cpp model specified. Please provide using -ml <modelfile>\n");
return 1;
}
llama_context_params lcparams = llama_context_default_params(); llama_context_params lcparams = llama_context_default_params();
@ -305,6 +316,7 @@ int main(int argc, char ** argv) {
lcparams.n_ctx = 2048; lcparams.n_ctx = 2048;
lcparams.seed = 1; lcparams.seed = 1;
lcparams.n_threads = params.n_threads; lcparams.n_threads = params.n_threads;
lcparams.flash_attn = params.flash_attn;
struct llama_context * ctx_llama = llama_new_context_with_model(model_llama, lcparams); struct llama_context * ctx_llama = llama_new_context_with_model(model_llama, lcparams);
@ -388,6 +400,8 @@ int main(int argc, char ** argv) {
prompt_llama = ::replace(prompt_llama, "{4}", chat_symb); prompt_llama = ::replace(prompt_llama, "{4}", chat_symb);
llama_batch batch = llama_batch_init(llama_n_ctx(ctx_llama), 0, 1);
// init session // init session
std::string path_session = params.path_session; std::string path_session = params.path_session;
std::vector<llama_token> session_tokens; std::vector<llama_token> session_tokens;
@ -423,8 +437,21 @@ int main(int argc, char ** argv) {
printf("\n"); printf("\n");
printf("%s : initializing - please wait ...\n", __func__); printf("%s : initializing - please wait ...\n", __func__);
if (llama_eval(ctx_llama, embd_inp.data(), embd_inp.size(), 0)) { // prepare batch
fprintf(stderr, "%s : failed to eval\n", __func__); {
batch.n_tokens = embd_inp.size();
for (int i = 0; i < batch.n_tokens; i++) {
batch.token[i] = embd_inp[i];
batch.pos[i] = i;
batch.n_seq_id[i] = 1;
batch.seq_id[i][0] = 0;
batch.logits[i] = i == batch.n_tokens - 1;
}
}
if (llama_decode(ctx_llama, batch)) {
fprintf(stderr, "%s : failed to decode\n", __func__);
return 1; return 1;
} }
@ -546,10 +573,7 @@ int main(int argc, char ** argv) {
// optionally give audio feedback that the current text is being processed // optionally give audio feedback that the current text is being processed
if (!params.heard_ok.empty()) { if (!params.heard_ok.empty()) {
int ret = system((params.speak + " " + std::to_string(voice_id) + " '" + params.heard_ok + "'").c_str()); speak_with_file(params.speak, params.heard_ok, params.speak_file, voice_id);
if (ret != 0) {
fprintf(stderr, "%s: failed to speak\n", __func__);
}
} }
// remove text between brackets using regex // remove text between brackets using regex
@ -647,8 +671,21 @@ int main(int argc, char ** argv) {
n_session_consumed = session_tokens.size(); n_session_consumed = session_tokens.size();
} }
if (llama_eval(ctx_llama, embd.data(), embd.size(), n_past)) { // prepare batch
fprintf(stderr, "%s : failed to eval\n", __func__); {
batch.n_tokens = embd.size();
for (int i = 0; i < batch.n_tokens; i++) {
batch.token[i] = embd[i];
batch.pos[i] = n_past + i;
batch.n_seq_id[i] = 1;
batch.seq_id[i][0] = 0;
batch.logits[i] = i == batch.n_tokens - 1;
}
}
if (llama_decode(ctx_llama, batch)) {
fprintf(stderr, "%s : failed to decode\n", __func__);
return 1; return 1;
} }
} }
@ -748,11 +785,7 @@ int main(int argc, char ** argv) {
} }
} }
text_to_speak = ::replace(text_to_speak, "'", "'\"'\"'"); speak_with_file(params.speak, text_to_speak, params.speak_file, voice_id);
int ret = system((params.speak + " " + std::to_string(voice_id) + " '" + text_to_speak + "'").c_str());
if (ret != 0) {
fprintf(stderr, "%s: failed to speak\n", __func__);
}
audio.clear(); audio.clear();
} }

File diff suppressed because it is too large


@ -0,0 +1,20 @@
#pragma once
#include <cstdint>
#include <vector>
#include <unordered_map>
#include <unordered_set>
struct range_nfd {
uint32_t first;
uint32_t last;
uint32_t nfd;
};
static const uint32_t MAX_CODEPOINTS = 0x110000;
extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
extern const std::unordered_set<uint32_t> unicode_set_whitespace;
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
extern const std::vector<range_nfd> unicode_ranges_nfd;


@ -0,0 +1,810 @@
#include "unicode.h"
#include "unicode-data.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <map>
#include <regex>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include <locale>
#include <codecvt>
static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
std::string result;
for (size_t i = 0; i < cps.size(); ++i) {
result.append(unicode_cpt_to_utf8(cps[i]));
}
return result;
}
static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
assert(offset < utf8.size());
if (!(utf8[offset + 0] & 0x80)) {
auto result = utf8[offset + 0];
offset += 1;
return result;
}
if (!(utf8[offset + 0] & 0x40)) {
throw std::invalid_argument("invalid character");
}
if (!(utf8[offset + 0] & 0x20)) {
if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) {
throw std::invalid_argument("invalid character");
}
auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f);
offset += 2;
return result;
}
if (!(utf8[offset + 0] & 0x10)) {
if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) {
throw std::invalid_argument("invalid character");
}
auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f);
offset += 3;
return result;
}
if (!(utf8[offset + 0] & 0x08)) {
if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) {
throw std::invalid_argument("invalid character");
}
auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f);
offset += 4;
return result;
}
throw std::invalid_argument("failed to convert utf8 to codepoint");
}
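// Example (sketch, not part of the original file): decoding a whole UTF-8 string codepoint by
// codepoint; this is essentially what unicode_cpts_from_utf8() further below does. Hypothetical helper.
static std::vector<uint32_t> example_decode_utf8(const std::string & s) {
    std::vector<uint32_t> cpts;
    size_t offset = 0;
    while (offset < s.size()) {
        cpts.push_back(unicode_cpt_from_utf8(s, offset)); // advances `offset` past each codepoint
    }
    return cpts;
}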
//static std::vector<uint16_t> unicode_cpt_to_utf16(uint32_t cp) {
// std::vector<uint16_t> result;
// if (/* 0x0000 <= cp && */ cp <= 0xffff) {
// result.emplace_back(cp);
// return result;
// }
// if (0x10000 <= cp && cp <= 0x10ffff) {
// result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
// result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
// return result;
// }
// throw std::invalid_argument("failed to convert codepoint to utf16");
//}
//static std::vector<uint16_t> unicode_cpts_to_utf16(const std::vector<uint32_t> & cps) {
// std::vector<uint16_t> result;
// for (size_t i = 0; i < cps.size(); ++i) {
// auto temp = unicode_cpt_to_utf16(cps[i]);
// result.insert(result.end(), temp.begin(), temp.end());
// }
// return result;
//}
//static uint32_t unicode_cpt_from_utf16(const std::vector<uint16_t> & utf16, size_t & offset) {
// assert(offset < utf16.size());
// if (((utf16[0] >> 10) << 10) != 0xd800) {
// auto result = utf16[offset + 0];
// offset += 1;
// return result;
// }
//
// if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) {
// throw std::invalid_argument("invalid character");
// }
//
// auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
// offset += 2;
// return result;
//}
//static std::vector<uint32_t> unicode_cpts_from_utf16(const std::vector<uint16_t> & utf16) {
// std::vector<uint32_t> result;
// size_t offset = 0;
// while (offset < utf16.size()) {
// result.push_back(unicode_cpt_from_utf16(utf16, offset));
// }
// return result;
//}
static std::vector<codepoint_flags> unicode_cpt_flags_array() {
std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
assert (unicode_ranges_flags.front().first == 0);
assert (unicode_ranges_flags.back().first == MAX_CODEPOINTS);
for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) {
const auto range_ini = unicode_ranges_flags[i-1]; // codepoint_ini, flags
const auto range_end = unicode_ranges_flags[i]; // codepoint_end, flags
for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) {
cpt_flags[cpt] = range_ini.second;
}
}
for (auto cpt : unicode_set_whitespace) {
cpt_flags[cpt].is_whitespace = true;
}
for (auto p : unicode_map_lowercase) {
cpt_flags[p.second].is_lowercase = true;
}
for (auto p : unicode_map_uppercase) {
cpt_flags[p.second].is_uppercase = true;
}
for (auto &range : unicode_ranges_nfd) { // start, last, nfd
cpt_flags[range.nfd].is_nfd = true;
}
return cpt_flags;
}
static std::unordered_map<uint8_t, std::string> unicode_byte_to_utf8_map() {
std::unordered_map<uint8_t, std::string> map;
for (int ch = 0x21; ch <= 0x7E; ++ch) { // u'!' to u'~'
assert(0 <= ch && ch < 256);
map[ch] = unicode_cpt_to_utf8(ch);
}
for (int ch = 0xA1; ch <= 0xAC; ++ch) { // u'¡' to u'¬'
assert(0 <= ch && ch < 256);
map[ch] = unicode_cpt_to_utf8(ch);
}
for (int ch = 0xAE; ch <= 0xFF; ++ch) { // u'®' to u'ÿ'
assert(0 <= ch && ch < 256);
map[ch] = unicode_cpt_to_utf8(ch);
}
auto n = 0;
for (int ch = 0; ch < 256; ++ch) {
if (map.find(ch) == map.end()) {
map[ch] = unicode_cpt_to_utf8(256 + n);
++n;
}
}
return map;
}
static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
std::unordered_map<std::string, uint8_t> map;
for (int ch = 0x21; ch <= 0x7E; ++ch) { // u'!' to u'~'
assert(0 <= ch && ch < 256);
map[unicode_cpt_to_utf8(ch)] = ch;
}
for (int ch = 0xA1; ch <= 0xAC; ++ch) { // u'¡' to u'¬'
assert(0 <= ch && ch < 256);
map[unicode_cpt_to_utf8(ch)] = ch;
}
for (int ch = 0xAE; ch <= 0xFF; ++ch) { // u'®' to u'ÿ'
assert(0 <= ch && ch < 256);
map[unicode_cpt_to_utf8(ch)] = ch;
}
auto n = 0;
for (int ch = 0; ch < 256; ++ch) {
if (map.find(unicode_cpt_to_utf8(ch)) == map.end()) {
map[unicode_cpt_to_utf8(256 + n)] = ch;
++n;
}
}
return map;
}
static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
return conv.from_bytes(s);
}
static std::vector<std::string> unicode_byte_encoding_process(const std::vector<std::string> & bpe_words) {
std::vector<std::string> bpe_encoded_words;
for (const auto & word : bpe_words) {
std::string text_utf;
auto utf_word = unicode_cpts_from_utf8(word);
for (size_t i = 0; i < utf_word.size(); ++i) {
text_utf += unicode_cpt_to_utf8(utf_word[i]);
}
std::string encoded_token;
for (char & c : text_utf) {
encoded_token += unicode_byte_to_utf8(c);
}
bpe_encoded_words.emplace_back(encoded_token);
}
return bpe_encoded_words;
}
// GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+
static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & text, const std::vector<size_t> & offsets) {
std::vector<size_t> bpe_offsets; // store the offset of each word
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
const auto cpts = unicode_cpts_from_utf8(text);
size_t start = 0;
for (auto offset : offsets) {
const size_t offset_ini = start;
const size_t offset_end = start + offset;
assert(offset_end <= cpts.size());
start = offset_end;
static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
auto _get_cpt = [&] (const size_t pos) -> uint32_t {
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
};
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
};
size_t _prev_end = offset_ini;
auto _add_token = [&] (const size_t end) -> size_t {
assert(_prev_end <= end && end <= offset_end);
size_t len = end - _prev_end;
if (len > 0) {
bpe_offsets.push_back(len);
}
_prev_end = end;
//if (len > 0) {
// std::string s = "";
// for(size_t p = end-len; p < end; p++)
// s += unicode_cpt_to_utf8(cpts[p]);
// printf(">>> '%s'\n", s.c_str());
//}
return len;
};
for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
const uint32_t cpt = _get_cpt(pos);
const auto flags = _get_flags(pos);
// regex: 's|'t|'re|'ve|'m|'ll|'d
if (cpt == '\'' && pos+1 < offset_end) {
uint32_t cpt_next = _get_cpt(pos+1);
if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
pos += _add_token(pos+2);
continue;
}
if (pos+2 < offset_end) {
uint32_t cpt_next_next = _get_cpt(pos+2);
if ((cpt_next == 'r' && cpt_next_next == 'e') ||
(cpt_next == 'v' && cpt_next_next == 'e') ||
(cpt_next == 'l' && cpt_next_next == 'l')) {
pos += _add_token(pos+3);
continue;
}
}
}
auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
// regex: <space>?\p{L}+
if (flags2.is_letter) {
pos += (cpt == ' ');
while (flags2.is_letter) {
flags2 = _get_flags(++pos);
}
_add_token(pos);
continue;
}
// regex: <space>?\p{N}+
if (flags2.is_number) {
pos += (cpt == ' ');
while (flags2.is_number) {
flags2 = _get_flags(++pos);
}
_add_token(pos);
continue;
}
// regex: <space>?[^\s\p{L}\p{N}]+
if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
pos += (cpt == ' ');
while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
flags2 = _get_flags(++pos);
}
_add_token(pos);
continue;
}
size_t num_whitespaces = 0;
while (_get_flags(pos+num_whitespaces).is_whitespace) {
num_whitespaces++;
}
// regex: \s+(?!\S)
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
pos += num_whitespaces - 1;
_add_token(pos);
continue;
}
// regex: \s+
if (num_whitespaces > 0) {
pos += num_whitespaces;
_add_token(pos);
continue;
}
// no matches
_add_token(++pos);
}
}
return bpe_offsets;
}
// LLAMA3 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string & text, const std::vector<size_t> & offsets) {
std::vector<size_t> bpe_offsets; // store the offset of each word
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
const auto cpts = unicode_cpts_from_utf8(text);
size_t start = 0;
for (auto offset : offsets) {
const size_t offset_ini = start;
const size_t offset_end = start + offset;
assert(offset_end <= cpts.size());
start = offset_end;
static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
auto _get_cpt = [&] (const size_t pos) -> uint32_t {
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
};
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
};
size_t _prev_end = offset_ini;
auto _add_token = [&] (const size_t end) -> size_t {
assert(_prev_end <= end && end <= offset_end);
size_t len = end - _prev_end;
if (len > 0) {
bpe_offsets.push_back(len);
}
_prev_end = end;
//if (len > 0) {
// std::string s = "";
// for(size_t p = end-len; p < end; p++)
// s += unicode_cpt_to_utf8(cpts[p]);
// printf(">>> '%s'\n", s.c_str());
//}
return len;
};
for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
const uint32_t cpt = _get_cpt(pos);
const auto flags = _get_flags(pos);
// regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
if (cpt == '\'' && pos+1 < offset_end) {
uint32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
pos += _add_token(pos+2);
continue;
}
if (pos+2 < offset_end) {
uint32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
if ((cpt_next == 'r' && cpt_next_next == 'e') ||
(cpt_next == 'v' && cpt_next_next == 'e') ||
(cpt_next == 'l' && cpt_next_next == 'l')) {
pos += _add_token(pos+3);
continue;
}
}
}
// regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
if (!(cpt == '\r' || cpt == '\n' || /*flags.is_letter |*/ flags.is_number)) {
if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters
pos++;
while (_get_flags(pos).is_letter) {
pos++;
}
_add_token(pos);
continue;
}
}
// regex: \p{N}{1,3}
if (flags.is_number) {
size_t ini = pos;
while (_get_flags(pos).is_number) {
if (++pos - ini >= 3 ) {
_add_token(pos);
ini = pos;
}
}
_add_token(pos);
continue;
}
// regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
pos += (cpt == ' ');
while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
flags2 = _get_flags(++pos);
}
uint32_t cpt2 = _get_cpt(pos);
while (cpt2 == '\r' || cpt2 == '\n') {
cpt2 = _get_cpt(++pos);
}
_add_token(pos);
continue;
}
size_t num_whitespaces = 0;
size_t last_end_r_or_n = 0;
while (_get_flags(pos+num_whitespaces).is_whitespace) {
uint32_t cpt2 = _get_cpt(pos+num_whitespaces);
if (cpt2 == '\r' || cpt2 == '\n') {
last_end_r_or_n = pos + num_whitespaces + 1;
}
num_whitespaces++;
}
// regex: \s*[\r\n]+
if (last_end_r_or_n > 0) {
pos = last_end_r_or_n;
_add_token(pos);
continue;
}
// regex: \s+(?!\S)
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
pos += num_whitespaces - 1;
_add_token(pos);
continue;
}
// regex: \s+
if (num_whitespaces > 0) {
pos += num_whitespaces;
_add_token(pos);
continue;
}
// no matches
_add_token(++pos);
}
}
return bpe_offsets;
}
// use std::wregex to split the text
static std::vector<size_t> unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector<size_t> & offsets) {
std::wregex expr(regex_expr);
std::vector<size_t> bpe_offsets; // store the offset of each word
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
size_t start = 0;
for (auto offset : offsets) {
std::wcregex_iterator it(wtext.data() + start, wtext.data() + start + offset, expr);
std::wcregex_iterator end;
int64_t start_idx = 0;
while (it != end) {
std::wcmatch match = *it;
if (match.position() > start_idx) {
bpe_offsets.emplace_back(match.position() - start_idx);
}
bpe_offsets.emplace_back(match.length());
start_idx = match.position() + match.length();
++it;
}
if (start_idx < (int64_t) offset) {
bpe_offsets.emplace_back(offset - start_idx);
}
start += offset;
}
return bpe_offsets;
}
// use std::regex to split the text
static std::vector<size_t> unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
std::regex expr(regex_expr);
std::vector<size_t> bpe_offsets; // store the offset of each word
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
size_t start = 0;
for (auto offset : offsets) {
std::cregex_iterator it(text.data() + start, text.data() + start + offset, expr);
std::cregex_iterator end;
int64_t start_idx = 0;
while (it != end) {
std::cmatch match = *it;
if (match.position() > start_idx) {
bpe_offsets.emplace_back(match.position() - start_idx);
}
bpe_offsets.emplace_back(match.length());
start_idx = match.position() + match.length();
++it;
}
if (start_idx < (int64_t) offset) {
bpe_offsets.emplace_back(offset - start_idx);
}
start += offset;
}
return bpe_offsets;
}
static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
std::vector<size_t> bpe_offsets;
if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
} else if (
regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ||
regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
}
return bpe_offsets;
}
//
// interface
//
std::string unicode_cpt_to_utf8(uint32_t cp) {
std::string result;
if (/* 0x00 <= cp && */ cp <= 0x7f) {
result.push_back(cp);
return result;
}
if (0x80 <= cp && cp <= 0x7ff) {
result.push_back(0xc0 | ((cp >> 6) & 0x1f));
result.push_back(0x80 | (cp & 0x3f));
return result;
}
if (0x800 <= cp && cp <= 0xffff) {
result.push_back(0xe0 | ((cp >> 12) & 0x0f));
result.push_back(0x80 | ((cp >> 6) & 0x3f));
result.push_back(0x80 | (cp & 0x3f));
return result;
}
if (0x10000 <= cp && cp <= 0x10ffff) {
result.push_back(0xf0 | ((cp >> 18) & 0x07));
result.push_back(0x80 | ((cp >> 12) & 0x3f));
result.push_back(0x80 | ((cp >> 6) & 0x3f));
result.push_back(0x80 | (cp & 0x3f));
return result;
}
throw std::invalid_argument("invalid codepoint");
}
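// Worked example: for U+00E9 ("é") the 0x80..0x7FF branch above produces the two
// UTF-8 bytes 0xC3 0xA9, since 0xC0 | ((0xE9 >> 6) & 0x1F) == 0xC3 and
// 0x80 | (0xE9 & 0x3F) == 0xA9.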
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts) {
auto comp = [] (const uint32_t cpt, const range_nfd & range) {
return cpt < range.first;
};
std::vector<uint32_t> result(cpts.size());
for (size_t i = 0; i < cpts.size(); ++i) {
const uint32_t cpt = cpts[i];
auto it = std::upper_bound(unicode_ranges_nfd.cbegin(), unicode_ranges_nfd.cend(), cpt, comp) - 1;
result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt;
}
return result;
}
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
std::vector<uint32_t> result;
result.reserve(utf8.size());
size_t offset = 0;
while (offset < utf8.size()) {
result.push_back(unicode_cpt_from_utf8(utf8, offset));
}
return result;
}
codepoint_flags unicode_cpt_flags(const uint32_t cp) {
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
static const auto cpt_flags = unicode_cpt_flags_array();
return cp < cpt_flags.size() ? cpt_flags[cp] : undef;
}
codepoint_flags unicode_cpt_flags(const std::string & utf8) {
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
if (utf8.empty()) {
return undef; // undefined
}
size_t offset = 0;
return unicode_cpt_flags(unicode_cpt_from_utf8(utf8, offset));
}
std::string unicode_byte_to_utf8(uint8_t byte) {
static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map();
return map.at(byte);
}
uint8_t unicode_utf8_to_byte(const std::string & utf8) {
static std::unordered_map<std::string, uint8_t> map = unicode_utf8_to_byte_map();
return map.at(utf8);
}
uint32_t unicode_tolower(uint32_t cp) {
auto it = unicode_map_lowercase.find(cp);
return it == unicode_map_lowercase.end() ? cp : it->second;
}
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
// unicode categories
static const std::map<std::string, int> k_ucat_enum = {
{ "\\p{N}", codepoint_flags::NUMBER },
{ "\\p{L}", codepoint_flags::LETTER },
{ "\\p{P}", codepoint_flags::PUNCTUATION },
};
static const std::map<int, int> k_ucat_cpt = {
{ codepoint_flags::NUMBER, 0xD1 },
{ codepoint_flags::LETTER, 0xD2 },
{ codepoint_flags::PUNCTUATION, 0xD3 },
};
static const std::map<int, std::string> k_ucat_map = {
{ codepoint_flags::NUMBER, "\x30-\x39" }, // 0-9
{ codepoint_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z
{ codepoint_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\}
};
// compute collapsed codepoints only if needed by at least one regex
bool need_collapse = false;
for (auto & regex_expr : regex_exprs) {
// search for unicode categories
for (const auto & ucat : k_ucat_enum) {
if (std::string::npos != regex_expr.find(ucat.first)) {
need_collapse = true;
break;
}
}
}
const auto cpts = unicode_cpts_from_utf8(text);
// generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte
// ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935
std::string text_collapsed;
if (need_collapse) {
// collapse all unicode categories
text_collapsed.resize(cpts.size());
for (size_t i = 0; i < cpts.size(); ++i) {
// keep single-byte codepoints as is
if (cpts[i] < 128) {
text_collapsed[i] = cpts[i];
continue;
}
const auto flags = unicode_cpt_flags(cpts[i]);
if (flags.is_whitespace) {
//NOTE: C++ std::regex \s does not match 0x85; Rust and Python regex do.
//text_collapsed[i] = (char) 0x85; // <Next Line> as whitespace fallback
text_collapsed[i] = (char) 0x0B; // <vertical tab> as whitespace fallback
} else if (k_ucat_cpt.find(flags.category_flag()) != k_ucat_cpt.end()) {
text_collapsed[i] = k_ucat_cpt.at(flags.category_flag());
} else {
text_collapsed[i] = (char) 0xD0; // fallback
}
}
}
std::vector<size_t> bpe_offsets = { cpts.size() };
for (auto & regex_expr : regex_exprs) {
// first, see if we have an efficient custom regex implementation
auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets);
if (!tmp.empty()) {
bpe_offsets = std::move(tmp);
continue;
}
// fallback to general-purpose std::regex / std::wregex
try {
// if a unicode category is used in the regex, we use the collapsed text and replace the unicode category
// with the corresponding collapsed representation
bool use_collapsed = false;
for (auto & ucat : k_ucat_enum) {
if (std::string::npos != regex_expr.find(ucat.first)) {
use_collapsed = true;
break;
}
}
if (use_collapsed) {
// sanity-check that the original regex does not contain any non-ASCII characters
const auto cpts_regex = unicode_cpts_from_utf8(regex_expr);
for (size_t i = 0; i < cpts_regex.size(); ++i) {
if (cpts_regex[i] >= 128) {
throw std::runtime_error("Regex includes both unicode categories and non-ASCII characters - not supported");
}
}
// generate a collapsed representation of the regex
std::string regex_expr_collapsed;
// track if we are inside [], because nested [] are not allowed
bool inside = false;
for (size_t i = 0; i < regex_expr.size(); ++i) {
if (regex_expr[i] == '[' && (i == 0 || regex_expr[i - 1] != '\\')) {
regex_expr_collapsed += '[';
inside = true;
continue;
}
if (inside && regex_expr[i] == ']' && regex_expr[i - 1] != '\\') {
regex_expr_collapsed += ']';
inside = false;
continue;
}
if (regex_expr[i + 0] == '\\' && i + 4 < regex_expr.size() &&
regex_expr[i + 1] == 'p' &&
regex_expr[i + 2] == '{' &&
regex_expr[i + 4] == '}') {
const std::string pat = regex_expr.substr(i, 5);
if (k_ucat_enum.find(pat) != k_ucat_enum.end()) {
if (!inside) {
regex_expr_collapsed += '[';
}
regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat));
regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat));
if (!inside) {
regex_expr_collapsed += ']';
}
i += 4;
continue;
}
}
regex_expr_collapsed += regex_expr[i];
}
//printf("text_collapsed: %s\n", text_collapsed.c_str());
//printf("regex_expr_collapsed: %s\n", regex_expr_collapsed.c_str());
bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets);
} else {
// no unicode category used, we can use std::wregex directly
const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr);
// std::wregex \s does not match non-ASCII whitespaces, using 0x0B as fallback
std::wstring wtext(cpts.begin(), cpts.end());
for (size_t i = 0; i < wtext.size(); ++i) {
if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) {
wtext[i] = 0x0B;
}
}
//printf("text: %s\n", text.c_str());
//printf("regex_expr: %s\n", regex_expr.c_str());
bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets);
}
} catch (std::regex_error & e) {
fprintf(stderr, "Failed to process regex: '%s'\n", regex_expr.c_str());
fprintf(stderr, "Regex error: %s\n", e.what());
throw std::runtime_error("Failed to process regex");
}
}
std::vector<std::string> bpe_words;
bpe_words.reserve(bpe_offsets.size()); // reserve memory for the approximate size
size_t start = 0;
for (size_t & offset : bpe_offsets) {
bpe_words.emplace_back();
for (size_t i = start; i < start + offset; ++i) {
bpe_words.back() += unicode_cpt_to_utf8(cpts[i]);
}
start += offset;
}
return unicode_byte_encoding_process(bpe_words);
}
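For reference, here is a minimal usage sketch of the splitter above. It is not part of the diff: the `main()` harness, the sample text, and the assumption that the header is included as `unicode.h` are illustrative; the regex string is the GPT-2 pretokenizer expression handled by the custom fast path in `unicode_regex_split_custom()`.

```cpp
// Minimal usage sketch (assumes compiling/linking against the unicode.cpp/unicode.h shown in this diff).
#include <cstdio>
#include <string>
#include <vector>

#include "unicode.h"

int main() {
    // GPT-2 style pretokenizer regex - matches the custom fast path above
    const std::vector<std::string> regex_exprs = {
        "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
    };

    const std::string text = "Hello world, it's 2024!";

    // returns byte-encoded "words" ready for BPE merging
    const std::vector<std::string> words = unicode_regex_split(text, regex_exprs);
    for (const auto & w : words) {
        printf("'%s'\n", w.c_str());
    }

    return 0;
}
```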


@@ -1,463 +1,63 @@
#pragma once
#include <cassert>
#include <cstdint>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>
static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
{0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},
{0xCE6, 0xCEF}, {0xD66, 0xD6F}, {0xDE6, 0xDEF}, {0xE50, 0xE59}, {0xED0, 0xED9}, {0xF20, 0xF29}, {0x1040, 0x1049}, {0x1090, 0x1099}, {0x1369, 0x1371}, {0x17E0, 0x17E9}, {0x1810, 0x1819}, {0x1946, 0x194F},
{0x19D0, 0x19DA}, {0x1A80, 0x1A89}, {0x1A90, 0x1A99}, {0x1B50, 0x1B59}, {0x1BB0, 0x1BB9}, {0x1C40, 0x1C49}, {0x1C50, 0x1C59}, {0x2070, 0x2070}, {0x2074, 0x2079}, {0x2080, 0x2089}, {0x2460, 0x2468},
{0x2474, 0x247C}, {0x2488, 0x2490}, {0x24EA, 0x24EA}, {0x24F5, 0x24FD}, {0x24FF, 0x24FF}, {0x2776, 0x277E}, {0x2780, 0x2788}, {0x278A, 0x2792}, {0xA620, 0xA629}, {0xA8D0, 0xA8D9}, {0xA900, 0xA909},
{0xA9D0, 0xA9D9}, {0xA9F0, 0xA9F9}, {0xAA50, 0xAA59}, {0xABF0, 0xABF9}, {0xFF10, 0xFF19}, {0x104A0, 0x104A9}, {0x10A40, 0x10A43}, {0x10D30, 0x10D39}, {0x10E60, 0x10E68}, {0x11052, 0x1105A},
{0x11066, 0x1106F}, {0x110F0, 0x110F9}, {0x11136, 0x1113F}, {0x111D0, 0x111D9}, {0x112F0, 0x112F9}, {0x11450, 0x11459}, {0x114D0, 0x114D9}, {0x11650, 0x11659}, {0x116C0, 0x116C9}, {0x11730, 0x11739},
{0x118E0, 0x118E9}, {0x11950, 0x11959}, {0x11C50, 0x11C59}, {0x11D50, 0x11D59}, {0x11DA0, 0x11DA9}, {0x16A60, 0x16A69}, {0x16B50, 0x16B59}, {0x1D7CE, 0x1D7FF}, {0x1E140, 0x1E149}, {0x1E2F0, 0x1E2F9},
{0x1E950, 0x1E959}, {0x1F100, 0x1F10A}, {0x1FBF0, 0x1FBF9},
};
struct codepoint_flags {
enum {
UNDEFINED = 0x0001,
NUMBER = 0x0002, // regex: \p{N}
LETTER = 0x0004, // regex: \p{L}
SEPARATOR = 0x0008, // regex: \p{Z}
ACCENT_MARK = 0x0010, // regex: \p{M}
PUNCTUATION = 0x0020, // regex: \p{P}
SYMBOL = 0x0040, // regex: \p{S}
CONTROL = 0x0080, // regex: \p{C}
MASK_CATEGORIES = 0x00FF,
};
// codepoint type
uint16_t is_undefined : 1;
uint16_t is_number : 1; // regex: \p{N}
uint16_t is_letter : 1; // regex: \p{L}
uint16_t is_separator : 1; // regex: \p{Z}
uint16_t is_accent_mark : 1; // regex: \p{M}
uint16_t is_punctuation : 1; // regex: \p{P}
uint16_t is_symbol : 1; // regex: \p{S}
uint16_t is_control : 1; // regex: \p{C}
// helper flags
uint16_t is_whitespace : 1; // regex: \s
uint16_t is_lowercase : 1;
uint16_t is_uppercase : 1;
uint16_t is_nfd : 1;
// decode from uint16
inline codepoint_flags(const uint16_t flags=0) {
*reinterpret_cast<uint16_t*>(this) = flags;
}
inline uint16_t as_uint() const {
return *reinterpret_cast<const uint16_t*>(this);
}
inline uint16_t category_flag() const {
return this->as_uint() & MASK_CATEGORIES;
}
};
static const std::vector<std::pair<uint32_t, uint32_t>> letter_ranges = {
{0x41, 0x5A}, {0x61, 0x7A}, {0xAA, 0xAA}, {0xB5, 0xB5}, {0xBA, 0xBA}, {0xC0, 0xD6}, {0xD8, 0xF6}, {0xF8, 0x2C1}, {0x2C6, 0x2D1}, {0x2E0, 0x2E4}, {0x2EC, 0x2EC}, {0x2EE, 0x2EE}, {0x370, 0x374},
{0x376, 0x377}, {0x37A, 0x37D}, {0x37F, 0x37F}, {0x386, 0x386}, {0x388, 0x38A}, {0x38C, 0x38C}, {0x38E, 0x3A1}, {0x3A3, 0x3F5}, {0x3F7, 0x481}, {0x48A, 0x52F}, {0x531, 0x556}, {0x559, 0x559},
{0x560, 0x588}, {0x5D0, 0x5EA}, {0x5EF, 0x5F2}, {0x620, 0x64A}, {0x66E, 0x66F}, {0x671, 0x6D3}, {0x6D5, 0x6D5}, {0x6E5, 0x6E6}, {0x6EE, 0x6EF}, {0x6FA, 0x6FC}, {0x6FF, 0x6FF}, {0x710, 0x710},
{0x712, 0x72F}, {0x74D, 0x7A5}, {0x7B1, 0x7B1}, {0x7CA, 0x7EA}, {0x7F4, 0x7F5}, {0x7FA, 0x7FA}, {0x800, 0x815}, {0x81A, 0x81A}, {0x824, 0x824}, {0x828, 0x828}, {0x840, 0x858}, {0x860, 0x86A},
{0x8A0, 0x8B4}, {0x8B6, 0x8C7}, {0x904, 0x939}, {0x93D, 0x93D}, {0x950, 0x950}, {0x958, 0x961}, {0x971, 0x980}, {0x985, 0x98C}, {0x98F, 0x990}, {0x993, 0x9A8}, {0x9AA, 0x9B0}, {0x9B2, 0x9B2},
{0x9B6, 0x9B9}, {0x9BD, 0x9BD}, {0x9CE, 0x9CE}, {0x9DC, 0x9DD}, {0x9DF, 0x9E1}, {0x9F0, 0x9F1}, {0x9FC, 0x9FC}, {0xA05, 0xA0A}, {0xA0F, 0xA10}, {0xA13, 0xA28}, {0xA2A, 0xA30}, {0xA32, 0xA33},
{0xA35, 0xA36}, {0xA38, 0xA39}, {0xA59, 0xA5C}, {0xA5E, 0xA5E}, {0xA72, 0xA74}, {0xA85, 0xA8D}, {0xA8F, 0xA91}, {0xA93, 0xAA8}, {0xAAA, 0xAB0}, {0xAB2, 0xAB3}, {0xAB5, 0xAB9}, {0xABD, 0xABD},
{0xAD0, 0xAD0}, {0xAE0, 0xAE1}, {0xAF9, 0xAF9}, {0xB05, 0xB0C}, {0xB0F, 0xB10}, {0xB13, 0xB28}, {0xB2A, 0xB30}, {0xB32, 0xB33}, {0xB35, 0xB39}, {0xB3D, 0xB3D}, {0xB5C, 0xB5D}, {0xB5F, 0xB61},
{0xB71, 0xB71}, {0xB83, 0xB83}, {0xB85, 0xB8A}, {0xB8E, 0xB90}, {0xB92, 0xB95}, {0xB99, 0xB9A}, {0xB9C, 0xB9C}, {0xB9E, 0xB9F}, {0xBA3, 0xBA4}, {0xBA8, 0xBAA}, {0xBAE, 0xBB9}, {0xBD0, 0xBD0},
{0xC05, 0xC0C}, {0xC0E, 0xC10}, {0xC12, 0xC28}, {0xC2A, 0xC39}, {0xC3D, 0xC3D}, {0xC58, 0xC5A}, {0xC60, 0xC61}, {0xC80, 0xC80}, {0xC85, 0xC8C}, {0xC8E, 0xC90}, {0xC92, 0xCA8}, {0xCAA, 0xCB3},
{0xCB5, 0xCB9}, {0xCBD, 0xCBD}, {0xCDE, 0xCDE}, {0xCE0, 0xCE1}, {0xCF1, 0xCF2}, {0xD04, 0xD0C}, {0xD0E, 0xD10}, {0xD12, 0xD3A}, {0xD3D, 0xD3D}, {0xD4E, 0xD4E}, {0xD54, 0xD56}, {0xD5F, 0xD61},
{0xD7A, 0xD7F}, {0xD85, 0xD96}, {0xD9A, 0xDB1}, {0xDB3, 0xDBB}, {0xDBD, 0xDBD}, {0xDC0, 0xDC6}, {0xE01, 0xE30}, {0xE32, 0xE33}, {0xE40, 0xE46}, {0xE81, 0xE82}, {0xE84, 0xE84}, {0xE86, 0xE8A},
{0xE8C, 0xEA3}, {0xEA5, 0xEA5}, {0xEA7, 0xEB0}, {0xEB2, 0xEB3}, {0xEBD, 0xEBD}, {0xEC0, 0xEC4}, {0xEC6, 0xEC6}, {0xEDC, 0xEDF}, {0xF00, 0xF00}, {0xF40, 0xF47}, {0xF49, 0xF6C}, {0xF88, 0xF8C},
{0x1000, 0x102A}, {0x103F, 0x103F}, {0x1050, 0x1055}, {0x105A, 0x105D}, {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106E, 0x1070}, {0x1075, 0x1081}, {0x108E, 0x108E}, {0x10A0, 0x10C5}, {0x10C7, 0x10C7},
{0x10CD, 0x10CD}, {0x10D0, 0x10FA}, {0x10FC, 0x1248}, {0x124A, 0x124D}, {0x1250, 0x1256}, {0x1258, 0x1258}, {0x125A, 0x125D}, {0x1260, 0x1288}, {0x128A, 0x128D}, {0x1290, 0x12B0}, {0x12B2, 0x12B5},
{0x12B8, 0x12BE}, {0x12C0, 0x12C0}, {0x12C2, 0x12C5}, {0x12C8, 0x12D6}, {0x12D8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135A}, {0x1380, 0x138F}, {0x13A0, 0x13F5}, {0x13F8, 0x13FD}, {0x1401, 0x166C},
{0x166F, 0x167F}, {0x1681, 0x169A}, {0x16A0, 0x16EA}, {0x16F1, 0x16F8}, {0x1700, 0x170C}, {0x170E, 0x1711}, {0x1720, 0x1731}, {0x1740, 0x1751}, {0x1760, 0x176C}, {0x176E, 0x1770}, {0x1780, 0x17B3},
{0x17D7, 0x17D7}, {0x17DC, 0x17DC}, {0x1820, 0x1878}, {0x1880, 0x1884}, {0x1887, 0x18A8}, {0x18AA, 0x18AA}, {0x18B0, 0x18F5}, {0x1900, 0x191E}, {0x1950, 0x196D}, {0x1970, 0x1974}, {0x1980, 0x19AB},
{0x19B0, 0x19C9}, {0x1A00, 0x1A16}, {0x1A20, 0x1A54}, {0x1AA7, 0x1AA7}, {0x1B05, 0x1B33}, {0x1B45, 0x1B4B}, {0x1B83, 0x1BA0}, {0x1BAE, 0x1BAF}, {0x1BBA, 0x1BE5}, {0x1C00, 0x1C23}, {0x1C4D, 0x1C4F},
{0x1C5A, 0x1C7D}, {0x1C80, 0x1C88}, {0x1C90, 0x1CBA}, {0x1CBD, 0x1CBF}, {0x1CE9, 0x1CEC}, {0x1CEE, 0x1CF3}, {0x1CF5, 0x1CF6}, {0x1CFA, 0x1CFA}, {0x1D00, 0x1DBF}, {0x1E00, 0x1F15}, {0x1F18, 0x1F1D},
{0x1F20, 0x1F45}, {0x1F48, 0x1F4D}, {0x1F50, 0x1F57}, {0x1F59, 0x1F59}, {0x1F5B, 0x1F5B}, {0x1F5D, 0x1F5D}, {0x1F5F, 0x1F7D}, {0x1F80, 0x1FB4}, {0x1FB6, 0x1FBC}, {0x1FBE, 0x1FBE}, {0x1FC2, 0x1FC4},
{0x1FC6, 0x1FCC}, {0x1FD0, 0x1FD3}, {0x1FD6, 0x1FDB}, {0x1FE0, 0x1FEC}, {0x1FF2, 0x1FF4}, {0x1FF6, 0x1FFC}, {0x2071, 0x2071}, {0x207F, 0x207F}, {0x2090, 0x209C}, {0x2102, 0x2102}, {0x2107, 0x2107},
{0x210A, 0x2113}, {0x2115, 0x2115}, {0x2119, 0x211D}, {0x2124, 0x2124}, {0x2126, 0x2126}, {0x2128, 0x2128}, {0x212A, 0x212D}, {0x212F, 0x2139}, {0x213C, 0x213F}, {0x2145, 0x2149}, {0x214E, 0x214E},
{0x2183, 0x2184}, {0x2C00, 0x2C2E}, {0x2C30, 0x2C5E}, {0x2C60, 0x2CE4}, {0x2CEB, 0x2CEE}, {0x2CF2, 0x2CF3}, {0x2D00, 0x2D25}, {0x2D27, 0x2D27}, {0x2D2D, 0x2D2D}, {0x2D30, 0x2D67}, {0x2D6F, 0x2D6F},
{0x2D80, 0x2D96}, {0x2DA0, 0x2DA6}, {0x2DA8, 0x2DAE}, {0x2DB0, 0x2DB6}, {0x2DB8, 0x2DBE}, {0x2DC0, 0x2DC6}, {0x2DC8, 0x2DCE}, {0x2DD0, 0x2DD6}, {0x2DD8, 0x2DDE}, {0x2E2F, 0x2E2F}, {0x3005, 0x3006},
{0x3031, 0x3035}, {0x303B, 0x303C}, {0x3041, 0x3096}, {0x309D, 0x309F}, {0x30A1, 0x30FA}, {0x30FC, 0x30FF}, {0x3105, 0x312F}, {0x3131, 0x318E}, {0x31A0, 0x31BF}, {0x31F0, 0x31FF}, {0x3400, 0x4DBF},
{0x4E00, 0x9FFC}, {0xA000, 0xA48C}, {0xA4D0, 0xA4FD}, {0xA500, 0xA60C}, {0xA610, 0xA61F}, {0xA62A, 0xA62B}, {0xA640, 0xA66E}, {0xA67F, 0xA69D}, {0xA6A0, 0xA6E5}, {0xA717, 0xA71F}, {0xA722, 0xA788},
{0xA78B, 0xA7BF}, {0xA7C2, 0xA7CA}, {0xA7F5, 0xA801}, {0xA803, 0xA805}, {0xA807, 0xA80A}, {0xA80C, 0xA822}, {0xA840, 0xA873}, {0xA882, 0xA8B3}, {0xA8F2, 0xA8F7}, {0xA8FB, 0xA8FB}, {0xA8FD, 0xA8FE},
{0xA90A, 0xA925}, {0xA930, 0xA946}, {0xA960, 0xA97C}, {0xA984, 0xA9B2}, {0xA9CF, 0xA9CF}, {0xA9E0, 0xA9E4}, {0xA9E6, 0xA9EF}, {0xA9FA, 0xA9FE}, {0xAA00, 0xAA28}, {0xAA40, 0xAA42}, {0xAA44, 0xAA4B},
{0xAA60, 0xAA76}, {0xAA7A, 0xAA7A}, {0xAA7E, 0xAAAF}, {0xAAB1, 0xAAB1}, {0xAAB5, 0xAAB6}, {0xAAB9, 0xAABD}, {0xAAC0, 0xAAC0}, {0xAAC2, 0xAAC2}, {0xAADB, 0xAADD}, {0xAAE0, 0xAAEA}, {0xAAF2, 0xAAF4},
{0xAB01, 0xAB06}, {0xAB09, 0xAB0E}, {0xAB11, 0xAB16}, {0xAB20, 0xAB26}, {0xAB28, 0xAB2E}, {0xAB30, 0xAB5A}, {0xAB5C, 0xAB69}, {0xAB70, 0xABE2}, {0xAC00, 0xD7A3}, {0xD7B0, 0xD7C6}, {0xD7CB, 0xD7FB},
{0xF900, 0xFA6D}, {0xFA70, 0xFAD9}, {0xFB00, 0xFB06}, {0xFB13, 0xFB17}, {0xFB1D, 0xFB1D}, {0xFB1F, 0xFB28}, {0xFB2A, 0xFB36}, {0xFB38, 0xFB3C}, {0xFB3E, 0xFB3E}, {0xFB40, 0xFB41}, {0xFB43, 0xFB44},
{0xFB46, 0xFBB1}, {0xFBD3, 0xFD3D}, {0xFD50, 0xFD8F}, {0xFD92, 0xFDC7}, {0xFDF0, 0xFDFB}, {0xFE70, 0xFE74}, {0xFE76, 0xFEFC}, {0xFF21, 0xFF3A}, {0xFF41, 0xFF5A}, {0xFF66, 0xFFBE}, {0xFFC2, 0xFFC7},
{0xFFCA, 0xFFCF}, {0xFFD2, 0xFFD7}, {0xFFDA, 0xFFDC}, {0x10000, 0x1000B}, {0x1000D, 0x10026}, {0x10028, 0x1003A}, {0x1003C, 0x1003D}, {0x1003F, 0x1004D}, {0x10050, 0x1005D}, {0x10080, 0x100FA},
{0x10280, 0x1029C}, {0x102A0, 0x102D0}, {0x10300, 0x1031F}, {0x1032D, 0x10340}, {0x10342, 0x10349}, {0x10350, 0x10375}, {0x10380, 0x1039D}, {0x103A0, 0x103C3}, {0x103C8, 0x103CF}, {0x10400, 0x1049D},
{0x104B0, 0x104D3}, {0x104D8, 0x104FB}, {0x10500, 0x10527}, {0x10530, 0x10563}, {0x10600, 0x10736}, {0x10740, 0x10755}, {0x10760, 0x10767}, {0x10800, 0x10805}, {0x10808, 0x10808}, {0x1080A, 0x10835},
{0x10837, 0x10838}, {0x1083C, 0x1083C}, {0x1083F, 0x10855}, {0x10860, 0x10876}, {0x10880, 0x1089E}, {0x108E0, 0x108F2}, {0x108F4, 0x108F5}, {0x10900, 0x10915}, {0x10920, 0x10939}, {0x10980, 0x109B7},
{0x109BE, 0x109BF}, {0x10A00, 0x10A00}, {0x10A10, 0x10A13}, {0x10A15, 0x10A17}, {0x10A19, 0x10A35}, {0x10A60, 0x10A7C}, {0x10A80, 0x10A9C}, {0x10AC0, 0x10AC7}, {0x10AC9, 0x10AE4}, {0x10B00, 0x10B35},
{0x10B40, 0x10B55}, {0x10B60, 0x10B72}, {0x10B80, 0x10B91}, {0x10C00, 0x10C48}, {0x10C80, 0x10CB2}, {0x10CC0, 0x10CF2}, {0x10D00, 0x10D23}, {0x10E80, 0x10EA9}, {0x10EB0, 0x10EB1}, {0x10F00, 0x10F1C},
{0x10F27, 0x10F27}, {0x10F30, 0x10F45}, {0x10FB0, 0x10FC4}, {0x10FE0, 0x10FF6}, {0x11003, 0x11037}, {0x11083, 0x110AF}, {0x110D0, 0x110E8}, {0x11103, 0x11126}, {0x11144, 0x11144}, {0x11147, 0x11147},
{0x11150, 0x11172}, {0x11176, 0x11176}, {0x11183, 0x111B2}, {0x111C1, 0x111C4}, {0x111DA, 0x111DA}, {0x111DC, 0x111DC}, {0x11200, 0x11211}, {0x11213, 0x1122B}, {0x11280, 0x11286}, {0x11288, 0x11288},
{0x1128A, 0x1128D}, {0x1128F, 0x1129D}, {0x1129F, 0x112A8}, {0x112B0, 0x112DE}, {0x11305, 0x1130C}, {0x1130F, 0x11310}, {0x11313, 0x11328}, {0x1132A, 0x11330}, {0x11332, 0x11333}, {0x11335, 0x11339},
{0x1133D, 0x1133D}, {0x11350, 0x11350}, {0x1135D, 0x11361}, {0x11400, 0x11434}, {0x11447, 0x1144A}, {0x1145F, 0x11461}, {0x11480, 0x114AF}, {0x114C4, 0x114C5}, {0x114C7, 0x114C7}, {0x11580, 0x115AE},
{0x115D8, 0x115DB}, {0x11600, 0x1162F}, {0x11644, 0x11644}, {0x11680, 0x116AA}, {0x116B8, 0x116B8}, {0x11700, 0x1171A}, {0x11800, 0x1182B}, {0x118A0, 0x118DF}, {0x118FF, 0x11906}, {0x11909, 0x11909},
{0x1190C, 0x11913}, {0x11915, 0x11916}, {0x11918, 0x1192F}, {0x1193F, 0x1193F}, {0x11941, 0x11941}, {0x119A0, 0x119A7}, {0x119AA, 0x119D0}, {0x119E1, 0x119E1}, {0x119E3, 0x119E3}, {0x11A00, 0x11A00},
{0x11A0B, 0x11A32}, {0x11A3A, 0x11A3A}, {0x11A50, 0x11A50}, {0x11A5C, 0x11A89}, {0x11A9D, 0x11A9D}, {0x11AC0, 0x11AF8}, {0x11C00, 0x11C08}, {0x11C0A, 0x11C2E}, {0x11C40, 0x11C40}, {0x11C72, 0x11C8F},
{0x11D00, 0x11D06}, {0x11D08, 0x11D09}, {0x11D0B, 0x11D30}, {0x11D46, 0x11D46}, {0x11D60, 0x11D65}, {0x11D67, 0x11D68}, {0x11D6A, 0x11D89}, {0x11D98, 0x11D98}, {0x11EE0, 0x11EF2}, {0x11FB0, 0x11FB0},
{0x12000, 0x12399}, {0x12480, 0x12543}, {0x13000, 0x1342E}, {0x14400, 0x14646}, {0x16800, 0x16A38}, {0x16A40, 0x16A5E}, {0x16AD0, 0x16AED}, {0x16B00, 0x16B2F}, {0x16B40, 0x16B43}, {0x16B63, 0x16B77},
{0x16B7D, 0x16B8F}, {0x16E40, 0x16E7F}, {0x16F00, 0x16F4A}, {0x16F50, 0x16F50}, {0x16F93, 0x16F9F}, {0x16FE0, 0x16FE1}, {0x16FE3, 0x16FE3}, {0x17000, 0x187F7}, {0x18800, 0x18CD5}, {0x18D00, 0x18D08},
{0x1B000, 0x1B11E}, {0x1B150, 0x1B152}, {0x1B164, 0x1B167}, {0x1B170, 0x1B2FB}, {0x1BC00, 0x1BC6A}, {0x1BC70, 0x1BC7C}, {0x1BC80, 0x1BC88}, {0x1BC90, 0x1BC99}, {0x1D400, 0x1D454}, {0x1D456, 0x1D49C},
{0x1D49E, 0x1D49F}, {0x1D4A2, 0x1D4A2}, {0x1D4A5, 0x1D4A6}, {0x1D4A9, 0x1D4AC}, {0x1D4AE, 0x1D4B9}, {0x1D4BB, 0x1D4BB}, {0x1D4BD, 0x1D4C3}, {0x1D4C5, 0x1D505}, {0x1D507, 0x1D50A}, {0x1D50D, 0x1D514},
{0x1D516, 0x1D51C}, {0x1D51E, 0x1D539}, {0x1D53B, 0x1D53E}, {0x1D540, 0x1D544}, {0x1D546, 0x1D546}, {0x1D54A, 0x1D550}, {0x1D552, 0x1D6A5}, {0x1D6A8, 0x1D6C0}, {0x1D6C2, 0x1D6DA}, {0x1D6DC, 0x1D6FA},
{0x1D6FC, 0x1D714}, {0x1D716, 0x1D734}, {0x1D736, 0x1D74E}, {0x1D750, 0x1D76E}, {0x1D770, 0x1D788}, {0x1D78A, 0x1D7A8}, {0x1D7AA, 0x1D7C2}, {0x1D7C4, 0x1D7CB}, {0x1E100, 0x1E12C}, {0x1E137, 0x1E13D},
{0x1E14E, 0x1E14E}, {0x1E2C0, 0x1E2EB}, {0x1E800, 0x1E8C4}, {0x1E900, 0x1E943}, {0x1E94B, 0x1E94B}, {0x1EE00, 0x1EE03}, {0x1EE05, 0x1EE1F}, {0x1EE21, 0x1EE22}, {0x1EE24, 0x1EE24}, {0x1EE27, 0x1EE27},
{0x1EE29, 0x1EE32}, {0x1EE34, 0x1EE37}, {0x1EE39, 0x1EE39}, {0x1EE3B, 0x1EE3B}, {0x1EE42, 0x1EE42}, {0x1EE47, 0x1EE47}, {0x1EE49, 0x1EE49}, {0x1EE4B, 0x1EE4B}, {0x1EE4D, 0x1EE4F}, {0x1EE51, 0x1EE52},
{0x1EE54, 0x1EE54}, {0x1EE57, 0x1EE57}, {0x1EE59, 0x1EE59}, {0x1EE5B, 0x1EE5B}, {0x1EE5D, 0x1EE5D}, {0x1EE5F, 0x1EE5F}, {0x1EE61, 0x1EE62}, {0x1EE64, 0x1EE64}, {0x1EE67, 0x1EE6A}, {0x1EE6C, 0x1EE72},
{0x1EE74, 0x1EE77}, {0x1EE79, 0x1EE7C}, {0x1EE7E, 0x1EE7E}, {0x1EE80, 0x1EE89}, {0x1EE8B, 0x1EE9B}, {0x1EEA1, 0x1EEA3}, {0x1EEA5, 0x1EEA9}, {0x1EEAB, 0x1EEBB}, {0x20000, 0x2A6DD}, {0x2A700, 0x2B734},
{0x2B740, 0x2B81D}, {0x2B820, 0x2CEA1}, {0x2CEB0, 0x2EBE0}, {0x2F800, 0x2FA1D}, {0x30000, 0x3134A},
};
static const std::vector<std::pair<uint32_t, uint32_t>> whitespace_ranges = {
{0x9, 0xD}, {0x1C, 0x20}, {0x85, 0x85}, {0xA0, 0xA0}, {0x1680, 0x1680}, {0x2000, 0x200A}, {0x2028, 0x2029}, {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000},
};
std::string unicode_cpt_to_utf8(uint32_t cp);
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
static const std::vector<std::pair<uint32_t, uint32_t>> accent_mark_ranges = {
{0x300, 0x36F}, {0x483, 0x489}, {0x591, 0x5BD}, {0x5BF, 0x5BF}, {0x5C1, 0x5C2}, {0x5C4, 0x5C5}, {0x5C7, 0x5C7}, {0x610, 0x61A}, {0x64B, 0x65F}, {0x670, 0x670}, {0x6D6, 0x6DC}, {0x6DF, 0x6E4},
{0x6E7, 0x6E8}, {0x6EA, 0x6ED}, {0x711, 0x711}, {0x730, 0x74A}, {0x7A6, 0x7B0}, {0x7EB, 0x7F3}, {0x7FD, 0x7FD}, {0x816, 0x819}, {0x81B, 0x823}, {0x825, 0x827}, {0x829, 0x82D}, {0x859, 0x85B},
{0x8D3, 0x8E1}, {0x8E3, 0x903}, {0x93A, 0x93C}, {0x93E, 0x94F}, {0x951, 0x957}, {0x962, 0x963}, {0x981, 0x983}, {0x9BC, 0x9BC}, {0x9BE, 0x9C4}, {0x9C7, 0x9C8}, {0x9CB, 0x9CD}, {0x9D7, 0x9D7},
{0x9E2, 0x9E3}, {0x9FE, 0x9FE}, {0xA01, 0xA03}, {0xA3C, 0xA3C}, {0xA3E, 0xA42}, {0xA47, 0xA48}, {0xA4B, 0xA4D}, {0xA51, 0xA51}, {0xA70, 0xA71}, {0xA75, 0xA75}, {0xA81, 0xA83}, {0xABC, 0xABC},
{0xABE, 0xAC5}, {0xAC7, 0xAC9}, {0xACB, 0xACD}, {0xAE2, 0xAE3}, {0xAFA, 0xAFF}, {0xB01, 0xB03}, {0xB3C, 0xB3C}, {0xB3E, 0xB44}, {0xB47, 0xB48}, {0xB4B, 0xB4D}, {0xB55, 0xB57}, {0xB62, 0xB63},
{0xB82, 0xB82}, {0xBBE, 0xBC2}, {0xBC6, 0xBC8}, {0xBCA, 0xBCD}, {0xBD7, 0xBD7}, {0xC00, 0xC04}, {0xC3E, 0xC44}, {0xC46, 0xC48}, {0xC4A, 0xC4D}, {0xC55, 0xC56}, {0xC62, 0xC63}, {0xC81, 0xC83},
{0xCBC, 0xCBC}, {0xCBE, 0xCC4}, {0xCC6, 0xCC8}, {0xCCA, 0xCCD}, {0xCD5, 0xCD6}, {0xCE2, 0xCE3}, {0xD00, 0xD03}, {0xD3B, 0xD3C}, {0xD3E, 0xD44}, {0xD46, 0xD48}, {0xD4A, 0xD4D}, {0xD57, 0xD57},
{0xD62, 0xD63}, {0xD81, 0xD83}, {0xDCA, 0xDCA}, {0xDCF, 0xDD4}, {0xDD6, 0xDD6}, {0xDD8, 0xDDF}, {0xDF2, 0xDF3}, {0xE31, 0xE31}, {0xE34, 0xE3A}, {0xE47, 0xE4E}, {0xEB1, 0xEB1}, {0xEB4, 0xEBC},
{0xEC8, 0xECD}, {0xF18, 0xF19}, {0xF35, 0xF35}, {0xF37, 0xF37}, {0xF39, 0xF39}, {0xF3E, 0xF3F}, {0xF71, 0xF84}, {0xF86, 0xF87}, {0xF8D, 0xF97}, {0xF99, 0xFBC}, {0xFC6, 0xFC6}, {0x102B, 0x103E},
{0x1056, 0x1059}, {0x105E, 0x1060}, {0x1062, 0x1064}, {0x1067, 0x106D}, {0x1071, 0x1074}, {0x1082, 0x108D}, {0x108F, 0x108F}, {0x109A, 0x109D}, {0x135D, 0x135F}, {0x1712, 0x1714}, {0x1732, 0x1734},
{0x1752, 0x1753}, {0x1772, 0x1773}, {0x17B4, 0x17D3}, {0x17DD, 0x17DD}, {0x180B, 0x180D}, {0x1885, 0x1886}, {0x18A9, 0x18A9}, {0x1920, 0x192B}, {0x1930, 0x193B}, {0x1A17, 0x1A1B}, {0x1A55, 0x1A5E},
{0x1A60, 0x1A7C}, {0x1A7F, 0x1A7F}, {0x1AB0, 0x1AC0}, {0x1B00, 0x1B04}, {0x1B34, 0x1B44}, {0x1B6B, 0x1B73}, {0x1B80, 0x1B82}, {0x1BA1, 0x1BAD}, {0x1BE6, 0x1BF3}, {0x1C24, 0x1C37}, {0x1CD0, 0x1CD2},
{0x1CD4, 0x1CE8}, {0x1CED, 0x1CED}, {0x1CF4, 0x1CF4}, {0x1CF7, 0x1CF9}, {0x1DC0, 0x1DF9}, {0x1DFB, 0x1DFF}, {0x20D0, 0x20F0}, {0x2CEF, 0x2CF1}, {0x2D7F, 0x2D7F}, {0x2DE0, 0x2DFF}, {0x302A, 0x302F},
{0x3099, 0x309A}, {0xA66F, 0xA672}, {0xA674, 0xA67D}, {0xA69E, 0xA69F}, {0xA6F0, 0xA6F1}, {0xA802, 0xA802}, {0xA806, 0xA806}, {0xA80B, 0xA80B}, {0xA823, 0xA827}, {0xA82C, 0xA82C}, {0xA880, 0xA881},
{0xA8B4, 0xA8C5}, {0xA8E0, 0xA8F1}, {0xA8FF, 0xA8FF}, {0xA926, 0xA92D}, {0xA947, 0xA953}, {0xA980, 0xA983}, {0xA9B3, 0xA9C0}, {0xA9E5, 0xA9E5}, {0xAA29, 0xAA36}, {0xAA43, 0xAA43}, {0xAA4C, 0xAA4D},
{0xAA7B, 0xAA7D}, {0xAAB0, 0xAAB0}, {0xAAB2, 0xAAB4}, {0xAAB7, 0xAAB8}, {0xAABE, 0xAABF}, {0xAAC1, 0xAAC1}, {0xAAEB, 0xAAEF}, {0xAAF5, 0xAAF6}, {0xABE3, 0xABEA}, {0xABEC, 0xABED}, {0xFB1E, 0xFB1E},
{0xFE00, 0xFE0F}, {0xFE20, 0xFE2F}, {0x101FD, 0x101FD}, {0x102E0, 0x102E0}, {0x10376, 0x1037A}, {0x10A01, 0x10A03}, {0x10A05, 0x10A06}, {0x10A0C, 0x10A0F}, {0x10A38, 0x10A3A}, {0x10A3F, 0x10A3F},
{0x10AE5, 0x10AE6}, {0x10D24, 0x10D27}, {0x10EAB, 0x10EAC}, {0x10F46, 0x10F50}, {0x11000, 0x11002}, {0x11038, 0x11046}, {0x1107F, 0x11082}, {0x110B0, 0x110BA}, {0x11100, 0x11102}, {0x11127, 0x11134},
{0x11145, 0x11146}, {0x11173, 0x11173}, {0x11180, 0x11182}, {0x111B3, 0x111C0}, {0x111C9, 0x111CC}, {0x111CE, 0x111CF}, {0x1122C, 0x11237}, {0x1123E, 0x1123E}, {0x112DF, 0x112EA}, {0x11300, 0x11303},
{0x1133B, 0x1133C}, {0x1133E, 0x11344}, {0x11347, 0x11348}, {0x1134B, 0x1134D}, {0x11357, 0x11357}, {0x11362, 0x11363}, {0x11366, 0x1136C}, {0x11370, 0x11374}, {0x11435, 0x11446}, {0x1145E, 0x1145E},
{0x114B0, 0x114C3}, {0x115AF, 0x115B5}, {0x115B8, 0x115C0}, {0x115DC, 0x115DD}, {0x11630, 0x11640}, {0x116AB, 0x116B7}, {0x1171D, 0x1172B}, {0x1182C, 0x1183A}, {0x11930, 0x11935}, {0x11937, 0x11938},
{0x1193B, 0x1193E}, {0x11940, 0x11940}, {0x11942, 0x11943}, {0x119D1, 0x119D7}, {0x119DA, 0x119E0}, {0x119E4, 0x119E4}, {0x11A01, 0x11A0A}, {0x11A33, 0x11A39}, {0x11A3B, 0x11A3E}, {0x11A47, 0x11A47},
{0x11A51, 0x11A5B}, {0x11A8A, 0x11A99}, {0x11C2F, 0x11C36}, {0x11C38, 0x11C3F}, {0x11C92, 0x11CA7}, {0x11CA9, 0x11CB6}, {0x11D31, 0x11D36}, {0x11D3A, 0x11D3A}, {0x11D3C, 0x11D3D}, {0x11D3F, 0x11D45},
{0x11D47, 0x11D47}, {0x11D8A, 0x11D8E}, {0x11D90, 0x11D91}, {0x11D93, 0x11D97}, {0x11EF3, 0x11EF6}, {0x16AF0, 0x16AF4}, {0x16B30, 0x16B36}, {0x16F4F, 0x16F4F}, {0x16F51, 0x16F87}, {0x16F8F, 0x16F92},
{0x16FE4, 0x16FE4}, {0x16FF0, 0x16FF1}, {0x1BC9D, 0x1BC9E}, {0x1D165, 0x1D169}, {0x1D16D, 0x1D172}, {0x1D17B, 0x1D182}, {0x1D185, 0x1D18B}, {0x1D1AA, 0x1D1AD}, {0x1D242, 0x1D244}, {0x1DA00, 0x1DA36},
{0x1DA3B, 0x1DA6C}, {0x1DA75, 0x1DA75}, {0x1DA84, 0x1DA84}, {0x1DA9B, 0x1DA9F}, {0x1DAA1, 0x1DAAF}, {0x1E000, 0x1E006}, {0x1E008, 0x1E018}, {0x1E01B, 0x1E021}, {0x1E023, 0x1E024}, {0x1E026, 0x1E02A},
{0x1E130, 0x1E136}, {0x1E2EC, 0x1E2EF}, {0x1E8D0, 0x1E8D6}, {0x1E944, 0x1E94A}, {0xE0100, 0xE01EF},
};
codepoint_flags unicode_cpt_flags(const uint32_t cp);
codepoint_flags unicode_cpt_flags(const std::string & utf8);
static const std::vector<std::pair<uint32_t, uint32_t>> punctuation_ranges = {
{0x21, 0x23}, {0x25, 0x2A}, {0x2C, 0x2F}, {0x3A, 0x3B}, {0x3F, 0x40}, {0x5B, 0x5D}, {0x5F, 0x5F}, {0x7B, 0x7B}, {0x7D, 0x7D}, {0xA1, 0xA1}, {0xA7, 0xA7}, {0xAB, 0xAB}, {0xB6, 0xB7}, {0xBB, 0xBB},
{0xBF, 0xBF}, {0x37E, 0x37E}, {0x387, 0x387}, {0x55A, 0x55F}, {0x589, 0x58A}, {0x5BE, 0x5BE}, {0x5C0, 0x5C0}, {0x5C3, 0x5C3}, {0x5C6, 0x5C6}, {0x5F3, 0x5F4}, {0x609, 0x60A}, {0x60C, 0x60D},
{0x61B, 0x61B}, {0x61E, 0x61F}, {0x66A, 0x66D}, {0x6D4, 0x6D4}, {0x700, 0x70D}, {0x7F7, 0x7F9}, {0x830, 0x83E}, {0x85E, 0x85E}, {0x964, 0x965}, {0x970, 0x970}, {0x9FD, 0x9FD}, {0xA76, 0xA76},
{0xAF0, 0xAF0}, {0xC77, 0xC77}, {0xC84, 0xC84}, {0xDF4, 0xDF4}, {0xE4F, 0xE4F}, {0xE5A, 0xE5B}, {0xF04, 0xF12}, {0xF14, 0xF14}, {0xF3A, 0xF3D}, {0xF85, 0xF85}, {0xFD0, 0xFD4}, {0xFD9, 0xFDA},
{0x104A, 0x104F}, {0x10FB, 0x10FB}, {0x1360, 0x1368}, {0x1400, 0x1400}, {0x166E, 0x166E}, {0x169B, 0x169C}, {0x16EB, 0x16ED}, {0x1735, 0x1736}, {0x17D4, 0x17D6}, {0x17D8, 0x17DA}, {0x1800, 0x180A},
{0x1944, 0x1945}, {0x1A1E, 0x1A1F}, {0x1AA0, 0x1AA6}, {0x1AA8, 0x1AAD}, {0x1B5A, 0x1B60}, {0x1BFC, 0x1BFF}, {0x1C3B, 0x1C3F}, {0x1C7E, 0x1C7F}, {0x1CC0, 0x1CC7}, {0x1CD3, 0x1CD3}, {0x2010, 0x2027},
{0x2030, 0x2043}, {0x2045, 0x2051}, {0x2053, 0x205E}, {0x207D, 0x207E}, {0x208D, 0x208E}, {0x2308, 0x230B}, {0x2329, 0x232A}, {0x2768, 0x2775}, {0x27C5, 0x27C6}, {0x27E6, 0x27EF}, {0x2983, 0x2998},
{0x29D8, 0x29DB}, {0x29FC, 0x29FD}, {0x2CF9, 0x2CFC}, {0x2CFE, 0x2CFF}, {0x2D70, 0x2D70}, {0x2E00, 0x2E2E}, {0x2E30, 0x2E4F}, {0x2E52, 0x2E52}, {0x3001, 0x3003}, {0x3008, 0x3011}, {0x3014, 0x301F},
{0x3030, 0x3030}, {0x303D, 0x303D}, {0x30A0, 0x30A0}, {0x30FB, 0x30FB}, {0xA4FE, 0xA4FF}, {0xA60D, 0xA60F}, {0xA673, 0xA673}, {0xA67E, 0xA67E}, {0xA6F2, 0xA6F7}, {0xA874, 0xA877}, {0xA8CE, 0xA8CF},
{0xA8F8, 0xA8FA}, {0xA8FC, 0xA8FC}, {0xA92E, 0xA92F}, {0xA95F, 0xA95F}, {0xA9C1, 0xA9CD}, {0xA9DE, 0xA9DF}, {0xAA5C, 0xAA5F}, {0xAADE, 0xAADF}, {0xAAF0, 0xAAF1}, {0xABEB, 0xABEB}, {0xFD3E, 0xFD3F},
{0xFE10, 0xFE19}, {0xFE30, 0xFE52}, {0xFE54, 0xFE61}, {0xFE63, 0xFE63}, {0xFE68, 0xFE68}, {0xFE6A, 0xFE6B}, {0xFF01, 0xFF03}, {0xFF05, 0xFF0A}, {0xFF0C, 0xFF0F}, {0xFF1A, 0xFF1B}, {0xFF1F, 0xFF20},
{0xFF3B, 0xFF3D}, {0xFF3F, 0xFF3F}, {0xFF5B, 0xFF5B}, {0xFF5D, 0xFF5D}, {0xFF5F, 0xFF65}, {0x10100, 0x10102}, {0x1039F, 0x1039F}, {0x103D0, 0x103D0}, {0x1056F, 0x1056F}, {0x10857, 0x10857},
{0x1091F, 0x1091F}, {0x1093F, 0x1093F}, {0x10A50, 0x10A58}, {0x10A7F, 0x10A7F}, {0x10AF0, 0x10AF6}, {0x10B39, 0x10B3F}, {0x10B99, 0x10B9C}, {0x10EAD, 0x10EAD}, {0x10F55, 0x10F59}, {0x11047, 0x1104D},
{0x110BB, 0x110BC}, {0x110BE, 0x110C1}, {0x11140, 0x11143}, {0x11174, 0x11175}, {0x111C5, 0x111C8}, {0x111CD, 0x111CD}, {0x111DB, 0x111DB}, {0x111DD, 0x111DF}, {0x11238, 0x1123D}, {0x112A9, 0x112A9},
{0x1144B, 0x1144F}, {0x1145A, 0x1145B}, {0x1145D, 0x1145D}, {0x114C6, 0x114C6}, {0x115C1, 0x115D7}, {0x11641, 0x11643}, {0x11660, 0x1166C}, {0x1173C, 0x1173E}, {0x1183B, 0x1183B}, {0x11944, 0x11946},
{0x119E2, 0x119E2}, {0x11A3F, 0x11A46}, {0x11A9A, 0x11A9C}, {0x11A9E, 0x11AA2}, {0x11C41, 0x11C45}, {0x11C70, 0x11C71}, {0x11EF7, 0x11EF8}, {0x11FFF, 0x11FFF}, {0x12470, 0x12474}, {0x16A6E, 0x16A6F},
{0x16AF5, 0x16AF5}, {0x16B37, 0x16B3B}, {0x16B44, 0x16B44}, {0x16E97, 0x16E9A}, {0x16FE2, 0x16FE2}, {0x1BC9F, 0x1BC9F}, {0x1DA87, 0x1DA8B}, {0x1E95E, 0x1E95F},
};
std::string unicode_byte_to_utf8(uint8_t byte);
uint8_t unicode_utf8_to_byte(const std::string & utf8);
static const std::vector<std::pair<uint32_t, uint32_t>> symbol_ranges = {
{0x24, 0x24}, {0x2B, 0x2B}, {0x3C, 0x3E}, {0x5E, 0x5E}, {0x60, 0x60}, {0x7C, 0x7C}, {0x7E, 0x7E}, {0xA2, 0xA6}, {0xA8, 0xA9}, {0xAC, 0xAC}, {0xAE, 0xB1}, {0xB4, 0xB4}, {0xB8, 0xB8}, {0xD7, 0xD7},
{0xF7, 0xF7}, {0x2C2, 0x2C5}, {0x2D2, 0x2DF}, {0x2E5, 0x2EB}, {0x2ED, 0x2ED}, {0x2EF, 0x2FF}, {0x375, 0x375}, {0x384, 0x385}, {0x3F6, 0x3F6}, {0x482, 0x482}, {0x58D, 0x58F}, {0x606, 0x608},
{0x60B, 0x60B}, {0x60E, 0x60F}, {0x6DE, 0x6DE}, {0x6E9, 0x6E9}, {0x6FD, 0x6FE}, {0x7F6, 0x7F6}, {0x7FE, 0x7FF}, {0x9F2, 0x9F3}, {0x9FA, 0x9FB}, {0xAF1, 0xAF1}, {0xB70, 0xB70}, {0xBF3, 0xBFA},
{0xC7F, 0xC7F}, {0xD4F, 0xD4F}, {0xD79, 0xD79}, {0xE3F, 0xE3F}, {0xF01, 0xF03}, {0xF13, 0xF13}, {0xF15, 0xF17}, {0xF1A, 0xF1F}, {0xF34, 0xF34}, {0xF36, 0xF36}, {0xF38, 0xF38}, {0xFBE, 0xFC5},
{0xFC7, 0xFCC}, {0xFCE, 0xFCF}, {0xFD5, 0xFD8}, {0x109E, 0x109F}, {0x1390, 0x1399}, {0x166D, 0x166D}, {0x17DB, 0x17DB}, {0x1940, 0x1940}, {0x19DE, 0x19FF}, {0x1B61, 0x1B6A}, {0x1B74, 0x1B7C},
{0x1FBD, 0x1FBD}, {0x1FBF, 0x1FC1}, {0x1FCD, 0x1FCF}, {0x1FDD, 0x1FDF}, {0x1FED, 0x1FEF}, {0x1FFD, 0x1FFE}, {0x2044, 0x2044}, {0x2052, 0x2052}, {0x207A, 0x207C}, {0x208A, 0x208C}, {0x20A0, 0x20BF},
{0x2100, 0x2101}, {0x2103, 0x2106}, {0x2108, 0x2109}, {0x2114, 0x2114}, {0x2116, 0x2118}, {0x211E, 0x2123}, {0x2125, 0x2125}, {0x2127, 0x2127}, {0x2129, 0x2129}, {0x212E, 0x212E}, {0x213A, 0x213B},
{0x2140, 0x2144}, {0x214A, 0x214D}, {0x214F, 0x214F}, {0x218A, 0x218B}, {0x2190, 0x2307}, {0x230C, 0x2328}, {0x232B, 0x2426}, {0x2440, 0x244A}, {0x249C, 0x24E9}, {0x2500, 0x2767}, {0x2794, 0x27C4},
{0x27C7, 0x27E5}, {0x27F0, 0x2982}, {0x2999, 0x29D7}, {0x29DC, 0x29FB}, {0x29FE, 0x2B73}, {0x2B76, 0x2B95}, {0x2B97, 0x2BFF}, {0x2CE5, 0x2CEA}, {0x2E50, 0x2E51}, {0x2E80, 0x2E99}, {0x2E9B, 0x2EF3},
{0x2F00, 0x2FD5}, {0x2FF0, 0x2FFB}, {0x3004, 0x3004}, {0x3012, 0x3013}, {0x3020, 0x3020}, {0x3036, 0x3037}, {0x303E, 0x303F}, {0x309B, 0x309C}, {0x3190, 0x3191}, {0x3196, 0x319F}, {0x31C0, 0x31E3},
{0x3200, 0x321E}, {0x322A, 0x3247}, {0x3250, 0x3250}, {0x3260, 0x327F}, {0x328A, 0x32B0}, {0x32C0, 0x33FF}, {0x4DC0, 0x4DFF}, {0xA490, 0xA4C6}, {0xA700, 0xA716}, {0xA720, 0xA721}, {0xA789, 0xA78A},
{0xA828, 0xA82B}, {0xA836, 0xA839}, {0xAA77, 0xAA79}, {0xAB5B, 0xAB5B}, {0xAB6A, 0xAB6B}, {0xFB29, 0xFB29}, {0xFBB2, 0xFBC1}, {0xFDFC, 0xFDFD}, {0xFE62, 0xFE62}, {0xFE64, 0xFE66}, {0xFE69, 0xFE69},
{0xFF04, 0xFF04}, {0xFF0B, 0xFF0B}, {0xFF1C, 0xFF1E}, {0xFF3E, 0xFF3E}, {0xFF40, 0xFF40}, {0xFF5C, 0xFF5C}, {0xFF5E, 0xFF5E}, {0xFFE0, 0xFFE6}, {0xFFE8, 0xFFEE}, {0xFFFC, 0xFFFD}, {0x10137, 0x1013F},
{0x10179, 0x10189}, {0x1018C, 0x1018E}, {0x10190, 0x1019C}, {0x101A0, 0x101A0}, {0x101D0, 0x101FC}, {0x10877, 0x10878}, {0x10AC8, 0x10AC8}, {0x1173F, 0x1173F}, {0x11FD5, 0x11FF1}, {0x16B3C, 0x16B3F},
{0x16B45, 0x16B45}, {0x1BC9C, 0x1BC9C}, {0x1D000, 0x1D0F5}, {0x1D100, 0x1D126}, {0x1D129, 0x1D164}, {0x1D16A, 0x1D16C}, {0x1D183, 0x1D184}, {0x1D18C, 0x1D1A9}, {0x1D1AE, 0x1D1E8}, {0x1D200, 0x1D241},
{0x1D245, 0x1D245}, {0x1D300, 0x1D356}, {0x1D6C1, 0x1D6C1}, {0x1D6DB, 0x1D6DB}, {0x1D6FB, 0x1D6FB}, {0x1D715, 0x1D715}, {0x1D735, 0x1D735}, {0x1D74F, 0x1D74F}, {0x1D76F, 0x1D76F}, {0x1D789, 0x1D789},
{0x1D7A9, 0x1D7A9}, {0x1D7C3, 0x1D7C3}, {0x1D800, 0x1D9FF}, {0x1DA37, 0x1DA3A}, {0x1DA6D, 0x1DA74}, {0x1DA76, 0x1DA83}, {0x1DA85, 0x1DA86}, {0x1E14F, 0x1E14F}, {0x1E2FF, 0x1E2FF}, {0x1ECAC, 0x1ECAC},
{0x1ECB0, 0x1ECB0}, {0x1ED2E, 0x1ED2E}, {0x1EEF0, 0x1EEF1}, {0x1F000, 0x1F02B}, {0x1F030, 0x1F093}, {0x1F0A0, 0x1F0AE}, {0x1F0B1, 0x1F0BF}, {0x1F0C1, 0x1F0CF}, {0x1F0D1, 0x1F0F5}, {0x1F10D, 0x1F1AD},
{0x1F1E6, 0x1F202}, {0x1F210, 0x1F23B}, {0x1F240, 0x1F248}, {0x1F250, 0x1F251}, {0x1F260, 0x1F265}, {0x1F300, 0x1F6D7}, {0x1F6E0, 0x1F6EC}, {0x1F6F0, 0x1F6FC}, {0x1F700, 0x1F773}, {0x1F780, 0x1F7D8},
{0x1F7E0, 0x1F7EB}, {0x1F800, 0x1F80B}, {0x1F810, 0x1F847}, {0x1F850, 0x1F859}, {0x1F860, 0x1F887}, {0x1F890, 0x1F8AD}, {0x1F8B0, 0x1F8B1}, {0x1F900, 0x1F978}, {0x1F97A, 0x1F9CB}, {0x1F9CD, 0x1FA53},
{0x1FA60, 0x1FA6D}, {0x1FA70, 0x1FA74}, {0x1FA78, 0x1FA7A}, {0x1FA80, 0x1FA86}, {0x1FA90, 0x1FAA8}, {0x1FAB0, 0x1FAB6}, {0x1FAC0, 0x1FAC2}, {0x1FAD0, 0x1FAD6}, {0x1FB00, 0x1FB92}, {0x1FB94, 0x1FBCA},
};
uint32_t unicode_tolower(uint32_t cp);
static const std::vector<std::pair<uint32_t, uint32_t>> control_ranges = {
{0x0, 0x8}, {0xE, 0x1B}, {0x7F, 0x84}, {0x86, 0x9F}, {0xAD, 0xAD}, {0x378, 0x379}, {0x380, 0x383}, {0x38B, 0x38B}, {0x38D, 0x38D}, {0x3A2, 0x3A2}, {0x530, 0x530}, {0x557, 0x558}, {0x58B, 0x58C},
{0x590, 0x590}, {0x5C8, 0x5CF}, {0x5EB, 0x5EE}, {0x5F5, 0x605}, {0x61C, 0x61D}, {0x6DD, 0x6DD}, {0x70E, 0x70F}, {0x74B, 0x74C}, {0x7B2, 0x7BF}, {0x7FB, 0x7FC}, {0x82E, 0x82F}, {0x83F, 0x83F},
{0x85C, 0x85D}, {0x85F, 0x85F}, {0x86B, 0x89F}, {0x8B5, 0x8B5}, {0x8C8, 0x8D2}, {0x8E2, 0x8E2}, {0x984, 0x984}, {0x98D, 0x98E}, {0x991, 0x992}, {0x9A9, 0x9A9}, {0x9B1, 0x9B1}, {0x9B3, 0x9B5},
{0x9BA, 0x9BB}, {0x9C5, 0x9C6}, {0x9C9, 0x9CA}, {0x9CF, 0x9D6}, {0x9D8, 0x9DB}, {0x9DE, 0x9DE}, {0x9E4, 0x9E5}, {0x9FF, 0xA00}, {0xA04, 0xA04}, {0xA0B, 0xA0E}, {0xA11, 0xA12}, {0xA29, 0xA29},
{0xA31, 0xA31}, {0xA34, 0xA34}, {0xA37, 0xA37}, {0xA3A, 0xA3B}, {0xA3D, 0xA3D}, {0xA43, 0xA46}, {0xA49, 0xA4A}, {0xA4E, 0xA50}, {0xA52, 0xA58}, {0xA5D, 0xA5D}, {0xA5F, 0xA65}, {0xA77, 0xA80},
{0xA84, 0xA84}, {0xA8E, 0xA8E}, {0xA92, 0xA92}, {0xAA9, 0xAA9}, {0xAB1, 0xAB1}, {0xAB4, 0xAB4}, {0xABA, 0xABB}, {0xAC6, 0xAC6}, {0xACA, 0xACA}, {0xACE, 0xACF}, {0xAD1, 0xADF}, {0xAE4, 0xAE5},
{0xAF2, 0xAF8}, {0xB00, 0xB00}, {0xB04, 0xB04}, {0xB0D, 0xB0E}, {0xB11, 0xB12}, {0xB29, 0xB29}, {0xB31, 0xB31}, {0xB34, 0xB34}, {0xB3A, 0xB3B}, {0xB45, 0xB46}, {0xB49, 0xB4A}, {0xB4E, 0xB54},
{0xB58, 0xB5B}, {0xB5E, 0xB5E}, {0xB64, 0xB65}, {0xB78, 0xB81}, {0xB84, 0xB84}, {0xB8B, 0xB8D}, {0xB91, 0xB91}, {0xB96, 0xB98}, {0xB9B, 0xB9B}, {0xB9D, 0xB9D}, {0xBA0, 0xBA2}, {0xBA5, 0xBA7},
{0xBAB, 0xBAD}, {0xBBA, 0xBBD}, {0xBC3, 0xBC5}, {0xBC9, 0xBC9}, {0xBCE, 0xBCF}, {0xBD1, 0xBD6}, {0xBD8, 0xBE5}, {0xBFB, 0xBFF}, {0xC0D, 0xC0D}, {0xC11, 0xC11}, {0xC29, 0xC29}, {0xC3A, 0xC3C},
{0xC45, 0xC45}, {0xC49, 0xC49}, {0xC4E, 0xC54}, {0xC57, 0xC57}, {0xC5B, 0xC5F}, {0xC64, 0xC65}, {0xC70, 0xC76}, {0xC8D, 0xC8D}, {0xC91, 0xC91}, {0xCA9, 0xCA9}, {0xCB4, 0xCB4}, {0xCBA, 0xCBB},
{0xCC5, 0xCC5}, {0xCC9, 0xCC9}, {0xCCE, 0xCD4}, {0xCD7, 0xCDD}, {0xCDF, 0xCDF}, {0xCE4, 0xCE5}, {0xCF0, 0xCF0}, {0xCF3, 0xCFF}, {0xD0D, 0xD0D}, {0xD11, 0xD11}, {0xD45, 0xD45}, {0xD49, 0xD49},
{0xD50, 0xD53}, {0xD64, 0xD65}, {0xD80, 0xD80}, {0xD84, 0xD84}, {0xD97, 0xD99}, {0xDB2, 0xDB2}, {0xDBC, 0xDBC}, {0xDBE, 0xDBF}, {0xDC7, 0xDC9}, {0xDCB, 0xDCE}, {0xDD5, 0xDD5}, {0xDD7, 0xDD7},
{0xDE0, 0xDE5}, {0xDF0, 0xDF1}, {0xDF5, 0xE00}, {0xE3B, 0xE3E}, {0xE5C, 0xE80}, {0xE83, 0xE83}, {0xE85, 0xE85}, {0xE8B, 0xE8B}, {0xEA4, 0xEA4}, {0xEA6, 0xEA6}, {0xEBE, 0xEBF}, {0xEC5, 0xEC5},
{0xEC7, 0xEC7}, {0xECE, 0xECF}, {0xEDA, 0xEDB}, {0xEE0, 0xEFF}, {0xF48, 0xF48}, {0xF6D, 0xF70}, {0xF98, 0xF98}, {0xFBD, 0xFBD}, {0xFCD, 0xFCD}, {0xFDB, 0xFFF}, {0x10C6, 0x10C6}, {0x10C8, 0x10CC},
{0x10CE, 0x10CF}, {0x1249, 0x1249}, {0x124E, 0x124F}, {0x1257, 0x1257}, {0x1259, 0x1259}, {0x125E, 0x125F}, {0x1289, 0x1289}, {0x128E, 0x128F}, {0x12B1, 0x12B1}, {0x12B6, 0x12B7}, {0x12BF, 0x12BF},
{0x12C1, 0x12C1}, {0x12C6, 0x12C7}, {0x12D7, 0x12D7}, {0x1311, 0x1311}, {0x1316, 0x1317}, {0x135B, 0x135C}, {0x137D, 0x137F}, {0x139A, 0x139F}, {0x13F6, 0x13F7}, {0x13FE, 0x13FF}, {0x169D, 0x169F},
{0x16F9, 0x16FF}, {0x170D, 0x170D}, {0x1715, 0x171F}, {0x1737, 0x173F}, {0x1754, 0x175F}, {0x176D, 0x176D}, {0x1771, 0x1771}, {0x1774, 0x177F}, {0x17DE, 0x17DF}, {0x17EA, 0x17EF}, {0x17FA, 0x17FF},
{0x180E, 0x180F}, {0x181A, 0x181F}, {0x1879, 0x187F}, {0x18AB, 0x18AF}, {0x18F6, 0x18FF}, {0x191F, 0x191F}, {0x192C, 0x192F}, {0x193C, 0x193F}, {0x1941, 0x1943}, {0x196E, 0x196F}, {0x1975, 0x197F},
{0x19AC, 0x19AF}, {0x19CA, 0x19CF}, {0x19DB, 0x19DD}, {0x1A1C, 0x1A1D}, {0x1A5F, 0x1A5F}, {0x1A7D, 0x1A7E}, {0x1A8A, 0x1A8F}, {0x1A9A, 0x1A9F}, {0x1AAE, 0x1AAF}, {0x1AC1, 0x1AFF}, {0x1B4C, 0x1B4F},
{0x1B7D, 0x1B7F}, {0x1BF4, 0x1BFB}, {0x1C38, 0x1C3A}, {0x1C4A, 0x1C4C}, {0x1C89, 0x1C8F}, {0x1CBB, 0x1CBC}, {0x1CC8, 0x1CCF}, {0x1CFB, 0x1CFF}, {0x1DFA, 0x1DFA}, {0x1F16, 0x1F17}, {0x1F1E, 0x1F1F},
{0x1F46, 0x1F47}, {0x1F4E, 0x1F4F}, {0x1F58, 0x1F58}, {0x1F5A, 0x1F5A}, {0x1F5C, 0x1F5C}, {0x1F5E, 0x1F5E}, {0x1F7E, 0x1F7F}, {0x1FB5, 0x1FB5}, {0x1FC5, 0x1FC5}, {0x1FD4, 0x1FD5}, {0x1FDC, 0x1FDC},
{0x1FF0, 0x1FF1}, {0x1FF5, 0x1FF5}, {0x1FFF, 0x1FFF}, {0x200B, 0x200F}, {0x202A, 0x202E}, {0x2060, 0x206F}, {0x2072, 0x2073}, {0x208F, 0x208F}, {0x209D, 0x209F}, {0x20C0, 0x20CF}, {0x20F1, 0x20FF},
{0x218C, 0x218F}, {0x2427, 0x243F}, {0x244B, 0x245F}, {0x2B74, 0x2B75}, {0x2B96, 0x2B96}, {0x2C2F, 0x2C2F}, {0x2C5F, 0x2C5F}, {0x2CF4, 0x2CF8}, {0x2D26, 0x2D26}, {0x2D28, 0x2D2C}, {0x2D2E, 0x2D2F},
{0x2D68, 0x2D6E}, {0x2D71, 0x2D7E}, {0x2D97, 0x2D9F}, {0x2DA7, 0x2DA7}, {0x2DAF, 0x2DAF}, {0x2DB7, 0x2DB7}, {0x2DBF, 0x2DBF}, {0x2DC7, 0x2DC7}, {0x2DCF, 0x2DCF}, {0x2DD7, 0x2DD7}, {0x2DDF, 0x2DDF},
{0x2E53, 0x2E7F}, {0x2E9A, 0x2E9A}, {0x2EF4, 0x2EFF}, {0x2FD6, 0x2FEF}, {0x2FFC, 0x2FFF}, {0x3040, 0x3040}, {0x3097, 0x3098}, {0x3100, 0x3104}, {0x3130, 0x3130}, {0x318F, 0x318F}, {0x31E4, 0x31EF},
{0x321F, 0x321F}, {0x9FFD, 0x9FFF}, {0xA48D, 0xA48F}, {0xA4C7, 0xA4CF}, {0xA62C, 0xA63F}, {0xA6F8, 0xA6FF}, {0xA7C0, 0xA7C1}, {0xA7CB, 0xA7F4}, {0xA82D, 0xA82F}, {0xA83A, 0xA83F}, {0xA878, 0xA87F},
{0xA8C6, 0xA8CD}, {0xA8DA, 0xA8DF}, {0xA954, 0xA95E}, {0xA97D, 0xA97F}, {0xA9CE, 0xA9CE}, {0xA9DA, 0xA9DD}, {0xA9FF, 0xA9FF}, {0xAA37, 0xAA3F}, {0xAA4E, 0xAA4F}, {0xAA5A, 0xAA5B}, {0xAAC3, 0xAADA},
{0xAAF7, 0xAB00}, {0xAB07, 0xAB08}, {0xAB0F, 0xAB10}, {0xAB17, 0xAB1F}, {0xAB27, 0xAB27}, {0xAB2F, 0xAB2F}, {0xAB6C, 0xAB6F}, {0xABEE, 0xABEF}, {0xABFA, 0xABFF}, {0xD7A4, 0xD7AF}, {0xD7C7, 0xD7CA},
{0xD7FC, 0xF8FF}, {0xFA6E, 0xFA6F}, {0xFADA, 0xFAFF}, {0xFB07, 0xFB12}, {0xFB18, 0xFB1C}, {0xFB37, 0xFB37}, {0xFB3D, 0xFB3D}, {0xFB3F, 0xFB3F}, {0xFB42, 0xFB42}, {0xFB45, 0xFB45}, {0xFBC2, 0xFBD2},
{0xFD40, 0xFD4F}, {0xFD90, 0xFD91}, {0xFDC8, 0xFDEF}, {0xFDFE, 0xFDFF}, {0xFE1A, 0xFE1F}, {0xFE53, 0xFE53}, {0xFE67, 0xFE67}, {0xFE6C, 0xFE6F}, {0xFE75, 0xFE75}, {0xFEFD, 0xFF00}, {0xFFBF, 0xFFC1},
{0xFFC8, 0xFFC9}, {0xFFD0, 0xFFD1}, {0xFFD8, 0xFFD9}, {0xFFDD, 0xFFDF}, {0xFFE7, 0xFFE7}, {0xFFEF, 0xFFFB}, {0xFFFE, 0xFFFF}, {0x1000C, 0x1000C}, {0x10027, 0x10027}, {0x1003B, 0x1003B},
{0x1003E, 0x1003E}, {0x1004E, 0x1004F}, {0x1005E, 0x1007F}, {0x100FB, 0x100FF}, {0x10103, 0x10106}, {0x10134, 0x10136}, {0x1018F, 0x1018F}, {0x1019D, 0x1019F}, {0x101A1, 0x101CF}, {0x101FE, 0x1027F},
{0x1029D, 0x1029F}, {0x102D1, 0x102DF}, {0x102FC, 0x102FF}, {0x10324, 0x1032C}, {0x1034B, 0x1034F}, {0x1037B, 0x1037F}, {0x1039E, 0x1039E}, {0x103C4, 0x103C7}, {0x103D6, 0x103FF}, {0x1049E, 0x1049F},
{0x104AA, 0x104AF}, {0x104D4, 0x104D7}, {0x104FC, 0x104FF}, {0x10528, 0x1052F}, {0x10564, 0x1056E}, {0x10570, 0x105FF}, {0x10737, 0x1073F}, {0x10756, 0x1075F}, {0x10768, 0x107FF}, {0x10806, 0x10807},
{0x10809, 0x10809}, {0x10836, 0x10836}, {0x10839, 0x1083B}, {0x1083D, 0x1083E}, {0x10856, 0x10856}, {0x1089F, 0x108A6}, {0x108B0, 0x108DF}, {0x108F3, 0x108F3}, {0x108F6, 0x108FA}, {0x1091C, 0x1091E},
{0x1093A, 0x1093E}, {0x10940, 0x1097F}, {0x109B8, 0x109BB}, {0x109D0, 0x109D1}, {0x10A04, 0x10A04}, {0x10A07, 0x10A0B}, {0x10A14, 0x10A14}, {0x10A18, 0x10A18}, {0x10A36, 0x10A37}, {0x10A3B, 0x10A3E},
{0x10A49, 0x10A4F}, {0x10A59, 0x10A5F}, {0x10AA0, 0x10ABF}, {0x10AE7, 0x10AEA}, {0x10AF7, 0x10AFF}, {0x10B36, 0x10B38}, {0x10B56, 0x10B57}, {0x10B73, 0x10B77}, {0x10B92, 0x10B98}, {0x10B9D, 0x10BA8},
{0x10BB0, 0x10BFF}, {0x10C49, 0x10C7F}, {0x10CB3, 0x10CBF}, {0x10CF3, 0x10CF9}, {0x10D28, 0x10D2F}, {0x10D3A, 0x10E5F}, {0x10E7F, 0x10E7F}, {0x10EAA, 0x10EAA}, {0x10EAE, 0x10EAF}, {0x10EB2, 0x10EFF},
{0x10F28, 0x10F2F}, {0x10F5A, 0x10FAF}, {0x10FCC, 0x10FDF}, {0x10FF7, 0x10FFF}, {0x1104E, 0x11051}, {0x11070, 0x1107E}, {0x110BD, 0x110BD}, {0x110C2, 0x110CF}, {0x110E9, 0x110EF}, {0x110FA, 0x110FF},
{0x11135, 0x11135}, {0x11148, 0x1114F}, {0x11177, 0x1117F}, {0x111E0, 0x111E0}, {0x111F5, 0x111FF}, {0x11212, 0x11212}, {0x1123F, 0x1127F}, {0x11287, 0x11287}, {0x11289, 0x11289}, {0x1128E, 0x1128E},
{0x1129E, 0x1129E}, {0x112AA, 0x112AF}, {0x112EB, 0x112EF}, {0x112FA, 0x112FF}, {0x11304, 0x11304}, {0x1130D, 0x1130E}, {0x11311, 0x11312}, {0x11329, 0x11329}, {0x11331, 0x11331}, {0x11334, 0x11334},
{0x1133A, 0x1133A}, {0x11345, 0x11346}, {0x11349, 0x1134A}, {0x1134E, 0x1134F}, {0x11351, 0x11356}, {0x11358, 0x1135C}, {0x11364, 0x11365}, {0x1136D, 0x1136F}, {0x11375, 0x113FF}, {0x1145C, 0x1145C},
{0x11462, 0x1147F}, {0x114C8, 0x114CF}, {0x114DA, 0x1157F}, {0x115B6, 0x115B7}, {0x115DE, 0x115FF}, {0x11645, 0x1164F}, {0x1165A, 0x1165F}, {0x1166D, 0x1167F}, {0x116B9, 0x116BF}, {0x116CA, 0x116FF},
{0x1171B, 0x1171C}, {0x1172C, 0x1172F}, {0x11740, 0x117FF}, {0x1183C, 0x1189F}, {0x118F3, 0x118FE}, {0x11907, 0x11908}, {0x1190A, 0x1190B}, {0x11914, 0x11914}, {0x11917, 0x11917}, {0x11936, 0x11936},
{0x11939, 0x1193A}, {0x11947, 0x1194F}, {0x1195A, 0x1199F}, {0x119A8, 0x119A9}, {0x119D8, 0x119D9}, {0x119E5, 0x119FF}, {0x11A48, 0x11A4F}, {0x11AA3, 0x11ABF}, {0x11AF9, 0x11BFF}, {0x11C09, 0x11C09},
{0x11C37, 0x11C37}, {0x11C46, 0x11C4F}, {0x11C6D, 0x11C6F}, {0x11C90, 0x11C91}, {0x11CA8, 0x11CA8}, {0x11CB7, 0x11CFF}, {0x11D07, 0x11D07}, {0x11D0A, 0x11D0A}, {0x11D37, 0x11D39}, {0x11D3B, 0x11D3B},
{0x11D3E, 0x11D3E}, {0x11D48, 0x11D4F}, {0x11D5A, 0x11D5F}, {0x11D66, 0x11D66}, {0x11D69, 0x11D69}, {0x11D8F, 0x11D8F}, {0x11D92, 0x11D92}, {0x11D99, 0x11D9F}, {0x11DAA, 0x11EDF}, {0x11EF9, 0x11FAF},
{0x11FB1, 0x11FBF}, {0x11FF2, 0x11FFE}, {0x1239A, 0x123FF}, {0x1246F, 0x1246F}, {0x12475, 0x1247F}, {0x12544, 0x12FFF}, {0x1342F, 0x143FF}, {0x14647, 0x167FF}, {0x16A39, 0x16A3F}, {0x16A5F, 0x16A5F},
{0x16A6A, 0x16A6D}, {0x16A70, 0x16ACF}, {0x16AEE, 0x16AEF}, {0x16AF6, 0x16AFF}, {0x16B46, 0x16B4F}, {0x16B5A, 0x16B5A}, {0x16B62, 0x16B62}, {0x16B78, 0x16B7C}, {0x16B90, 0x16E3F}, {0x16E9B, 0x16EFF},
{0x16F4B, 0x16F4E}, {0x16F88, 0x16F8E}, {0x16FA0, 0x16FDF}, {0x16FE5, 0x16FEF}, {0x16FF2, 0x16FFF}, {0x187F8, 0x187FF}, {0x18CD6, 0x18CFF}, {0x18D09, 0x1AFFF}, {0x1B11F, 0x1B14F}, {0x1B153, 0x1B163},
{0x1B168, 0x1B16F}, {0x1B2FC, 0x1BBFF}, {0x1BC6B, 0x1BC6F}, {0x1BC7D, 0x1BC7F}, {0x1BC89, 0x1BC8F}, {0x1BC9A, 0x1BC9B}, {0x1BCA0, 0x1CFFF}, {0x1D0F6, 0x1D0FF}, {0x1D127, 0x1D128}, {0x1D173, 0x1D17A},
{0x1D1E9, 0x1D1FF}, {0x1D246, 0x1D2DF}, {0x1D2F4, 0x1D2FF}, {0x1D357, 0x1D35F}, {0x1D379, 0x1D3FF}, {0x1D455, 0x1D455}, {0x1D49D, 0x1D49D}, {0x1D4A0, 0x1D4A1}, {0x1D4A3, 0x1D4A4}, {0x1D4A7, 0x1D4A8},
{0x1D4AD, 0x1D4AD}, {0x1D4BA, 0x1D4BA}, {0x1D4BC, 0x1D4BC}, {0x1D4C4, 0x1D4C4}, {0x1D506, 0x1D506}, {0x1D50B, 0x1D50C}, {0x1D515, 0x1D515}, {0x1D51D, 0x1D51D}, {0x1D53A, 0x1D53A}, {0x1D53F, 0x1D53F},
{0x1D545, 0x1D545}, {0x1D547, 0x1D549}, {0x1D551, 0x1D551}, {0x1D6A6, 0x1D6A7}, {0x1D7CC, 0x1D7CD}, {0x1DA8C, 0x1DA9A}, {0x1DAA0, 0x1DAA0}, {0x1DAB0, 0x1DFFF}, {0x1E007, 0x1E007}, {0x1E019, 0x1E01A},
{0x1E022, 0x1E022}, {0x1E025, 0x1E025}, {0x1E02B, 0x1E0FF}, {0x1E12D, 0x1E12F}, {0x1E13E, 0x1E13F}, {0x1E14A, 0x1E14D}, {0x1E150, 0x1E2BF}, {0x1E2FA, 0x1E2FE}, {0x1E300, 0x1E7FF}, {0x1E8C5, 0x1E8C6},
{0x1E8D7, 0x1E8FF}, {0x1E94C, 0x1E94F}, {0x1E95A, 0x1E95D}, {0x1E960, 0x1EC70}, {0x1ECB5, 0x1ED00}, {0x1ED3E, 0x1EDFF}, {0x1EE04, 0x1EE04}, {0x1EE20, 0x1EE20}, {0x1EE23, 0x1EE23}, {0x1EE25, 0x1EE26},
{0x1EE28, 0x1EE28}, {0x1EE33, 0x1EE33}, {0x1EE38, 0x1EE38}, {0x1EE3A, 0x1EE3A}, {0x1EE3C, 0x1EE41}, {0x1EE43, 0x1EE46}, {0x1EE48, 0x1EE48}, {0x1EE4A, 0x1EE4A}, {0x1EE4C, 0x1EE4C}, {0x1EE50, 0x1EE50},
{0x1EE53, 0x1EE53}, {0x1EE55, 0x1EE56}, {0x1EE58, 0x1EE58}, {0x1EE5A, 0x1EE5A}, {0x1EE5C, 0x1EE5C}, {0x1EE5E, 0x1EE5E}, {0x1EE60, 0x1EE60}, {0x1EE63, 0x1EE63}, {0x1EE65, 0x1EE66}, {0x1EE6B, 0x1EE6B},
{0x1EE73, 0x1EE73}, {0x1EE78, 0x1EE78}, {0x1EE7D, 0x1EE7D}, {0x1EE7F, 0x1EE7F}, {0x1EE8A, 0x1EE8A}, {0x1EE9C, 0x1EEA0}, {0x1EEA4, 0x1EEA4}, {0x1EEAA, 0x1EEAA}, {0x1EEBC, 0x1EEEF}, {0x1EEF2, 0x1EFFF},
{0x1F02C, 0x1F02F}, {0x1F094, 0x1F09F}, {0x1F0AF, 0x1F0B0}, {0x1F0C0, 0x1F0C0}, {0x1F0D0, 0x1F0D0}, {0x1F0F6, 0x1F0FF}, {0x1F1AE, 0x1F1E5}, {0x1F203, 0x1F20F}, {0x1F23C, 0x1F23F}, {0x1F249, 0x1F24F},
{0x1F252, 0x1F25F}, {0x1F266, 0x1F2FF}, {0x1F6D8, 0x1F6DF}, {0x1F6ED, 0x1F6EF}, {0x1F6FD, 0x1F6FF}, {0x1F774, 0x1F77F}, {0x1F7D9, 0x1F7DF}, {0x1F7EC, 0x1F7FF}, {0x1F80C, 0x1F80F}, {0x1F848, 0x1F84F},
{0x1F85A, 0x1F85F}, {0x1F888, 0x1F88F}, {0x1F8AE, 0x1F8AF}, {0x1F8B2, 0x1F8FF}, {0x1F979, 0x1F979}, {0x1F9CC, 0x1F9CC}, {0x1FA54, 0x1FA5F}, {0x1FA6E, 0x1FA6F}, {0x1FA75, 0x1FA77}, {0x1FA7B, 0x1FA7F},
{0x1FA87, 0x1FA8F}, {0x1FAA9, 0x1FAAF}, {0x1FAB7, 0x1FABF}, {0x1FAC3, 0x1FACF}, {0x1FAD7, 0x1FAFF}, {0x1FB93, 0x1FB93}, {0x1FBCB, 0x1FBEF}, {0x1FBFA, 0x1FFFF}, {0x2A6DE, 0x2A6FF}, {0x2B735, 0x2B73F},
{0x2B81E, 0x2B81F}, {0x2CEA2, 0x2CEAF}, {0x2EBE1, 0x2F7FF}, {0x2FA1E, 0x2FFFF}, {0x3134B, 0xE00FF}, {0xE01F0, 0x10FFFF},
};
static std::string codepoint_to_utf8(uint32_t cp) {
std::string result;
if (/* 0x00 <= cp && */ cp <= 0x7f) {
result.push_back(cp);
}
else if (0x80 <= cp && cp <= 0x7ff) {
result.push_back(0xc0 | ((cp >> 6) & 0x1f));
result.push_back(0x80 | (cp & 0x3f));
}
else if (0x800 <= cp && cp <= 0xffff) {
result.push_back(0xe0 | ((cp >> 12) & 0x0f));
result.push_back(0x80 | ((cp >> 6) & 0x3f));
result.push_back(0x80 | (cp & 0x3f));
}
else if (0x10000 <= cp && cp <= 0x10ffff) {
result.push_back(0xf0 | ((cp >> 18) & 0x07));
result.push_back(0x80 | ((cp >> 12) & 0x3f));
result.push_back(0x80 | ((cp >> 6) & 0x3f));
result.push_back(0x80 | (cp & 0x3f));
}
else {
throw std::invalid_argument("invalid codepoint");
}
return result;
}
static std::string codepoints_to_utf8(const std::vector<uint32_t> & cps) {
std::string result;
for (size_t i = 0; i < cps.size(); ++i) {
result.append(codepoint_to_utf8(cps[i]));
}
return result;
}
static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) {
assert(offset < utf8.size());
if (!(utf8[offset + 0] & 0x80)) {
auto result = utf8[offset + 0];
offset += 1;
return result;
}
else if (!(utf8[offset + 0] & 0x40)) {
throw std::invalid_argument("invalid character");
}
else if (!(utf8[offset + 0] & 0x20)) {
if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80))
throw std::invalid_argument("invalid character");
auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f);
offset += 2;
return result;
}
else if (!(utf8[offset + 0] & 0x10)) {
if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80))
throw std::invalid_argument("invalid character");
auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f);
offset += 3;
return result;
}
else if (!(utf8[offset + 0] & 0x08)) {
if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80))
throw std::invalid_argument("invalid character");
auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f);
offset += 4;
return result;
}
throw std::invalid_argument("invalid string");
}
static std::vector<uint32_t> codepoints_from_utf8(const std::string & utf8) {
std::vector<uint32_t> result;
size_t offset = 0;
while (offset < utf8.size()) {
result.push_back(codepoint_from_utf8(utf8, offset));
}
return result;
}
static std::vector<uint16_t> codepoint_to_utf16(uint32_t cp) {
std::vector<uint16_t> result;
if (/* 0x0000 <= cp && */ cp <= 0xffff) {
result.emplace_back(cp);
}
else if (0x10000 <= cp && cp <= 0x10ffff) {
result.emplace_back(0xd800 | ((cp - 0x10000) >> 10));
result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff));
}
else {
throw std::invalid_argument("invalid codepoint");
}
return result;
}
static std::vector<uint16_t> codepoints_to_utf16(const std::vector<uint32_t> & cps) {
std::vector<uint16_t> result;
for (size_t i = 0; i < cps.size(); ++i) {
auto temp = codepoint_to_utf16(cps[i]);
result.insert(result.end(), temp.begin(), temp.end());
}
return result;
}
static uint32_t codepoint_from_utf16(const std::vector<uint16_t> & utf16, size_t & offset) {
assert(offset < utf16.size());
if (((utf16[0] >> 10) << 10) != 0xd800) {
auto result = utf16[offset + 0];
offset += 1;
return result;
}
else {
if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00))
throw std::invalid_argument("invalid character");
auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
offset += 2;
return result;
}
throw std::invalid_argument("invalid string");
}
static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) {
std::vector<uint32_t> result;
size_t offset = 0;
while (offset < utf16.size())
result.push_back(codepoint_from_utf16(utf16, offset));
return result;
}
#define CODEPOINT_TYPE_UNIDENTIFIED 0
#define CODEPOINT_TYPE_DIGIT 1
#define CODEPOINT_TYPE_LETTER 2
#define CODEPOINT_TYPE_WHITESPACE 3
#define CODEPOINT_TYPE_ACCENT_MARK 4
#define CODEPOINT_TYPE_PUNCTUATION 5
#define CODEPOINT_TYPE_SYMBOL 6
#define CODEPOINT_TYPE_CONTROL 7
static std::unordered_map<uint32_t, int> codepoint_type_map() {
std::unordered_map<uint32_t, int> codepoint_types;
for (auto p : digit_ranges) {
for(auto i = p.first; i <= p.second; ++ i)
codepoint_types[i] = CODEPOINT_TYPE_DIGIT;
}
for(auto p : letter_ranges) {
for(auto i = p.first; i <= p.second; ++ i)
codepoint_types[i] = CODEPOINT_TYPE_LETTER;
}
for(auto p : whitespace_ranges) {
for(auto i = p.first; i <= p.second; ++ i)
codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE;
}
for(auto p : accent_mark_ranges) {
for(auto i = p.first; i <= p.second; ++ i)
codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
}
for(auto p : punctuation_ranges) {
for(auto i = p.first; i <= p.second; ++ i)
codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION;
}
for (auto p : symbol_ranges) {
for (auto i = p.first; i <= p.second; ++i)
codepoint_types[i] = CODEPOINT_TYPE_SYMBOL;
}
for(auto p : control_ranges) {
for(auto i = p.first; i <= p.second; ++ i)
codepoint_types[i] = CODEPOINT_TYPE_CONTROL;
}
return codepoint_types;
}
static int codepoint_type(uint32_t cp) {
static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
return codepoint_types[cp];
}
static int codepoint_type(const std::string & utf8) {
if (utf8.length() == 0)
return CODEPOINT_TYPE_UNIDENTIFIED;
size_t offset = 0;
return codepoint_type(codepoint_from_utf8(utf8, offset));
}
static std::unordered_map<uint8_t, std::string> bytes_to_unicode_map_bpe() {
std::unordered_map<uint8_t, std::string> map;
for (int ch = u'!'; ch <= u'~'; ++ch) {
assert(0 <= ch && ch < 256);
map[ch] = codepoint_to_utf8(ch);
}
for (int ch = u'¡'; ch <= u'¬'; ++ch) {
assert(0 <= ch && ch < 256);
map[ch] = codepoint_to_utf8(ch);
}
for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
assert(0 <= ch && ch < 256);
map[ch] = codepoint_to_utf8(ch);
}
auto n = 0;
for (int ch = 0; ch < 256; ++ch) {
if (map.find(ch) == map.end()) {
map[ch] = codepoint_to_utf8(256 + n);
++n;
}
}
return map;
}
static std::string bytes_to_unicode_bpe(uint8_t byte) {
static std::unordered_map<uint8_t, std::string> map = bytes_to_unicode_map_bpe();
return map.at(byte);
}
static std::unordered_map<std::string, uint8_t> unicode_to_bytes_map_bpe() {
std::unordered_map<std::string, uint8_t> map;
for (int ch = u'!'; ch <= u'~'; ++ch) {
assert(0 <= ch && ch < 256);
map[codepoint_to_utf8(ch)] = ch;
}
for (int ch = u'¡'; ch <= u'¬'; ++ch) {
assert(0 <= ch && ch < 256);
map[codepoint_to_utf8(ch)] = ch;
}
for (int ch = u'®'; ch <= u'ÿ'; ++ch) {
assert(0 <= ch && ch < 256);
map[codepoint_to_utf8(ch)] = ch;
}
auto n = 0;
for (int ch = 0; ch < 256; ++ch) {
if (map.find(codepoint_to_utf8(ch)) == map.end()) {
map[codepoint_to_utf8(256 + n)] = ch;
++n;
}
}
return map;
}
static uint8_t unicode_to_bytes_bpe(const std::string & utf8) {
static std::unordered_map<std::string, uint8_t> map = unicode_to_bytes_map_bpe();
return map.at(utf8);
}
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
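To illustrate the new interface above, here is a small sketch (not part of the diff) of how the packed `codepoint_flags` bitfield and the `unicode_cpt_flags()` lookups can be used; the `main()` harness and the assumption that the header is included as `unicode.h` are illustrative.

```cpp
// Sketch: decoding codepoint categories via the new flags API (assumes "unicode.h").
#include <cstdio>
#include <cstdint>

#include "unicode.h"

int main() {
    // construct directly from a packed uint16_t category value
    const codepoint_flags letter(codepoint_flags::LETTER);
    printf("is_letter=%d is_number=%d\n", (int) letter.is_letter, (int) letter.is_number);

    // table-backed lookups for individual codepoints
    printf("'A': letter=%d\n", (int) unicode_cpt_flags((uint32_t) 'A').is_letter);
    printf("'7': number=%d\n", (int) unicode_cpt_flags((uint32_t) '7').is_number);
    printf("' ': whitespace=%d\n", (int) unicode_cpt_flags((uint32_t) ' ').is_whitespace);

    // category_flag() masks out the helper bits (whitespace/lowercase/uppercase/nfd)
    printf("category=0x%04x\n", (unsigned) unicode_cpt_flags((uint32_t) 'A').category_flag());

    return 0;
}
```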


@@ -29,18 +29,6 @@ std::string g_status_forced = "";
std::vector<float> g_pcmf32;
std::string to_timestamp(int64_t t) {
int64_t sec = t/100;
int64_t msec = t - sec*100;
int64_t min = sec/60;
sec = sec - min*60;
char buf[32];
snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int) min, (int) sec, (int) msec);
return std::string(buf);
}
void talk_set_status(const std::string & status) {
std::lock_guard<std::mutex> lock(g_mutex);
g_status = status;


@@ -1 +1,2 @@
audio.mp3
to_speak.txt


@@ -11,9 +11,13 @@ Web version: [examples/talk.wasm](/examples/talk.wasm)
The `talk` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
```bash
# Install SDL2 on Linux
# Install SDL2
# On Debian based linux distributions:
sudo apt-get install libsdl2-dev
# On Fedora Linux:
sudo dnf install SDL2 SDL2-devel
# Install SDL2 on Mac OS
brew install sdl2


@@ -1,20 +1,80 @@
import sys
import importlib.util
import argparse
import textwrap
if importlib.util.find_spec("elevenlabs") is None:
print("elevenlabs library is not installed, you can install it to your environment using 'pip install elevenlabs'")
parser = argparse.ArgumentParser(add_help=False,
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("-q", "--quick", action="store_true",
help="skip checking the required library")
modes = parser.add_argument_group("action")
modes.add_argument("inputfile", metavar="TEXTFILE",
nargs='?', type=argparse.FileType(), default=sys.stdin,
help="read the text file (default: stdin)")
modes.add_argument("-l", "--list", action="store_true",
help="show the list of voices and exit")
modes.add_argument("-h", "--help", action="help",
help="show this help and exit")
selopts = parser.add_argument_group("voice selection")
selmodes = selopts.add_mutually_exclusive_group()
selmodes.add_argument("-n", "--name",
default="Arnold",
help="get a voice object by name (default: Arnold)")
selmodes.add_argument("-v", "--voice", type=int, metavar="NUMBER",
help="get a voice object by number (see --list)")
selopts.add_argument("-f", "--filter", action="append", metavar="KEY=VAL",
default=["use case=narration"],
help=textwrap.dedent('''\
filter voices by labels (default: "use case=narration")
this option can be used multiple times
filtering will be disabled if the first -f has no "=" (e.g. -f "any")
'''))
outmodes = parser.add_argument_group("output")
outgroup = outmodes.add_mutually_exclusive_group()
outgroup.add_argument("-s", "--save", metavar="FILE",
default="audio.mp3",
help="save the TTS to a file (default: audio.mp3)")
outgroup.add_argument("-p", "--play", action="store_true",
help="play the TTS with ffplay")
args = parser.parse_args()
if not args.quick:
import importlib.util
if importlib.util.find_spec("elevenlabs") is None:
print("elevenlabs library is not installed, you can install it to your enviroment using 'pip install elevenlabs'")
sys.exit()
from elevenlabs import voices, generate, play, save
if args.filter and "=" in args.filter[0]:
voicelist = voices()
for f in args.filter:
label, value = f.split("=")
voicelist = filter(lambda x: x.labels.get(label) == value, voicelist)
voicelist = list(voicelist)
else:
voicelist = list(voices())
if args.list:
for i, v in enumerate(voicelist):
print(str(i) + ": " + v.name + " " + str(v.labels))
sys.exit() sys.exit()
from elevenlabs import generate, play, save if args.voice:
voice = voicelist[args.voice % len(voicelist)]
else:
voice = args.name
# if -n should consult -f, use the following
#voice = next(x for x in voicelist if x.name == args.name)
# Get a Voice object, by name or UUID
voice = "Arnold" #Possible Voices: Adam Antoni Arnold Bella Domi Elli Josh
# Generate the TTS
audio = generate( audio = generate(
text=str(sys.argv[2:]), text=str(args.inputfile.read()),
voice=voice voice=voice
) )
if args.play:
# Save the TTS to a file play(audio)
save(audio, "audio.mp3") else:
save(audio, args.save)


@@ -72,7 +72,7 @@ struct gpt2_model {
 };
 
 // load the model's weights from a file
-bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
+static bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
     printf("%s: loading model from '%s'\n", __func__, fname.c_str());
 
     auto fin = std::ifstream(fname, std::ios::binary);
@@ -380,7 +380,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
 // - embd_w: the predicted logits for the next token
 //
 // TODO: sync latest version from ggml repo
-bool gpt2_eval(
+static bool gpt2_eval(
         const gpt2_model & model,
         const int n_threads,
         const int n_past,


@@ -1,24 +1,40 @@
 #!/bin/bash
 
 # Usage:
-#  speak.sh <voice_id> <text-to-speak>
+#  speak <voice_id> <textfile>
 
-# espeak
-#  Mac OS: brew install espeak
-#  Linux: apt-get install espeak
-#
-#espeak -v en-us+m$1 -s 175 -p 50 -a 200 -g 5 -k 5 "$2"
+function installed() { command -v $1 >/dev/null 2>&1; }
 
-# Mac OS "say" command
-say "$2"
+if installed espeak; then
+  espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 -f $2
+
+elif installed piper && installed aplay; then
+  cat $2 | piper --model ~/en_US-lessac-medium.onnx --output-raw | aplay -q -r 22050 -f S16_LE -t raw -
+
+# for Mac
+elif installed say; then
+  say -f $2
 
 # Eleven Labs
-# To use it, install the elevenlabs module from pip (pip install elevenlabs)
-# It's possible to use the API for free with limited number of characters. To increase this limit register to https://beta.elevenlabs.io to get an api key and paste it after 'ELEVEN_API_KEY='
-#Keep the line commented to use the free version without api key
-#
-#export ELEVEN_API_KEY=your_api_key
-#wd=$(dirname $0)
-#script=$wd/eleven-labs.py
-#python3 $script $1 "$2"
-#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3
+elif installed python3 && \
+     python3 -c 'import importlib.util; exit(not importlib.util.find_spec("elevenlabs"))' && \
+     installed ffplay; then
+  # It's possible to use the API for free with limited number of characters.
+  # To increase this limit register to https://beta.elevenlabs.io to get an api key
+  # and paste it after 'ELEVEN_API_KEY='
+  # Keep the line commented to use the free version without api key
+  #export ELEVEN_API_KEY=your_api_key
+  wd=$(dirname $0)
+  script=$wd/eleven-labs.py
+  python3 $script -q -p -v $1 $2 >/dev/null 2>&1
+
+  # Uncomment to keep the audio file
+  #python3 $script -q -s ./audio.mp3 -v $1 $2 >/dev/null 2>&1
+  #ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1
+
+else
+  echo 'Install espeak ("brew install espeak" or "apt-get install espeak"),'
+  echo 'piper ("pip install piper-tts" or https://github.com/rhasspy/piper) with aplay,'
+  echo 'or elevenlabs ("pip install elevenlabs") with ffplay.'
+  echo '(export ELEVEN_API_KEY if you have an api key from https://beta.elevenlabs.io)'
+fi

Some files were not shown because too many files have changed in this diff.