Compare commits

..

21 Commits

Author SHA1 Message Date
bff8dc248a talk-llama : sync llama.cpp
ggml-ci
2025-05-13 13:20:19 +03:00
69753804ed whisper : update to ggml-backend changes (#0)
ggml-ci
2025-05-13 13:11:24 +03:00
89970b9aaa sync : ggml
ggml-ci
2025-05-13 13:10:17 +03:00
79fb43e252 ggml : add mrope kernel for metal (llama/13457) 2025-05-13 13:10:08 +03:00
926e06dbfd metal : optimize MoE for large batches (llama/13388) 2025-05-13 13:09:20 +03:00
43a59eccf6 opencl: remove unnecessary assert for add (llama/13257) 2025-05-13 13:05:33 +03:00
fe0d52b9a2 llama/ggml: add LLM training support (llama/10544)
* llama/ggml: add LLM training support

more compact progress bar

llama_save_model_to_file

llama_opt_param_filter

ggml_graph_dup force_grads

refactor ggml_opt, fix test-opt

* remove logits_all

* refactor CUDA implementation for ACC

* reset graph at beginning of opt period
2025-05-13 13:05:33 +03:00
cb90cb0992 ggml-cpu: Integrate fp32=bf16xbf16 SME KleidiAI kernel (llama/13053)
* ggml-cpu: Integrate fp32=bf16xbf16 SME KleidiAI kernel

Signed-off-by: Dan Johansson <dan.johansson@arm.com>

* code review fixes

Signed-off-by: Dan Johansson <dan.johansson@arm.com>

* adds a comment that clarifies barrier usage

Signed-off-by: Dan Johansson <dan.johansson@arm.com>

---------

Signed-off-by: Dan Johansson <dan.johansson@arm.com>
Co-authored-by: Charles Xu <charles.xu@arm.com>
2025-05-13 13:05:33 +03:00
8264872b5d CUDA: fix misaligned synchronization in FA (llama/13469) 2025-05-13 13:05:33 +03:00
882d975729 enable dpcpp nightly builds with libraries (llama/13406) 2025-05-13 13:05:33 +03:00
c426829771 CUDA: fix crash with partial offloading of MoE (llama/13439) 2025-05-13 13:05:33 +03:00
0b1962a181 Add --no-op-offload to improve -ot pp perf in MoE models like llama4 400B (llama/13386) 2025-05-13 13:05:33 +03:00
86dece9c7c CUDA: fix race conditions FlashAttention kernels (llama/13438) 2025-05-13 13:05:32 +03:00
04445664b4 CUDA: fix FlashAttention on Turing (llama/13415) 2025-05-13 13:05:32 +03:00
22f4997dd8 vulkan: scalar flash attention implementation (llama/13324)
* vulkan: scalar flash attention implementation

* vulkan: always use fp32 for scalar flash attention

* vulkan: use vector loads in scalar flash attention shader

* vulkan: remove PV matrix, helps with register usage

* vulkan: reduce register usage in scalar FA, but perf may be slightly worse

* vulkan: load each Q value once. optimize O reduction. more tuning

* vulkan: support q4_0/q8_0 KV in scalar FA

* CI: increase timeout to accommodate newly-supported tests

* vulkan: for scalar FA, select between 1 and 8 rows

* vulkan: avoid using Float16 capability in scalar FA
2025-05-13 13:05:32 +03:00
b493e03b90 sycl : implementation of reordered Q4_0 MMVQ for Intel GPUs (llama/12858)
* sycl : Implemented reorder Q4_0 mmvq

Signed-off-by: Alberto Cabrera <alberto.cabrera@codeplay.com>

* sycl : Fixed mmvq being called when reorder is disabled

* sycl : Improved comments in the quants header

Signed-off-by: Alberto Cabrera <alberto.cabrera@codeplay.com>

* Use static_assert

* safe_div -> ceil_div

* Clarify qi comment

* change the reorder tensor from init to execute OP

* dbg

* Undo changes to test-backend-ops

* Refactor changes on top of q4_0 reorder fix

* Missing Reverts

* Refactored opt_for_reorder logic to simplify code path

* Explicit inlining and unroll

* Renamed mul_mat_algo enum for consistency

---------

Signed-off-by: Alberto Cabrera <alberto.cabrera@codeplay.com>
Co-authored-by: romain.biessy <romain.biessy@codeplay.com>
2025-05-13 13:05:32 +03:00
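
One of the review items in this commit, `safe_div -> ceil_div`, is the usual integer ceiling-division helper used when sizing work-group or block counts. A minimal C++ sketch of the idea (the helper's actual signature and placement in the SYCL backend may differ):

```cpp
#include <cstdint>

// Integer division rounding up: how many groups of size d cover n items.
constexpr uint32_t ceil_div(uint32_t n, uint32_t d) {
    return (n + d - 1) / d;
}

static_assert(ceil_div(10, 4) == 3, "10 items in groups of 4 need 3 groups");
static_assert(ceil_div(8, 4) == 2, "exact multiples are unchanged");
```
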
aef59f4851 CUDA: FA support for Deepseek (Ampere or newer) (llama/13306)
* CUDA: FA support for Deepseek (Ampere or newer)

* do loop unrolling via C++ template
2025-05-13 13:05:32 +03:00
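
The "loop unrolling via C++ template" item refers to a common CUDA/C++ technique: hoisting the trip count into a template parameter so the compiler sees a constant bound and can fully unroll the loop. A minimal sketch of the pattern, not the actual FlashAttention kernel code (`dot_unrolled` is an illustrative name):

```cpp
// The trip count N is a compile-time constant, so the compiler can emit
// N multiply-adds with no loop-control overhead.
template <int N>
float dot_unrolled(const float *a, const float *b) {
    float acc = 0.0f;
    for (int i = 0; i < N; ++i) {
        acc += a[i] * b[i];
    }
    return acc;
}

// Each instantiation fixes the unroll factor, e.g. dot_unrolled<8>(a, b).
```
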
f8c75dc43e CUDA: fix crash on large batch size for MoE models (llama/13384) 2025-05-13 13:05:32 +03:00
00c8056715 rpc : add rpc_msg_set_tensor_hash_req (llama/13353)
* rpc : add rpc_msg_set_tensor_hash_req

Use a dedicated struct for the request of RPC_CMD_SET_TENSOR_HASH which
makes the code cleaner.

* fix
2025-05-13 13:05:32 +03:00
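
A dedicated request struct gives the command a fixed, trivially serializable wire layout instead of ad-hoc buffer packing. The sketch below only illustrates that pattern; the field names and layout of the real `rpc_msg_set_tensor_hash_req` in ggml's RPC backend are assumptions and may differ:

```cpp
#include <cstdint>

// Illustrative only: one plain-old-data struct per RPC command makes the
// handler a single fixed-size read instead of manual byte-buffer parsing.
struct rpc_msg_set_tensor_hash_req_sketch {
    uint64_t tensor;   // remote tensor handle (assumed field)
    uint64_t offset;   // write offset into the tensor (assumed field)
    uint64_t hash;     // hash of the data payload (assumed field)
};

static_assert(sizeof(rpc_msg_set_tensor_hash_req_sketch) == 24,
              "a fixed wire size is easy to validate on receive");
```
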
19d8d9a928 vulkan: Allow up to 4096 elements for mul_mat_id row_ids (llama/13326)
This assert fired running Qwen_Qwen3-30B-A3B-Q2_K.gguf:

GGML_ASSERT(nei0 * nei1 <= 3072);

The tensor is 8 x 512. Increase this array size to accommodate.
2025-05-13 13:05:32 +03:00
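
The arithmetic behind the fix: nei0 * nei1 = 8 * 512 = 4096, which exceeds the old 3072-entry bound, so the capacity is raised to 4096 as in the commit title. A hedged C++ sketch of the check; apart from the quoted assert expression, the names are illustrative:

```cpp
#include <cassert>

// Capacity of the shader's row_ids array, raised from 3072.
constexpr int kRowIdsCapacity = 4096;

void check_row_ids(int nei0, int nei1) {
    // Mirrors the quoted GGML_ASSERT, with the raised bound.
    assert(nei0 * nei1 <= kRowIdsCapacity);
}

// Qwen3-30B-A3B: check_row_ids(8, 512) -> 4096 <= 4096, now passes.
```
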
0c4a229154 sycl: addressing non-contiguous src1 mul_mats (nc and batched) (llama/13343)
* sycl: fixed non-contiguous src1 mul_mats (nc and batched)

* Fixed wrong static_cast inside kernel
2025-05-13 13:05:31 +03:00
282 changed files with 25927 additions and 49783 deletions

@@ -16,7 +16,6 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 RUN apt-get update && \
 apt-get install -y build-essential libsdl2-dev wget cmake git \
-&& apt-get clean \
 && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*

 # Ref: https://stackoverflow.com/a/53464012
@@ -27,12 +26,6 @@ COPY .. .
 # Enable cuBLAS
 RUN make base.en CMAKE_ARGS="-DGGML_CUDA=1"
-RUN find /app/build -name "*.o" -delete && \
-find /app/build -name "*.a" -delete && \
-rm -rf /app/build/CMakeFiles && \
-rm -rf /app/build/cmake_install.cmake && \
-rm -rf /app/build/_deps

 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
 ENV CUDA_MAIN_VERSION=12.3
 ENV LD_LIBRARY_PATH /usr/local/cuda-${CUDA_MAIN_VERSION}/compat:$LD_LIBRARY_PATH
@@ -40,11 +33,8 @@ WORKDIR /app
 RUN apt-get update && \
 apt-get install -y curl ffmpeg wget cmake git \
-&& apt-get clean \
 && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*

 COPY --from=build /app /app
-RUN du -sh /app/*
-RUN find /app -type f -size +100M
 ENV PATH=/app/build/bin:$PATH
 ENTRYPOINT [ "bash", "-c" ]

@@ -1,28 +0,0 @@
-ARG ONEAPI_VERSION=2025.1.1-0-devel-ubuntu24.04
-FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
-WORKDIR /app
-RUN apt-get update && \
-apt-get install -y build-essential libsdl2-dev wget cmake git \
-&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
-COPY .. .
-# Enable SYCL
-ARG GGML_SYCL_F16=OFF
-RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
-echo "GGML_SYCL_F16 is set" \
-&& export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
-fi && \
-make base.en CMAKE_ARGS="-DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16}"
-FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
-WORKDIR /app
-RUN apt-get update && \
-apt-get install -y curl ffmpeg libsdl2-dev wget cmake git \
-&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
-COPY --from=build /app /app
-ENV PATH=/app/build/bin:$PATH
-ENTRYPOINT [ "bash", "-c" ]

@@ -1,39 +1,29 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc4.0.1
+ARG MUSA_VERSION=rc3.1.1
 # Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 # Target the MUSA runtime image
-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

 FROM ${BASE_MUSA_DEV_CONTAINER} AS build
 WORKDIR /app

 RUN apt-get update && \
-apt-get install -y build-essential libsdl2-dev wget cmake git && \
-apt-get clean && \
-rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* /tmp/* /var/tmp/*
+apt-get install -y build-essential libsdl2-dev wget cmake git \
+&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*

 COPY .. .
 # Enable muBLAS
 RUN make base.en CMAKE_ARGS="-DGGML_MUSA=1"
-RUN find /app/build -name "*.o" -delete && \
-find /app/build -name "*.a" -delete && \
-rm -rf /app/build/CMakeFiles && \
-rm -rf /app/build/cmake_install.cmake && \
-rm -rf /app/build/_deps

 FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
 WORKDIR /app

 RUN apt-get update && \
-apt-get install -y curl ffmpeg wget cmake git && \
-apt-get clean && \
-rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* /tmp/* /var/tmp/*
+apt-get install -y curl ffmpeg wget cmake git \
+&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*

 COPY --from=build /app /app
-RUN du -sh /app/*
-RUN find /app -type f -size +100M
 ENV PATH=/app/build/bin:$PATH
 ENTRYPOINT [ "bash", "-c" ]

@@ -101,10 +101,6 @@ jobs:
 -v ${{ github.workspace }}:/workspace \
 -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
 set -e
-export DEBIAN_FRONTEND=noninteractive
-sed -i "s|archive.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
-sed -i "s|security.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
 apt update
 apt install -y build-essential libsdl2-dev cmake git
 cmake -B build
@@ -133,14 +129,6 @@ jobs:
 -v ${{ github.workspace }}:/workspace \
 -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
 set -e
-export DEBIAN_FRONTEND=noninteractive
-sed -i "s|archive.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
-sed -i "s|security.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
-apt-get update
-apt-get install -y ca-certificates
-sed -i "s|http://ports.ubuntu.com|https://mirror.kumi.systems|g" /etc/apt/sources.list
 apt update
 apt install -y build-essential libsdl2-dev cmake git
 cmake -B build -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8-a
@@ -169,14 +157,6 @@ jobs:
 -v ${{ github.workspace }}:/workspace \
 -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
 set -e
-export DEBIAN_FRONTEND=noninteractive
-sed -i "s|archive.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
-sed -i "s|security.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
-apt-get update
-apt-get install -y ca-certificates
-sed -i "s|http://ports.ubuntu.com|https://mirror.kumi.systems|g" /etc/apt/sources.list
 apt update
 apt install -y build-essential libsdl2-dev cmake git
 cmake -B build -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv7-a+fp
@@ -262,10 +242,6 @@ jobs:
 -v ${{ github.workspace }}:/workspace \
 -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
 set -e
-export DEBIAN_FRONTEND=noninteractive
-sed -i "s|archive.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
-sed -i "s|security.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
 apt update
 apt install -y build-essential cmake libsdl2-dev git
 cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
@@ -296,14 +272,6 @@ jobs:
 -v ${{ github.workspace }}:/workspace \
 -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
 set -e
-export DEBIAN_FRONTEND=noninteractive
-sed -i "s|archive.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
-sed -i "s|security.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
-apt-get update
-apt-get install -y ca-certificates
-sed -i "s|http://ports.ubuntu.com|https://mirror.kumi.systems|g" /etc/apt/sources.list
 apt update
 apt install -y build-essential cmake libsdl2-dev git
 cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8-a
@@ -334,14 +302,6 @@ jobs:
 -v ${{ github.workspace }}:/workspace \
 -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
 set -e
-export DEBIAN_FRONTEND=noninteractive
-sed -i "s|archive.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
-sed -i "s|security.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
-apt-get update
-apt-get install -y ca-certificates
-sed -i "s|http://ports.ubuntu.com|https://mirror.kumi.systems|g" /etc/apt/sources.list
 apt update
 apt install -y build-essential cmake libsdl2-dev git
 cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv7-a+fp
@@ -375,14 +335,6 @@ jobs:
 -v ${{ github.workspace }}:/workspace \
 -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
 set -e
-export DEBIAN_FRONTEND=noninteractive
-sed -i "s|archive.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
-sed -i "s|security.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
-apt-get update
-apt-get install -y ca-certificates
-sed -i "s|http://ports.ubuntu.com|https://mirror.kumi.systems|g" /etc/apt/sources.list
 apt update
 apt install -y clang build-essential cmake libsdl2-dev git
 cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
@@ -413,10 +365,6 @@ jobs:
 -v ${{ github.workspace }}:/workspace \
 -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
 set -e
-export DEBIAN_FRONTEND=noninteractive
-sed -i "s|archive.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
-sed -i "s|security.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
 apt update
 apt install -y build-essential cmake git
 cmake . -DCMAKE_BUILD_TYPE=Debug \
@@ -681,14 +629,11 @@ jobs:
 arch: [Win32, x64]
 blas: [ON]
 sdl2: [ON]
-blasver: [0.3.29]
 include:
 - arch: Win32
 s2arc: x86
-blasfile: x86
 - arch: x64
 s2arc: x64
-blasfile: x64_64
 - sdl2: ON
 s2ver: 2.28.5
@@ -709,8 +654,7 @@ jobs:
 - name: Install OpenBLAS and pkgconfiglite
 if: matrix.blas == 'ON'
 run: |
-Invoke-WebRequest "https://github.com/OpenMathLib/OpenBLAS/releases/download/v${{matrix.blasver}}/OpenBLAS-${{matrix.blasver}}_${{matrix.blasfile}}.zip" -OutFile "OpenBLAS-${{matrix.blasver}}.zip"
-Expand-Archive "OpenBLAS-${{matrix.blasver}}.zip" -DestinationPath "OpenBLAS-${{matrix.blasver}}"
+vcpkg install --triplet=${{ matrix.s2arc }}-windows openblas
 choco install pkgconfiglite

 - name: Fetch SDL2 and set SDL2_DIR
@@ -727,8 +671,6 @@ jobs:
 -DCMAKE_BUILD_TYPE=${{ matrix.build }}
 -DGGML_BLAS=${{ matrix.blas }}
 -DGGML_BLAS_VENDOR=OpenBLAS
--DBLAS_LIBRARIES="$env:GITHUB_WORKSPACE/OpenBLAS-${{matrix.blasver}}/lib/libopenblas.lib"
--DBLAS_INCLUDE_DIRS="$env:GITHUB_WORKSPACE/OpenBLAS-${{matrix.blasver}}/include"
 -DWHISPER_SDL2=${{ matrix.sdl2 }}

 - name: Build
@@ -738,7 +680,7 @@ jobs:
 - name: Copy openblas.dll
 if: matrix.blas == 'ON'
-run: copy "$env:GITHUB_WORKSPACE/OpenBLAS-${{matrix.blasver}}/bin/libopenblas.dll" build/bin/${{ matrix.build }}
+run: copy "C:/vcpkg/packages/openblas_${{ matrix.s2arc }}-windows/bin/openblas.dll" build/bin/${{ matrix.build }}

 - name: Copy SDL2.dll
 if: matrix.sdl2 == 'ON'
@@ -761,15 +703,14 @@ jobs:
 windows-cublas:
 if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
 github.event.inputs.run_type == 'full-ci' }}
-runs-on: windows-2022
+runs-on: windows-2019
 strategy:
-fail-fast: false
 matrix:
 build: [Release]
 arch: [x64]
 cublas: [ON]
 sdl2: [ON]
-cuda-toolkit: [12.4.0, 11.8.0]
+cuda-toolkit: [12.2.0, 11.8.0]
 include:
 - arch: x64
 sdl2: ON
@@ -837,7 +778,7 @@ jobs:
 xcopy "$CUDA_TOOLKIT_DIR\visual_studio_integration-windows-x86_64-${VS_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y

 # Visual Studio integration
-xcopy "$CUDA_TOOLKIT_DIR\visual_studio_integration-windows-x86_64-${VS_VER}-archive\visual_studio_integration\MSBuildExtensions\*" "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Microsoft\VC\v170\BuildCustomizations" /E /I /H /Y
+xcopy "$CUDA_TOOLKIT_DIR\visual_studio_integration-windows-x86_64-${VS_VER}-archive\visual_studio_integration\MSBuildExtensions\*" "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Microsoft\VC\v160\BuildCustomizations" /E /I /H /Y

 # Set environment variables
 echo "$CUDA_TOOLKIT_DIR\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
@@ -845,23 +786,23 @@ jobs:
 echo "CUDA_PATH=$CUDA_TOOLKIT_DIR" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
 echo "CUDA_PATH_V11_8=$CUDA_TOOLKIT_DIR" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8

-- name: Install Cuda Toolkit 12.4.0
-if: ${{ matrix.cuda-toolkit == '12.4.0' }}
+- name: Install Cuda Toolkit 12.2.0
+if: ${{ matrix.cuda-toolkit == '12.2.0' }}
 run: |
 $CUDA_VERSION = ${{ matrix.cuda-toolkit }}
 $CUDA_TOOLKIT_DIR = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$CUDA_VERSION"
 $CUDA_DOWNLOAD = "https://developer.download.nvidia.com/compute/cuda/redist"

 # Components versions
-$CUDART_VER = "12.4.127"
-$NVCC_VER = "12.4.131"
-$NVRTC_VER = "12.4.127"
-$CUBLAS_VER = "12.4.5.8"
-$NVTX_VER = "12.4.127"
-$PROFILER_VER = "12.4.127"
-$VS_VER = "12.4.127"
-$NVPROF_VER = "12.4.128"
-$CCCL_VER = "12.4.127"
+$CUDART_VER = "12.2.140"
+$NVCC_VER = "12.2.140"
+$NVRTC_VER = "12.2.140"
+$CUBLAS_VER = "12.2.5.6"
+$NVTX_VER = "12.2.140"
+$PROFILER_VER = "12.2.140"
+$VS_VER = "12.2.140"
+$NVPROF_VER = "12.2.142"
+$CCCL_VER = "12.2.140"

 # Create the directory where the CUDA Toolkit will be installed
 mkdir -p $CUDA_TOOLKIT_DIR
@@ -895,7 +836,7 @@ jobs:
 xcopy "$CUDA_TOOLKIT_DIR\visual_studio_integration-windows-x86_64-${VS_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y

 # Visual Studio integration
-xcopy "$CUDA_TOOLKIT_DIR\visual_studio_integration-windows-x86_64-${VS_VER}-archive\visual_studio_integration\MSBuildExtensions\*" "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Microsoft\VC\v170\BuildCustomizations" /E /I /H /Y
+xcopy "$CUDA_TOOLKIT_DIR\visual_studio_integration-windows-x86_64-${VS_VER}-archive\visual_studio_integration\MSBuildExtensions\*" "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Microsoft\VC\v160\BuildCustomizations" /E /I /H /Y

 # Set environment variables
 echo "$CUDA_TOOLKIT_DIR\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
@@ -923,21 +864,14 @@ jobs:
 - name: Build Project
 shell: cmd
 run: |
-call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
 cmake --version
 where cmake
-if "${{ matrix.cuda-toolkit }}" == "11.8.0" (
-set CUDA_FLAGS=-allow-unsupported-compiler -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH -D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR
-) else (
-set CUDA_FLAGS=
-)
 cmake -S . -B build -G "Ninja Multi-Config" ^
 -DCMAKE_BUILD_TYPE=${{ matrix.build }} ^
 -DGGML_CUDA=${{ matrix.cublas }} ^
 -DWHISPER_SDL2=${{ matrix.sdl2 }} ^
--DSDL2_DIR="%SDL2_DIR%" ^
--DCMAKE_POLICY_VERSION_MINIMUM=3.5 ^
--DCMAKE_CUDA_FLAGS="%CUDA_FLAGS%"
+-DSDL2_DIR="%SDL2_DIR%"
 set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
 cmake --build build --config ${{ matrix.build }} -j %NUMBER_OF_PROCESSORS%

@@ -19,8 +19,9 @@ jobs:
 config:
 - { tag: "main", dockerfile: ".devops/main.Dockerfile", platform: "linux/amd64" }
 - { tag: "main-musa", dockerfile: ".devops/main-musa.Dockerfile", platform: "linux/amd64" }
-- { tag: "main-intel", dockerfile: ".devops/main-intel.Dockerfile", platform: "linux/amd64" }
-- { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }
+#TODO: the cuda image keeps failing - disable for now
+# https://github.com/ggerganov/whisper.cpp/actions/runs/11019444428/job/30602020339
+#- { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }

 steps:
 - name: Check out the repo

.gitignore (vendored, 3 changes)

@@ -14,7 +14,6 @@
 build/
 build-*/
-build_*/

 # SPM
 .build/
@@ -50,8 +49,6 @@ extra/bench-gg.txt
 models/*.mlmodel
 models/*.mlmodelc
 models/*.mlpackage
-models/*-encoder-openvino.xml
-models/*-encoder-openvino-cache/

 bindings/java/.gradle/
 bindings/java/.idea/
 .idea/

@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories.
 project("whisper.cpp" C CXX)
-project("whisper.cpp" VERSION 1.7.6)
+project("whisper.cpp" VERSION 1.7.5)
 include(CheckIncludeFileCXX)

 set(SOVERSION 1)
@@ -119,11 +119,6 @@ whisper_option_depr(WARNING WHISPER_SYCL GGML_SYCL)
 whisper_option_depr(WARNING WHISPER_SYCL_F16 GGML_SYCL_F16)
 whisper_option_depr(WARNING WHISPER_CCACHE GGML_CCACHE)

-if (GGML_CUDA AND NOT MSVC)
-    #GGML_CUDA enabled, add the necessary compile options -Wno-deprecated-gpu-targets
-    add_compile_options(-Wno-deprecated-gpu-targets)
-endif()

 #
 # build the library
 #
@@ -246,6 +241,5 @@ if (MSVC)
 disable_msvc_warnings(whisper-talk-llama)
 disable_msvc_warnings(whisper-bench)
 disable_msvc_warnings(quantize)
-disable_msvc_warnings(vad-speech-segments)
 endif()
 endif()

@@ -7,7 +7,7 @@
 [![Conan Center](https://shields.io/conan/v/whisper-cpp)](https://conan.io/center/whisper-cpp)
 [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)

-Stable: [v1.7.6](https://github.com/ggml-org/whisper.cpp/releases/tag/v1.7.6) / [Roadmap](https://github.com/orgs/ggml-org/projects/4/)
+Stable: [v1.7.5](https://github.com/ggml-org/whisper.cpp/releases/tag/v1.7.5) / [Roadmap](https://github.com/orgs/ggml-org/projects/4/)

 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
@@ -35,7 +35,7 @@ Supported platforms:
 - [x] [Java](bindings/java/README.md)
 - [x] Linux / [FreeBSD](https://github.com/ggml-org/whisper.cpp/issues/56#issuecomment-1350920264)
 - [x] [WebAssembly](examples/whisper.wasm)
-- [x] Windows ([MSVC](https://github.com/ggml-org/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggml-org/whisper.cpp/issues/168))
+- [x] Windows ([MSVC](https://github.com/ggml-org/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggml-org/whisper.cpp/issues/168)]
 - [x] [Raspberry Pi](https://github.com/ggml-org/whisper.cpp/discussions/166)
 - [x] [Docker](https://github.com/ggml-org/whisper.cpp/pkgs/container/whisper.cpp)
@@ -386,7 +386,7 @@ Run the inference examples as usual, for example:
 ## Moore Threads GPU support

 With Moore Threads cards the processing of the models is done efficiently on the GPU via muBLAS and custom MUSA kernels.

-First, make sure you have installed `MUSA SDK rc4.0.1`: https://developer.mthreads.com/sdk/download/musa?equipment=&os=&driverVersion=&version=4.0.1
+First, make sure you have installed `MUSA SDK rc3.1.1`: https://developer.mthreads.com/sdk/download/musa?equipment=&os=&driverVersion=&version=rc3.1.1

 Now build `whisper.cpp` with MUSA support:
@@ -709,9 +709,7 @@ For more details, see the conversion script [models/convert-pt-to-ggml.py](model
 ## XCFramework
 The XCFramework is a precompiled version of the library for iOS, visionOS, tvOS,
 and macOS. It can be used in Swift projects without the need to compile the
-library from source. For example, the v1.7.5 version of the XCFramework can be
-used as follows:
+library from source. For examples:

 ```swift
 // swift-tools-version: 5.10
 // The swift-tools-version declares the minimum version of Swift required to build this package.
@@ -735,7 +733,7 @@ let package = Package(
 )
 ```

-## Voice Activity Detection (VAD)
+### Voice Activity Detection (VAD)
 Support for Voice Activity Detection (VAD) can be enabled using the `--vad`
 argument to `whisper-cli`. In addition to this option a VAD model is also
 required.
@@ -749,36 +747,11 @@ transcription process.
 The following VAD models are currently supported:

-### Silero-VAD
+#### Silero-VAD
 [Silero-vad](https://github.com/snakers4/silero-vad) is a lightweight VAD model
 written in Python that is fast and accurate.

-Models can be downloaded by running the following command on Linux or MacOS:
-```console
-$ ./models/download-vad-model.sh silero-v5.1.2
-Downloading ggml model silero-v5.1.2 from 'https://huggingface.co/ggml-org/whisper-vad' ...
-ggml-silero-v5.1.2.bin 100%[==============================================>] 864.35K --.-KB/s in 0.04s
-Done! Model 'silero-v5.1.2' saved in '/path/models/ggml-silero-v5.1.2.bin'
-You can now use it like this:
-$ ./build/bin/whisper-cli -vm /path/models/ggml-silero-v5.1.2.bin --vad -f samples/jfk.wav -m models/ggml-base.en.bin
-```
-
-And the following command on Windows:
-```console
-> .\models\download-vad-model.cmd silero-v5.1.2
-Downloading vad model silero-v5.1.2...
-Done! Model silero-v5.1.2 saved in C:\Users\danie\work\ai\whisper.cpp\ggml-silero-v5.1.2.bin
-You can now use it like this:
-C:\path\build\bin\Release\whisper-cli.exe -vm C:\path\ggml-silero-v5.1.2.bin --vad -m models/ggml-base.en.bin -f samples\jfk.wav
-```
-
-To see a list of all available models, run the above commands without any
-arguments.
-
-This model can be also be converted manually to ggml using the following command:
+This model can be converted to ggml using the following command:
 ```console
 $ python3 -m venv venv && source venv/bin/activate
 $ (venv) pip install silero-vad
@@ -794,7 +767,7 @@ $ ./build/bin/whisper-cli \
 --vad-model ./models/silero-v5.1.2-ggml.bin
 ```

-### VAD Options
+#### VAD Options

 * --vad-threshold: Threshold probability for speech detection. A probability
 for a speech segment/frame above this threshold will be considered as speech.

@@ -1,249 +1,249 @@
 # whisper.cpp for SYCL

 [Background](#background)
 [OS](#os)
 [Intel GPU](#intel-gpu)
 [Linux](#linux)
 [Environment Variable](#environment-variable)
 [Known Issue](#known-issue)
 [Todo](#todo)

 ## Background

-SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
+SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators�such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.

 oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.

 Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.

-To avoid re-inventing the wheel, this code refers other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
+To avoid re-inventing the wheel, this code refers other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel� DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.

 The whisper.cpp for SYCL is used to support Intel GPUs.

 For Intel CPU, recommend to use whisper.cpp for X86 (Intel MKL build).

 ## OS

 |OS|Status|Verified|
 |-|-|-|
 |Linux|Support|Ubuntu 22.04|
 |Windows|Ongoing| |

 ## Intel GPU

 |Intel GPU| Status | Verified Model|
 |-|-|-|
 |Intel Data Center Max Series| Support| Max 1550|
 |Intel Data Center Flex Series| Support| Flex 170|
 |Intel Arc Series| Support| Arc 770|
 |Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
 |Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|

 ## Linux

 ### Setup Environment

 1. Install Intel GPU driver.

 a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).

 Note: for iGPU, please install the client GPU driver.

 b. Add user to group: video, render.

 ```
 sudo usermod -aG render username
 sudo usermod -aG video username
 ```

 Note: re-login to enable it.

 c. Check

 ```
 sudo apt install clinfo
 sudo clinfo -l
 ```

 Output (example):

 ```
 Platform #0: Intel(R) OpenCL Graphics
 `-- Device #0: Intel(R) Arc(TM) A770 Graphics

 Platform #0: Intel(R) OpenCL HD Graphics
 `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
 ```

-2. Install Intel® oneAPI Base toolkit.
+2. Install Intel� oneAPI Base toolkit.

-a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
+a. Please follow the procedure in [Get the Intel� oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).

 Recommend to install to default folder: **/opt/intel/oneapi**.

 Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.

 b. Check

 ```
 source /opt/intel/oneapi/setvars.sh
 sycl-ls
 ```

 There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.

 Output (example):
 ```
 [opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
 [opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
 [opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
 [ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
 ```

 2. Build locally:

 ```
 mkdir -p build
 cd build
 source /opt/intel/oneapi/setvars.sh

 #for FP16
 #cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DWHISPER_SYCL_F16=ON

 #for FP32
 cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

 #build example/main only
 #cmake --build . --config Release --target main

 #build all binary
 cmake --build . --config Release -v
 ```

 or

 ```
 ./examples/sycl/build.sh
 ```

 Note:

 - By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.

 ### Run

 1. Put model file to folder **models**

 2. Enable oneAPI running environment

 ```
 source /opt/intel/oneapi/setvars.sh
 ```

 3. List device ID

 Run without parameter:

 ```
 ./build/bin/ls-sycl-device

 or

 ./build/bin/main
 ```

 Check the ID in startup log, like:

 ```
 found 4 SYCL devices:
 Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
 max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
 Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
 max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
 Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
 max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
 Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
 max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
 ```

 |Attribute|Note|
 |-|-|
 |compute capability 1.3|Level-zero running time, recommended |
 |compute capability 3.0|OpenCL running time, slower than level-zero in most cases|

 4. Set device ID and execute whisper.cpp

 Set device ID = 0 by **GGML_SYCL_DEVICE=0**

 ```
 GGML_SYCL_DEVICE=0 ./build/bin/main -m models/ggml-base.en.bin -f samples/jfk.wav
 ```

 or run by script:

 ```
 ./examples/sycl/run_whisper.sh
 ```

 5. Check the device ID in output

 Like:

 ```
 Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 ```

 ## Environment Variable

 #### Build

 |Name|Value|Function|
 |-|-|-|
 |WHISPER_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, WHISPER_SYCL=ON is mandatory.|
 |WHISPER_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path.For FP32, do not set it.|
 |CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
 |CMAKE_CXX_COMPILER|icpx|use icpx for SYCL code path|

 #### Running

 |Name|Value|Function|
 |-|-|-|
 |GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
 |GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|

 ## Known Issue

 - Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.

 Miss to enable oneAPI running environment.

 Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.

 - Hang during startup

 llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.

 Solution: add **--no-mmap**.

 ## Todo

 - Support to build in Windows.

 - Support multiple cards.

@@ -23,42 +23,26 @@ import io.github.ggerganov.whispercpp.WhisperCpp;
 public class Example {
 public static void main(String[] args) {
 WhisperCpp whisper = new WhisperCpp();
+// By default, models are loaded from ~/.cache/whisper/ and are usually named "ggml-${name}.bin"
+// or you can provide the absolute path to the model file.
+long context = whisper.initContext("base.en");
 try {
-// By default, models are loaded from ~/.cache/whisper/ and are usually named "ggml-${name}.bin"
-// or you can provide the absolute path to the model file.
-whisper.initContext("../ggml-base.en.bin");
-WhisperFullParams.ByValue whisperParams = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
-// custom configuration if required
-//whisperParams.n_threads = 8;
-whisperParams.temperature = 0.0f;
-whisperParams.temperature_inc = 0.2f;
-//whisperParams.language = "en";
-float[] samples = readAudio(); // divide each value by 32767.0f
-List<WhisperSegment> whisperSegmentList = whisper.fullTranscribeWithTime(whisperParams, samples);
-for (WhisperSegment whisperSegment : whisperSegmentList) {
-long start = whisperSegment.getStart();
-long end = whisperSegment.getEnd();
-String text = whisperSegment.getSentence();
-System.out.println("start: "+start);
-System.out.println("end: "+end);
-System.out.println("text: "+text);
+var whisperParams = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
+// custom configuration if required
+whisperParams.temperature_inc = 0f;
+var samples = readAudio(); // divide each value by 32767.0f
+whisper.fullTranscribe(whisperParams, samples);
+int segmentCount = whisper.getTextSegmentCount(context);
+for (int i = 0; i < segmentCount; i++) {
+String text = whisper.getTextSegment(context, i);
+System.out.println(segment.getText());
 }
-} catch (IOException e) {
-e.printStackTrace();
 } finally {
-whisper.close();
+whisper.freeContext(context);
 }
 }
 }
 ```

@@ -168,26 +168,23 @@ public class WhisperCpp implements AutoCloseable {
 return str.toString().trim();
 }

-/**
- * Full transcribe with time list.
- *
- * @param whisperParams the whisper params
- * @param audioData the audio data
- * @return the list
- * @throws IOException the io exception
- */
-public List<WhisperSegment> fullTranscribeWithTime(WhisperFullParams.ByValue whisperParams, float[] audioData) throws IOException {
+public List<WhisperSegment> fullTranscribeWithTime(WhisperFullParams whisperParams, float[] audioData) throws IOException {
 if (ctx == null) {
 throw new IllegalStateException("Model not initialised");
 }

-if (lib.whisper_full(ctx, whisperParams, audioData, audioData.length) != 0) {
+WhisperFullParams.ByValue valueParams = new WhisperFullParams.ByValue(
+lib.whisper_full_default_params_by_ref(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH.ordinal()));
+valueParams.read();
+if (lib.whisper_full(ctx, valueParams, audioData, audioData.length) != 0) {
 throw new IOException("Failed to process audio");
 }

 int nSegments = lib.whisper_full_n_segments(ctx);
 List<WhisperSegment> segments= new ArrayList<>(nSegments);

 for (int i = 0; i < nSegments; i++) {
 long t0 = lib.whisper_full_get_segment_t0(ctx, i);
 String text = lib.whisper_full_get_segment_text(ctx, i);

@@ -118,7 +118,7 @@ class WhisperCppTest {
 float[] floats = new float[b.length / 2];

 //WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
-WhisperFullParams.ByValue params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
+WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
 params.setProgressCallback((ctx, state, progress, user_data) -> System.out.println("progress: " + progress));
 params.print_progress = CBool.FALSE;
 //params.initial_prompt = "and so my fellow Americans um, like";

@@ -1,6 +1,6 @@
 {
 "name": "whisper.cpp",
-"version": "1.7.6",
+"version": "1.7.5",
 "description": "Whisper speech recognition",
 "main": "whisper.js",
 "scripts": {

@@ -1,9 +1,6 @@
 LICENSE
 pkg/
 lib/whisper.*
-ext/examples/
-ext/ggml/
-ext/include/
-ext/scripts/
-ext/src/
-test/fixtures/
+ext/sources/*
+!ext/sources/CMakeGraphVizOptions.cmake
+ext/mkmf.log

@@ -24,21 +24,7 @@ or,
 $ gem install whispercpp -- --enable-ggml-cuda

-See whisper.cpp's [README](https://github.com/ggml-org/whisper.cpp/blob/master/README.md) for available options. You need convert options present the README to Ruby-style options, for example:
+See whisper.cpp's [README](https://github.com/ggml-org/whisper.cpp/blob/master/README.md) for available options. You need convert options present the README to Ruby-style options.

-Boolean options:
-
-* `-DGGML_BLAS=1` -> `--enable-ggml-blas`
-* `-DWHISER_COREML=OFF` -> `--disable-whisper-coreml`
-
-Argument options:
-
-* `-DGGML_CUDA_COMPRESSION_MODE=size` -> `--ggml-cuda-compression-mode=size`
-
-Combination:
-
-* `-DGGML_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES="86"` -> `--enable-ggml-cuda --cmake_cuda-architectures="86"`

 For boolean options like `GGML_CUDA`, the README says `-DGGML_CUDA=1`. You need strip `-D`, prepend `--enable-` for `1` or `ON` (`--disable-` for `0` or `OFF`) and make it kebab-case: `--enable-ggml-cuda`.
 For options which require arguments like `CMAKE_CUDA_ARCHITECTURES`, the README says `-DCMAKE_CUDA_ARCHITECTURES="86"`. You need strip `-D`, prepend `--`, make it kebab-case, append `=` and append argument: `--cmake-cuda-architectures="86"`.
@@ -70,6 +56,17 @@ end

 Some models are prepared up-front:

+```ruby
+base_en = Whisper::Model.pre_converted_models["base.en"]
+whisper = Whisper::Context.new(base_en)
+```
+
+At first time you use a model, it is downloaded automatically. After that, downloaded cached file is used. To clear cache, call `#clear_cache`:
+
+```ruby
+Whisper::Model.pre_converted_models["base"].clear_cache
+```
+
 You also can use shorthand for pre-converted models:

 ```ruby
@@ -94,19 +91,6 @@ puts Whisper::Model.pre_converted_models.keys
 # :
 ```

-You can also retrieve each model:
-
-```ruby
-base_en = Whisper::Model.pre_converted_models["base.en"]
-whisper = Whisper::Context.new(base_en)
-```
-
-At first time you use a model, it is downloaded automatically. After that, downloaded cached file is used. To clear cache, call `#clear_cache`:
-
-```ruby
-Whisper::Model.pre_converted_models["base"].clear_cache
-```

 You can also use local model files you prepared:

 ```ruby
@@ -127,80 +111,9 @@ See [models][] page for details.

 Currently, whisper.cpp accepts only 16-bit WAV files.

-### Voice Activity Detection (VAD) ###
-
-Support for Voice Activity Detection (VAD) can be enabled by setting `Whisper::Params`'s `vad` argument to `true` and specifying VAD model:
-
-```ruby
-Whisper::Params.new(
-  vad: true,
-  vad_model_path: "silero-v5.1.2",
-  # other arguments...
-)
-```
-
-When you pass the model name (`"silero-v5.1.2"`) or URI (`https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v5.1.2.bin`), it will be downloaded automatically.
-Currently, "silero-v5.1.2" is registered as pre-converted model like ASR models. You also specify file path or URI of model.
-
-If you need configure VAD behavior, pass params for that:
-
-```ruby
-Whisper::Params.new(
-  vad: true,
-  vad_model_path: "silero-v5.1.2",
-  vad_params: Whisper::VAD::Params.new(
-    threshold: 1.0, # defaults to 0.5
-    min_speech_duration_ms: 500, # defaults to 250
-    min_silence_duration_ms: 200, # defaults to 100
-    max_speech_duration_s: 30000, # default is FLT_MAX,
-    speech_pad_ms: 50, # defaults to 30
-    samples_overlap: 0.5 # defaults to 0.1
-  ),
-  # other arguments...
-)
-```
-
-For details on VAD, see [whisper.cpp's README](https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#voice-activity-detection-vad).
-
-### Output ###
-
-whispercpp supports SRT and WebVTT output:
-
-```ruby
-puts whisper.transcribe("path/to/audio.wav", Whisper::Params.new).to_webvtt
-# =>
-WEBVTT
-
-1
-00:00:00.000 --> 00:00:03.860
-My thought I have nobody by a beauty and will as you poured.
-
-2
-00:00:03.860 --> 00:00:09.840
-Mr. Rochester is sub in that so-don't find simplest, and devoted about, to let might in
-
-3
-00:00:09.840 --> 00:00:09.940
-a
-```
-
-You may call `#to_srt`, too

 API
 ---

-### Transcription ###
-
-By default, `Whisper::Context#transcribe` works in a single thread. You can make it work in parallel by passing `n_processors` option:
-
-```ruby
-whisper.transcribe("path/to/audio.wav", params, n_processors: Etc.nprocessors)
-```
-
-Note that transcription occasionally might be low accuracy when it works in parallel.

 ### Segments ###

 Once `Whisper::Context#transcribe` called, you can retrieve segments by `#each_segment`:
@@ -222,7 +135,7 @@ whisper
 ed: format_time(segment.end_time),
 text: segment.text
 }
-line << " (speaker turned)" if segment.speaker_turn_next?
+line << " (speaker turned)" if segment.speaker_next_turn?

 puts line
 end
@@ -238,7 +151,7 @@ params.on_new_segment do |segment|
 ed: format_time(segment.end_time),
 text: segment.text
 }
-line << " (speaker turned)" if segment.speaker_turn_next?
+line << " (speaker turned)" if segment.speaker_next_turn?

 puts line
 end
@@ -335,11 +248,6 @@ First call of `rake test` builds an extension and downloads a model for testing.

 If something seems wrong on build, running `rake clean` solves some cases.

-### Need help ###
-
-* Windows support
-* Refinement of C/C++ code, especially memory management

 License
 -------

@@ -67,30 +67,17 @@ file LIB_FILE => [SO_FILE, "lib"] do |t|
 end
 CLEAN.include LIB_FILE

-Rake::TestTask.new
-
-TEST_FIXTURE_AUDIO = "test/fixtures/jfk.wav"
-TEST_FIXTURE_AUDIO_SRC = File.expand_path(File.join(__dir__, "..", "..", "samples", "jfk.wav"))
-TEST_FIXTURE_AUDIO_DIR = TEST_FIXTURE_AUDIO.pathmap("%d")
-directory TEST_FIXTURE_AUDIO_DIR
-if File.exist? TEST_FIXTURE_AUDIO_SRC
-file TEST_FIXTURE_AUDIO => [TEST_FIXTURE_AUDIO_SRC, TEST_FIXTURE_AUDIO_DIR] do |t|
-symlink t.source, t.name
-end
-else
-require "open-uri"
-file TEST_FIXTURE_AUDIO => TEST_FIXTURE_AUDIO_DIR do |t|
-File.write t.name, URI("https://github.com/ggml-org/whisper.cpp/raw/refs/heads/master/samples/jfk.wav").read
-end
+Rake::TestTask.new do |t|
+t.test_files = FileList["tests/test_*.rb"]
 end

-TEST_MEMORY_VIEW = "test/jfk_reader/jfk_reader.#{RbConfig::CONFIG['DLEXT']}"
-file TEST_MEMORY_VIEW => "test/jfk_reader/jfk_reader.c" do |t|
-chdir "test/jfk_reader" do
+TEST_MEMORY_VIEW = "tests/jfk_reader/jfk_reader.#{RbConfig::CONFIG['DLEXT']}"
+file TEST_MEMORY_VIEW => "tests/jfk_reader/jfk_reader.c" do |t|
+chdir "tests/jfk_reader" do
 ruby "extconf.rb"
 sh "make"
 end
 end
-CLEAN.include TEST_MEMORY_VIEW
+CLEAN.include "tests/jfk_reader/jfk_reader.{o,#{RbConfig::CONFIG['DLEXT']}}"

-task test: [LIB_FILE, TEST_MEMORY_VIEW, TEST_FIXTURE_AUDIO]
+task test: [LIB_FILE, TEST_MEMORY_VIEW]

View File

@ -2,8 +2,10 @@ Makefile
 whisper.so
 whisper.bundle
 whisper.dll
-scripts/get-flags.mk
 *.o
-*.a
-sources/*
-!sources/CMakeGraphVizOptions.cmake
-mkmf.log
+/*/**/*.c
+/*/**/*.cpp
+/*/**/*.h
+/*/**/*.m
+/*/**/*.metal

View File

@ -1,32 +1,16 @@
require "tsort" require "tsort"
class Dependencies class Dependencies
include TSort
def initialize(cmake, options) def initialize(cmake, options)
@cmake = cmake @cmake = cmake
@options = options @options = options
@static_lib_shape = nil
@nodes = {}
@graph = Hash.new {|h, k| h[k] = []}
generate_dot generate_dot
parse_dot @libs = parse_dot
end
def libs
tsort.filter_map {|node|
label, shape = @nodes[node]
if shape == @static_lib_shape
label.gsub(/\\n\([^)]+\)/, '')
else
nil
end
}.reverse.collect {|lib| "lib#{lib}.a"}
end end
def to_s def to_s
libs.join(" ") @libs.join(" ")
end end
private private
@ -36,38 +20,42 @@ class Dependencies
   end
   def generate_dot
-    args = ["-S", "sources", "-B", "build", "--graphviz", dot_path, "-D", "BUILD_SHARED_LIBS=OFF"]
-    args << @options.to_s unless @options.to_s.empty?
-    system @cmake, *args, exception: true
+    system @cmake, "-S", "sources", "-B", "build", "--graphviz", dot_path, "-D", "BUILD_SHARED_LIBS=OFF", @options.to_s, exception: true
   end
   def parse_dot
+    static_lib_shape = nil
+    nodes = {}
+    depends = Hash.new {|h, k| h[k] = []}
+    class << depends
+      include TSort
+      alias tsort_each_node each_key
+      def tsort_each_child(node, &block)
+        fetch(node, []).each(&block)
+      end
+    end
     File.open(dot_path).each_line do |line|
       case line
       when /\[\s*label\s*=\s*"Static Library"\s*,\s*shape\s*=\s*(?<shape>\w+)\s*\]/
-        @static_lib_shape = $~[:shape]
+        static_lib_shape = $~[:shape]
       when /\A\s*"(?<node>\w+)"\s*\[\s*label\s*=\s*"(?<label>\S+)"\s*,\s*shape\s*=\s*(?<shape>\w+)\s*\]\s*;\s*\z/
         node = $~[:node]
         label = $~[:label]
         shape = $~[:shape]
-        @nodes[node] = [label, shape]
+        nodes[node] = [label, shape]
       when /\A\s*"(?<depender>\w+)"\s*->\s*"(?<dependee>\w+)"/
         depender = $~[:depender]
         dependee = $~[:dependee]
-        @graph[depender] << dependee
+        depends[depender] ||= []
+        depends[depender] << dependee
       end
     end
-  end
-  def tsort_each_node
-    @nodes.each_key do |node|
-      yield node
-    end
-  end
-  def tsort_each_child(node)
-    @graph[node].each do |child|
-      yield child
-    end
-  end
+    depends.tsort.filter_map {|node|
+      label, shape = nodes[node]
+      shape == static_lib_shape ? label : nil
+    }.collect {|lib| "lib#{lib}.a"}
+      .reverse
   end
 end

View File

@ -3,7 +3,7 @@ require_relative "options"
require_relative "dependencies" require_relative "dependencies"
cmake = find_executable("cmake") || abort cmake = find_executable("cmake") || abort
options = Options.new(cmake) options = Options.new
have_library("gomp") rescue nil have_library("gomp") rescue nil
libs = Dependencies.new(cmake, options) libs = Dependencies.new(cmake, options)

View File

@ -1,11 +1,25 @@
 class Options
-  def initialize(cmake="cmake")
-    @cmake = cmake
+  def initialize
     @options = {}
+    @pending_options = []
+    @ignored_options = []
     configure
   end
+  def help
+    @options
+      .collect_concat {|name, (type, value)|
+        option = option_name(name)
+        if type == :bool
+          ["--enable-#{option}", "--disable-#{option}"]
+        else
+          "--#{option}=#{type.upcase}"
+        end
+      }
+      .join($/)
+  end
   def to_s
     @options
       .reject {|name, (type, value)| value.nil?}
@ -18,65 +32,188 @@ class Options
     output = nil
     Dir.chdir __dir__ do
-      output = `#{@cmake.shellescape} -S sources -B build -L`
+      output = `cmake -S sources -B build -L`
     end
-    @cmake_options = output.lines.drop_while {|line| line.chomp != "-- Cache values"}.drop(1)
-      .filter_map {|line|
-        option, value = line.chomp.split("=", 2)
-        name, type = option.split(":", 2)
-        [
-          name,
-          [
-            type,
-            type == "BOOL" ? value == "ON" : value
-          ]
-        ]
-      }.to_h
+    started = false
+    @cmake_options = output.lines.filter_map {|line|
+      if line.chomp == "-- Cache values"
+        started = true
+        next
+      end
+      next unless started
+      option, value = line.chomp.split("=", 2)
+      name, type = option.split(":", 2)
+      [name, type, value]
+    }
+  end
+  def missing_options
+    cmake_options.collect {|name, type, value| name} -
+      @options.keys - @pending_options - @ignored_options
+  end
+  def extra_options
+    @options.keys + @pending_options + @ignored_options -
+      cmake_options.collect {|name, type, value| name}
+  end
   end
   private
   def configure
-    cmake_options.each_pair do |name, (type, default_value)|
-      option = option_name(name)
-      value = type == "BOOL" ? enable_config(option) : arg_config("--#{option}")
-      @options[name] = [type, value]
-    end
-    configure_accelerate
-    configure_metal
-    configure_coreml
-  end
-  # See ggml/src/ggml-cpu/CMakeLists.txt
-  def configure_accelerate
-    if RUBY_PLATFORM.match?(/darwin/) && enabled?("GGML_ACCELERATE")
-      $LDFLAGS << " -framework Accelerate"
-    end
-  end
-  # See ggml/src/ggml-metal/CMakeLists.txt
-  def configure_metal
-    $LDFLAGS << " -framework Foundation -framework Metal -framework MetalKit" if enabled?("GGML_METAL")
-  end
-  # See src/CmakeLists.txt
-  def configure_coreml
-    if enabled?("WHISPER_COREML")
-      $LDFLAGS << " -framework Foundation -framework CoreML"
-      $CPPFLAGS << " -DRUBY_WHISPER_USE_COREML"
-    end
+    filepath "ACCELERATE_FRAMEWORK"
+    ignored "BUILD_SHARED_LIBS"
+    ignored "BUILD_TESTING"
+    ignored "CMAKE_BUILD_TYPE"
+    ignored "CMAKE_INSTALL_PREFIX"
+    string "CMAKE_OSX_ARCHITECTURES"
+    ignored "CMAKE_OSX_DEPLOYMENT_TARGET"
+    string "CMAKE_OSX_SYSROOT"
+    filepath "FOUNDATION_LIBRARY"
+    bool "GGML_ACCELERATE"
+    bool "GGML_ALL_WARNINGS_3RD_PARTY"
+    bool "GGML_AMX_BF16"
+    bool "GGML_AMX_INT8"
+    bool "GGML_AMX_TILE"
+    bool "GGML_AVX"
+    bool "GGML_AVX2"
+    bool "GGML_AVX512"
+    bool "GGML_AVX512_BF16"
+    bool "GGML_AVX512_VBMI"
+    bool "GGML_AVX512_VNNI"
+    bool "GGML_AVX_VNNI"
+    ignored "GGML_BACKEND_DL"
+    ignored "GGML_BIN_INSTALL_DIR"
+    bool "GGML_BLAS"
+    string "GGML_BLAS_VENDOR"
+    bool "GGML_BMI2"
+    ignored "GGML_BUILD_EXAMPLES"
+    ignored "GGML_BUILD_TESTS"
+    bool "GGML_CCACHE"
+    filepath "GGML_CCACHE_FOUND"
+    bool "GGML_CPU"
+    bool "GGML_CPU_AARCH64"
+    ignored "GGML_CPU_ALL_VARIANTS"
+    string "GGML_CPU_ARM_ARCH"
+    bool "GGML_CPU_HBM"
+    bool "GGML_CPU_KLEIDIAI"
+    string "GGML_CPU_POWERPC_CPUTYPE"
+    bool "GGML_CUDA"
+    string "GGML_CUDA_COMPRESSION_MODE"
+    bool "GGML_CUDA_F16"
+    bool "GGML_CUDA_FA"
+    bool "GGML_CUDA_FA_ALL_QUANTS"
+    bool "GGML_CUDA_FORCE_CUBLAS"
+    bool "GGML_CUDA_FORCE_MMQ"
+    ignored "GGML_CUDA_GRAPHS"
+    bool "GGML_CUDA_NO_PEER_COPY"
+    bool "GGML_CUDA_NO_VMM"
+    string "GGML_CUDA_PEER_MAX_BATCH_SIZE"
+    bool "GGML_F16C"
+    bool "GGML_FMA"
+    bool "GGML_GPROF"
+    bool "GGML_HIP"
+    bool "GGML_HIP_GRAPHS"
+    bool "GGML_HIP_NO_VMM"
+    bool "GGML_HIP_ROCWMMA_FATTN"
+    ignored "GGML_INCLUDE_INSTALL_DIR"
+    bool "GGML_KOMPUTE"
+    bool "GGML_LASX"
+    ignored "GGML_LIB_INSTALL_DIR"
+    ignored "GGML_LLAMAFILE"
+    bool "GGML_LSX"
+    bool "GGML_LTO"
+    bool "GGML_METAL"
+    bool "GGML_METAL_EMBED_LIBRARY"
+    string "GGML_METAL_MACOSX_VERSION_MIN"
+    bool "GGML_METAL_NDEBUG"
+    bool "GGML_METAL_SHADER_DEBUG"
+    string "GGML_METAL_STD"
+    bool "GGML_METAL_USE_BF16"
+    bool "GGML_MUSA"
+    bool "GGML_NATIVE"
+    bool "GGML_OPENCL"
+    bool "GGML_OPENCL_EMBED_KERNELS"
+    bool "GGML_OPENCL_PROFILING"
+    string "GGML_OPENCL_TARGET_VERSION"
+    bool "GGML_OPENCL_USE_ADRENO_KERNELS"
+    bool "GGML_OPENMP"
+    bool "GGML_RPC"
+    bool "GGML_RVV"
+    bool "GGML_RV_ZFH"
+    pending "GGML_SCCACHE_FOUND"
+    string "GGML_SCHED_MAX_COPIES"
+    bool "GGML_SSE42"
+    ignored "GGML_STATIC"
+    bool "GGML_SYCL"
+    string "GGML_SYCL_DEVICE_ARCH"
+    bool "GGML_SYCL_F16"
+    bool "GGML_SYCL_GRAPH"
+    string "GGML_SYCL_TARGET"
+    bool "GGML_VULKAN"
+    bool "GGML_VULKAN_CHECK_RESULTS"
+    bool "GGML_VULKAN_DEBUG"
+    bool "GGML_VULKAN_MEMORY_DEBUG"
+    bool "GGML_VULKAN_PERF"
+    ignored "GGML_VULKAN_RUN_TESTS"
+    filepath "GGML_VULKAN_SHADERS_GEN_TOOLCHAIN"
+    bool "GGML_VULKAN_SHADER_DEBUG_INFO"
+    pending "GGML_VULKAN_VALIDATE"
+    bool "GGML_VXE"
+    filepath "GIT_EXE"
+    filepath "MATH_LIBRARY"
+    filepath "METALKIT_FRAMEWORK"
+    filepath "METAL_FRAMEWORK"
+    bool "WHISPER_ALL_WARNINGS"
+    bool "WHISPER_ALL_WARNINGS_3RD_PARTY"
+    ignored "WHISPER_BIN_INSTALL_DIR"
+    ignored "WHISPER_BUILD_EXAMPLES"
+    ignored "WHISPER_BUILD_SERVER"
+    ignored "WHISPER_BUILD_TESTS"
+    bool "WHISPER_COREML"
+    bool "WHISPER_COREML_ALLOW_FALLBACK"
+    ignored "WHISPER_CURL"
+    bool "WHISPER_FATAL_WARNINGS"
+    ignored "WHISPER_FFMPEG"
+    ignored "WHISPER_INCLUDE_INSTALL_DIR"
+    ignored "WHISPER_LIB_INSTALL_DIR"
+    bool "WHISPER_OPENVINO"
+    bool "WHISPER_SANITIZE_ADDRESS"
+    bool "WHISPER_SANITIZE_THREAD"
+    bool "WHISPER_SANITIZE_UNDEFINED"
+    ignored "WHISPER_SDL2"
+    pending "WHISPER_USE_SYSTEM_GGML"
   end
   def option_name(name)
     name.downcase.gsub("_", "-")
   end
-  def enabled?(option)
-    if @options[option][1].nil?
-      cmake_options[option][1]
-    else
-      @options[option][1]
-    end
-  end
+  def bool(name)
+    option = option_name(name)
+    value = enable_config(option)
+    @options[name] = [:bool, value]
+  end
+  def string(name, type=:string)
+    option = "--#{option_name(name)}"
+    value = arg_config(option)
+    raise "String expected for #{option}" if value == true || value&.empty?
+    @options[name] = [type, value]
+  end
+  def path(name)
+    string(name, :path)
+  end
+  def filepath(name)
+    string(name, :filepath)
+  end
+  def pending(name)
+    @pending_options << name
+  end
+  def ignored(name)
+    @ignored_options << name
+  end
 end

View File

@ -3,10 +3,8 @@
#include "ruby_whisper.h" #include "ruby_whisper.h"
VALUE mWhisper; VALUE mWhisper;
VALUE mVAD;
VALUE cContext; VALUE cContext;
VALUE cParams; VALUE cParams;
VALUE cVADParams;
VALUE eError; VALUE eError;
VALUE cSegment; VALUE cSegment;
@ -22,9 +20,6 @@ ID id_new;
ID id_to_path; ID id_to_path;
ID id_URI; ID id_URI;
ID id_pre_converted_models; ID id_pre_converted_models;
ID id_coreml_compiled_models;
ID id_cache;
ID id_n_processors;
static bool is_log_callback_finalized = false; static bool is_log_callback_finalized = false;
@ -36,7 +31,6 @@ extern void init_ruby_whisper_params(VALUE *mWhisper);
extern void init_ruby_whisper_error(VALUE *mWhisper); extern void init_ruby_whisper_error(VALUE *mWhisper);
extern void init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cSegment); extern void init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cSegment);
extern void init_ruby_whisper_model(VALUE *mWhisper); extern void init_ruby_whisper_model(VALUE *mWhisper);
extern void init_ruby_whisper_vad_params(VALUE *mVAD);
extern void register_callbacks(ruby_whisper_params *rwp, VALUE *context); extern void register_callbacks(ruby_whisper_params *rwp, VALUE *context);
/* /*
@ -86,14 +80,6 @@ static VALUE ruby_whisper_s_lang_str_full(VALUE self, VALUE id) {
return rb_str_new2(str_full); return rb_str_new2(str_full);
} }
/*
* call-seq:
* system_info_str -> String
*/
static VALUE ruby_whisper_s_system_info_str(VALUE self) {
return rb_str_new2(whisper_print_system_info());
}
static VALUE ruby_whisper_s_finalize_log_callback(VALUE self, VALUE id) { static VALUE ruby_whisper_s_finalize_log_callback(VALUE self, VALUE id) {
is_log_callback_finalized = true; is_log_callback_finalized = true;
return Qnil; return Qnil;
@ -130,6 +116,16 @@ static VALUE ruby_whisper_s_log_set(VALUE self, VALUE log_callback, VALUE user_d
return Qnil; return Qnil;
} }
static void rb_whisper_model_mark(ruby_whisper_model *rwm) {
rb_gc_mark(rwm->context);
}
static VALUE ruby_whisper_model_allocate(VALUE klass) {
ruby_whisper_model *rwm;
rwm = ALLOC(ruby_whisper_model);
return Data_Wrap_Struct(klass, rb_whisper_model_mark, RUBY_DEFAULT_FREE, rwm);
}
void Init_whisper() { void Init_whisper() {
id_to_s = rb_intern("to_s"); id_to_s = rb_intern("to_s");
id_call = rb_intern("call"); id_call = rb_intern("call");
@ -141,12 +137,8 @@ void Init_whisper() {
id_to_path = rb_intern("to_path"); id_to_path = rb_intern("to_path");
id_URI = rb_intern("URI"); id_URI = rb_intern("URI");
id_pre_converted_models = rb_intern("pre_converted_models"); id_pre_converted_models = rb_intern("pre_converted_models");
id_coreml_compiled_models = rb_intern("coreml_compiled_models");
id_cache = rb_intern("cache");
id_n_processors = rb_intern("n_processors");
mWhisper = rb_define_module("Whisper"); mWhisper = rb_define_module("Whisper");
mVAD = rb_define_module_under(mWhisper, "VAD");
rb_define_const(mWhisper, "LOG_LEVEL_NONE", INT2NUM(GGML_LOG_LEVEL_NONE)); rb_define_const(mWhisper, "LOG_LEVEL_NONE", INT2NUM(GGML_LOG_LEVEL_NONE));
rb_define_const(mWhisper, "LOG_LEVEL_INFO", INT2NUM(GGML_LOG_LEVEL_INFO)); rb_define_const(mWhisper, "LOG_LEVEL_INFO", INT2NUM(GGML_LOG_LEVEL_INFO));
@ -159,7 +151,6 @@ void Init_whisper() {
rb_define_singleton_method(mWhisper, "lang_id", ruby_whisper_s_lang_id, 1); rb_define_singleton_method(mWhisper, "lang_id", ruby_whisper_s_lang_id, 1);
rb_define_singleton_method(mWhisper, "lang_str", ruby_whisper_s_lang_str, 1); rb_define_singleton_method(mWhisper, "lang_str", ruby_whisper_s_lang_str, 1);
rb_define_singleton_method(mWhisper, "lang_str_full", ruby_whisper_s_lang_str_full, 1); rb_define_singleton_method(mWhisper, "lang_str_full", ruby_whisper_s_lang_str_full, 1);
rb_define_singleton_method(mWhisper, "system_info_str", ruby_whisper_s_system_info_str, 0);
rb_define_singleton_method(mWhisper, "log_set", ruby_whisper_s_log_set, 2); rb_define_singleton_method(mWhisper, "log_set", ruby_whisper_s_log_set, 2);
rb_define_private_method(rb_singleton_class(mWhisper), "finalize_log_callback", ruby_whisper_s_finalize_log_callback, 1); rb_define_private_method(rb_singleton_class(mWhisper), "finalize_log_callback", ruby_whisper_s_finalize_log_callback, 1);
@ -168,9 +159,6 @@ void Init_whisper() {
init_ruby_whisper_error(&mWhisper); init_ruby_whisper_error(&mWhisper);
init_ruby_whisper_segment(&mWhisper, &cContext); init_ruby_whisper_segment(&mWhisper, &cContext);
init_ruby_whisper_model(&mWhisper); init_ruby_whisper_model(&mWhisper);
init_ruby_whisper_vad_params(&mVAD);
rb_require("whisper/context");
rb_require("whisper/segment");
rb_require("whisper/model/uri"); rb_require("whisper/model/uri");
} }

View File

@ -21,13 +21,8 @@ typedef struct {
   ruby_whisper_callback_container *progress_callback_container;
   ruby_whisper_callback_container *encoder_begin_callback_container;
   ruby_whisper_callback_container *abort_callback_container;
-  VALUE vad_params;
 } ruby_whisper_params;
-typedef struct {
-  struct whisper_vad_params params;
-} ruby_whisper_vad_params;
 typedef struct {
   VALUE context;
   int index;

View File

@ -11,21 +11,15 @@ extern ID id_new;
 extern ID id_to_path;
 extern ID id_URI;
 extern ID id_pre_converted_models;
-extern ID id_coreml_compiled_models;
-extern ID id_cache;
-extern ID id_n_processors;
 extern VALUE cContext;
 extern VALUE eError;
 extern VALUE cModel;
-extern const rb_data_type_t ruby_whisper_params_type;
 extern VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self);
-extern VALUE rb_whisper_model_s_new(VALUE context);
-extern VALUE rb_whisper_segment_s_new(VALUE context, int index);
-extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context);
+extern VALUE rb_whisper_model_initialize(VALUE context);
+extern VALUE rb_whisper_segment_initialize(VALUE context, int index);
+extern void register_callbacks(ruby_whisper_params *rwp, VALUE *context);
-ID transcribe_option_names[1];
 static void
 ruby_whisper_free(ruby_whisper *rw)
@ -43,74 +37,19 @@ rb_whisper_mark(ruby_whisper *rw)
 }
 void
-rb_whisper_free(void *p)
+rb_whisper_free(ruby_whisper *rw)
 {
-  ruby_whisper *rw = (ruby_whisper *)p;
   ruby_whisper_free(rw);
   free(rw);
 }
-static size_t
-ruby_whisper_memsize(const void *p)
-{
-  const ruby_whisper *rw = (const ruby_whisper *)p;
-  size_t size = sizeof(rw);
-  if (!rw) {
-    return 0;
-  }
-  if (rw->context) {
-    size += sizeof(rw->context);
-  }
-  return size;
-}
-const rb_data_type_t ruby_whisper_type = {
-  "ruby_whisper",
-  {0, rb_whisper_free, ruby_whisper_memsize,},
-  0, 0,
-  0
-};
 static VALUE
 ruby_whisper_allocate(VALUE klass)
 {
   ruby_whisper *rw;
-  VALUE obj = TypedData_Make_Struct(klass, ruby_whisper, &ruby_whisper_type, rw);
+  rw = ALLOC(ruby_whisper);
   rw->context = NULL;
-  return obj;
+  return Data_Wrap_Struct(klass, rb_whisper_mark, rb_whisper_free, rw);
 }
-VALUE
-ruby_whisper_normalize_model_path(VALUE model_path)
-{
-  VALUE pre_converted_models = rb_funcall(cModel, id_pre_converted_models, 0);
-  VALUE pre_converted_model = rb_hash_aref(pre_converted_models, model_path);
-  if (!NIL_P(pre_converted_model)) {
-    model_path = pre_converted_model;
-#ifdef RUBY_WHISPER_USE_COREML
-    VALUE coreml_converted_models = rb_funcall(cModel, id_coreml_compiled_models, 0);
-    VALUE coreml_converted_model = rb_hash_aref(coreml_converted_models, pre_converted_model);
-    if (!NIL_P(coreml_converted_model)) {
-      rb_funcall(coreml_converted_model, id_cache, 0);
-    }
-#endif
-  }
-  else if (TYPE(model_path) == T_STRING) {
-    const char * model_path_str = StringValueCStr(model_path);
-    if (strncmp("http://", model_path_str, 7) == 0 || strncmp("https://", model_path_str, 8) == 0) {
-      VALUE uri_class = rb_const_get(cModel, id_URI);
-      model_path = rb_class_new_instance(1, &model_path, uri_class);
-    }
-  }
-  else if (rb_obj_is_kind_of(model_path, rb_path2class("URI::HTTP"))) {
-    VALUE uri_class = rb_const_get(cModel, id_URI);
-    model_path = rb_class_new_instance(1, &model_path, uri_class);
-  }
-  if (rb_respond_to(model_path, id_to_path)) {
-    model_path = rb_funcall(model_path, id_to_path, 0);
-  }
-  return model_path;
-}
 /*
@ -127,9 +66,27 @@ ruby_whisper_initialize(int argc, VALUE *argv, VALUE self)
   // TODO: we can support init from buffer here too maybe another ruby object to expose
   rb_scan_args(argc, argv, "01", &whisper_model_file_path);
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
-  whisper_model_file_path = ruby_whisper_normalize_model_path(whisper_model_file_path);
+  Data_Get_Struct(self, ruby_whisper, rw);
+  VALUE pre_converted_models = rb_funcall(cModel, id_pre_converted_models, 0);
+  VALUE pre_converted_model = rb_hash_aref(pre_converted_models, whisper_model_file_path);
+  if (!NIL_P(pre_converted_model)) {
+    whisper_model_file_path = pre_converted_model;
+  }
+  if (TYPE(whisper_model_file_path) == T_STRING) {
+    const char * whisper_model_file_path_str = StringValueCStr(whisper_model_file_path);
+    if (strncmp("http://", whisper_model_file_path_str, 7) == 0 || strncmp("https://", whisper_model_file_path_str, 8) == 0) {
+      VALUE uri_class = rb_const_get(cModel, id_URI);
+      whisper_model_file_path = rb_class_new_instance(1, &whisper_model_file_path, uri_class);
+    }
+  }
+  if (rb_obj_is_kind_of(whisper_model_file_path, rb_path2class("URI::HTTP"))) {
+    VALUE uri_class = rb_const_get(cModel, id_URI);
+    whisper_model_file_path = rb_class_new_instance(1, &whisper_model_file_path, uri_class);
+  }
+  if (rb_respond_to(whisper_model_file_path, id_to_path)) {
+    whisper_model_file_path = rb_funcall(whisper_model_file_path, id_to_path, 0);
+  }
   if (!rb_respond_to(whisper_model_file_path, id_to_s)) {
     rb_raise(rb_eRuntimeError, "Expected file path to model to initialize Whisper::Context");
   }
@ -147,7 +104,7 @@ ruby_whisper_initialize(int argc, VALUE *argv, VALUE self)
 VALUE ruby_whisper_model_n_vocab(VALUE self)
 {
   ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(self, ruby_whisper, rw);
   return INT2NUM(whisper_model_n_vocab(rw->context));
 }
@ -158,7 +115,7 @@ VALUE ruby_whisper_model_n_vocab(VALUE self)
 VALUE ruby_whisper_model_n_audio_ctx(VALUE self)
 {
   ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(self, ruby_whisper, rw);
   return INT2NUM(whisper_model_n_audio_ctx(rw->context));
 }
@ -169,7 +126,7 @@ VALUE ruby_whisper_model_n_audio_ctx(VALUE self)
 VALUE ruby_whisper_model_n_audio_state(VALUE self)
 {
   ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(self, ruby_whisper, rw);
   return INT2NUM(whisper_model_n_audio_state(rw->context));
 }
@ -180,7 +137,7 @@ VALUE ruby_whisper_model_n_audio_state(VALUE self)
 VALUE ruby_whisper_model_n_audio_head(VALUE self)
 {
   ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(self, ruby_whisper, rw);
   return INT2NUM(whisper_model_n_audio_head(rw->context));
 }
@ -191,7 +148,7 @@ VALUE ruby_whisper_model_n_audio_head(VALUE self)
 VALUE ruby_whisper_model_n_audio_layer(VALUE self)
 {
   ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(self, ruby_whisper, rw);
   return INT2NUM(whisper_model_n_audio_layer(rw->context));
 }
@ -202,7 +159,7 @@ VALUE ruby_whisper_model_n_audio_layer(VALUE self)
 VALUE ruby_whisper_model_n_text_ctx(VALUE self)
 {
   ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(self, ruby_whisper, rw);
   return INT2NUM(whisper_model_n_text_ctx(rw->context));
 }
@ -213,7 +170,7 @@ VALUE ruby_whisper_model_n_text_ctx(VALUE self)
 VALUE ruby_whisper_model_n_text_state(VALUE self)
 {
   ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(self, ruby_whisper, rw);
   return INT2NUM(whisper_model_n_text_state(rw->context));
 }
@ -224,7 +181,7 @@ VALUE ruby_whisper_model_n_text_state(VALUE self)
 VALUE ruby_whisper_model_n_text_head(VALUE self)
 {
   ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(self, ruby_whisper, rw);
   return INT2NUM(whisper_model_n_text_head(rw->context));
 }
@ -235,7 +192,7 @@ VALUE ruby_whisper_model_n_text_head(VALUE self)
 VALUE ruby_whisper_model_n_text_layer(VALUE self)
 {
   ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(self, ruby_whisper, rw);
   return INT2NUM(whisper_model_n_text_layer(rw->context));
 }
@ -246,7 +203,7 @@ VALUE ruby_whisper_model_n_text_layer(VALUE self)
 VALUE ruby_whisper_model_n_mels(VALUE self)
 {
   ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(self, ruby_whisper, rw);
   return INT2NUM(whisper_model_n_mels(rw->context));
 }
@ -257,7 +214,7 @@ VALUE ruby_whisper_model_n_mels(VALUE self)
 VALUE ruby_whisper_model_ftype(VALUE self)
 {
   ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(self, ruby_whisper, rw);
   return INT2NUM(whisper_model_ftype(rw->context));
 }
@ -268,7 +225,7 @@ VALUE ruby_whisper_model_ftype(VALUE self)
 VALUE ruby_whisper_model_type(VALUE self)
 {
   ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(self, ruby_whisper, rw);
   return rb_str_new2(whisper_model_type_readable(rw->context));
 }
@ -291,9 +248,9 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self)
   ruby_whisper *rw;
   ruby_whisper_params *rwp;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(self, ruby_whisper, rw);
   VALUE params = argv[0];
-  TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  Data_Get_Struct(params, ruby_whisper_params, rwp);
   VALUE samples = argv[1];
   int n_samples;
   rb_memory_view_t view;
@ -308,20 +265,13 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self)
     // Should check when samples.respond_to?(:length)?
   } else {
     if (TYPE(samples) == T_ARRAY) {
-      if (RARRAY_LEN(samples) > INT_MAX) {
-        rb_raise(rb_eArgError, "samples are too long");
-      }
-      n_samples = (int)RARRAY_LEN(samples);
+      n_samples = RARRAY_LEN(samples);
     } else if (memory_view_available_p) {
       if (!rb_memory_view_get(samples, &view, RUBY_MEMORY_VIEW_SIMPLE)) {
        view.obj = Qnil;
        rb_raise(rb_eArgError, "unable to get a memory view");
      }
-      ssize_t n_samples_size = view.byte_size / view.item_size;
-      if (n_samples_size > INT_MAX) {
-        rb_raise(rb_eArgError, "samples are too long");
-      }
-      n_samples = (int)n_samples_size;
+      n_samples = view.byte_size / view.item_size;
     } else if (rb_respond_to(samples, id_length)) {
       n_samples = NUM2INT(rb_funcall(samples, id_length, 0));
     } else {
@ -346,7 +296,7 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self)
      }
    }
  }
-  prepare_transcription(rwp, &self);
+  register_callbacks(rwp, &self);
  const int result = whisper_full(rw->context, rwp->params, c_samples, n_samples);
  if (0 == result) {
    return self;
@ -377,9 +327,9 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self)
  ruby_whisper *rw;
  ruby_whisper_params *rwp;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(self, ruby_whisper, rw);
  VALUE params = argv[0];
-  TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  Data_Get_Struct(params, ruby_whisper_params, rwp);
  VALUE samples = argv[1];
  int n_samples;
  int n_processors;
@ -409,17 +359,10 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self)
        view.obj = Qnil;
        rb_raise(rb_eArgError, "unable to get a memory view");
      }
-      ssize_t n_samples_size = view.byte_size / view.item_size;
-      if (n_samples_size > INT_MAX) {
-        rb_raise(rb_eArgError, "samples are too long");
-      }
-      n_samples = (int)n_samples_size;
+      n_samples = view.byte_size / view.item_size;
  } else {
    if (TYPE(samples) == T_ARRAY) {
-      if (RARRAY_LEN(samples) > INT_MAX) {
-        rb_raise(rb_eArgError, "samples are too long");
-      }
-      n_samples = (int)RARRAY_LEN(samples);
+      n_samples = RARRAY_LEN(samples);
    } else if (rb_respond_to(samples, id_length)) {
      n_samples = NUM2INT(rb_funcall(samples, id_length, 0));
    } else {
@ -444,7 +387,7 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self)
      }
    }
  }
-  prepare_transcription(rwp, &self);
+  register_callbacks(rwp, &self);
  const int result = whisper_full_parallel(rw->context, rwp->params, c_samples, n_samples, n_processors);
  if (0 == result) {
    return self;
@ -463,7 +406,7 @@ static VALUE
 ruby_whisper_full_n_segments(VALUE self)
 {
   ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(self, ruby_whisper, rw);
   return INT2NUM(whisper_full_n_segments(rw->context));
 }
@ -477,7 +420,7 @@ static VALUE
 ruby_whisper_full_lang_id(VALUE self)
 {
   ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(self, ruby_whisper, rw);
   return INT2NUM(whisper_full_lang_id(rw->context));
 }
@ -502,10 +445,10 @@ static VALUE
 ruby_whisper_full_get_segment_t0(VALUE self, VALUE i_segment)
 {
   ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(self, ruby_whisper, rw);
   const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
   const int64_t t0 = whisper_full_get_segment_t0(rw->context, c_i_segment);
-  return LONG2NUM(t0);
+  return INT2NUM(t0);
 }
 /*
@ -520,10 +463,10 @@ static VALUE
 ruby_whisper_full_get_segment_t1(VALUE self, VALUE i_segment)
 {
   ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(self, ruby_whisper, rw);
   const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
   const int64_t t1 = whisper_full_get_segment_t1(rw->context, c_i_segment);
-  return LONG2NUM(t1);
+  return INT2NUM(t1);
 }
 /*
@ -538,7 +481,7 @@ static VALUE
 ruby_whisper_full_get_segment_speaker_turn_next(VALUE self, VALUE i_segment)
 {
   ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(self, ruby_whisper, rw);
   const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
   const bool speaker_turn_next = whisper_full_get_segment_speaker_turn_next(rw->context, c_i_segment);
   return speaker_turn_next ? Qtrue : Qfalse;
@ -556,7 +499,7 @@ static VALUE
 ruby_whisper_full_get_segment_text(VALUE self, VALUE i_segment)
 {
   ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(self, ruby_whisper, rw);
   const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
   const char * text = whisper_full_get_segment_text(rw->context, c_i_segment);
   return rb_str_new2(text);
@ -570,7 +513,7 @@ static VALUE
 ruby_whisper_full_get_segment_no_speech_prob(VALUE self, VALUE i_segment)
 {
   ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(self, ruby_whisper, rw);
   const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
   const float no_speech_prob = whisper_full_get_segment_no_speech_prob(rw->context, c_i_segment);
   return DBL2NUM(no_speech_prob);
@ -581,7 +524,7 @@ ruby_whisper_full_get_segment_no_speech_prob(VALUE self, VALUE i_segment)
 static VALUE
 ruby_whisper_full_get_segment(VALUE self, VALUE i_segment)
 {
-  return rb_whisper_segment_s_new(self, NUM2INT(i_segment));
+  return rb_whisper_segment_initialize(self, NUM2INT(i_segment));
 }
 /*
@ -611,11 +554,11 @@ ruby_whisper_each_segment(VALUE self)
   }
   ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(self, ruby_whisper, rw);
   const int n_segments = whisper_full_n_segments(rw->context);
   for (int i = 0; i < n_segments; ++i) {
-    rb_yield(rb_whisper_segment_s_new(self, i));
+    rb_yield(rb_whisper_segment_initialize(self, i));
   }
   return self;
@ -628,7 +571,7 @@ ruby_whisper_each_segment(VALUE self)
 static VALUE
 ruby_whisper_get_model(VALUE self)
 {
-  return rb_whisper_model_s_new(self);
+  return rb_whisper_model_initialize(self);
 }
 void
@ -636,8 +579,6 @@ init_ruby_whisper_context(VALUE *mWhisper)
 {
   cContext = rb_define_class_under(*mWhisper, "Context", rb_cObject);
-  transcribe_option_names[0] = id_n_processors;
   rb_define_alloc_func(cContext, ruby_whisper_allocate);
   rb_define_method(cContext, "initialize", ruby_whisper_initialize, -1);
@ -664,7 +605,7 @@ init_ruby_whisper_context(VALUE *mWhisper)
   rb_define_method(cContext, "full", ruby_whisper_full, -1);
   rb_define_method(cContext, "full_parallel", ruby_whisper_full_parallel, -1);
   // High level
   rb_define_method(cContext, "full_get_segment", ruby_whisper_full_get_segment, 1);
   rb_define_method(cContext, "each_segment", ruby_whisper_each_segment, 0);

View File

@ -1,44 +1,22 @@
 #include <ruby.h>
 #include "ruby_whisper.h"
-extern const rb_data_type_t ruby_whisper_type;
 extern VALUE cModel;
-static void rb_whisper_model_mark(void *p) {
-  ruby_whisper_model *rwm = (ruby_whisper_model *)p;
-  if (rwm->context) {
-    rb_gc_mark(rwm->context);
-  }
+static void rb_whisper_model_mark(ruby_whisper_model *rwm) {
+  rb_gc_mark(rwm->context);
 }
-static size_t
-ruby_whisper_model_memsize(const void *p)
-{
-  const ruby_whisper_model *rwm = (const ruby_whisper_model *)p;
-  size_t size = sizeof(rwm);
-  if (!rwm) {
-    return 0;
-  }
-  return size;
-}
-static const rb_data_type_t rb_whisper_model_type = {
-  "ruby_whisper_model",
-  {rb_whisper_model_mark, RUBY_DEFAULT_FREE, ruby_whisper_model_memsize,},
-  0, 0,
-  0
-};
 static VALUE ruby_whisper_model_allocate(VALUE klass) {
   ruby_whisper_model *rwm;
-  return TypedData_Make_Struct(klass, ruby_whisper_model, &rb_whisper_model_type, rwm);
+  rwm = ALLOC(ruby_whisper_model);
+  return Data_Wrap_Struct(klass, rb_whisper_model_mark, RUBY_DEFAULT_FREE, rwm);
 }
-VALUE rb_whisper_model_s_new(VALUE context) {
+VALUE rb_whisper_model_initialize(VALUE context) {
   ruby_whisper_model *rwm;
   const VALUE model = ruby_whisper_model_allocate(cModel);
-  TypedData_Get_Struct(model, ruby_whisper_model, &rb_whisper_model_type, rwm);
+  Data_Get_Struct(model, ruby_whisper_model, rwm);
   rwm->context = context;
   return model;
 };
@ -51,9 +29,9 @@ static VALUE
 ruby_whisper_model_n_vocab(VALUE self)
 {
   ruby_whisper_model *rwm;
-  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
+  Data_Get_Struct(self, ruby_whisper_model, rwm);
   ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(rwm->context, ruby_whisper, rw);
   return INT2NUM(whisper_model_n_vocab(rw->context));
 }
@ -65,9 +43,9 @@ static VALUE
 ruby_whisper_model_n_audio_ctx(VALUE self)
 {
   ruby_whisper_model *rwm;
-  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
+  Data_Get_Struct(self, ruby_whisper_model, rwm);
   ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(rwm->context, ruby_whisper, rw);
   return INT2NUM(whisper_model_n_audio_ctx(rw->context));
 }
@ -79,9 +57,9 @@ static VALUE
 ruby_whisper_model_n_audio_state(VALUE self)
 {
   ruby_whisper_model *rwm;
-  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
+  Data_Get_Struct(self, ruby_whisper_model, rwm);
   ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(rwm->context, ruby_whisper, rw);
   return INT2NUM(whisper_model_n_audio_state(rw->context));
 }
@ -93,9 +71,9 @@ static VALUE
 ruby_whisper_model_n_audio_head(VALUE self)
 {
   ruby_whisper_model *rwm;
-  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
+  Data_Get_Struct(self, ruby_whisper_model, rwm);
   ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(rwm->context, ruby_whisper, rw);
   return INT2NUM(whisper_model_n_audio_head(rw->context));
 }
@ -107,9 +85,9 @@ static VALUE
 ruby_whisper_model_n_audio_layer(VALUE self)
 {
   ruby_whisper_model *rwm;
-  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
+  Data_Get_Struct(self, ruby_whisper_model, rwm);
   ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(rwm->context, ruby_whisper, rw);
   return INT2NUM(whisper_model_n_audio_layer(rw->context));
 }
@ -121,9 +99,9 @@ static VALUE
 ruby_whisper_model_n_text_ctx(VALUE self)
 {
   ruby_whisper_model *rwm;
-  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
+  Data_Get_Struct(self, ruby_whisper_model, rwm);
   ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(rwm->context, ruby_whisper, rw);
   return INT2NUM(whisper_model_n_text_ctx(rw->context));
 }
@ -135,9 +113,9 @@ static VALUE
 ruby_whisper_model_n_text_state(VALUE self)
 {
   ruby_whisper_model *rwm;
-  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
+  Data_Get_Struct(self, ruby_whisper_model, rwm);
   ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(rwm->context, ruby_whisper, rw);
   return INT2NUM(whisper_model_n_text_state(rw->context));
 }
@ -149,9 +127,9 @@ static VALUE
 ruby_whisper_model_n_text_head(VALUE self)
 {
   ruby_whisper_model *rwm;
-  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
+  Data_Get_Struct(self, ruby_whisper_model, rwm);
   ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(rwm->context, ruby_whisper, rw);
   return INT2NUM(whisper_model_n_text_head(rw->context));
 }
@ -163,9 +141,9 @@ static VALUE
 ruby_whisper_model_n_text_layer(VALUE self)
 {
   ruby_whisper_model *rwm;
-  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
+  Data_Get_Struct(self, ruby_whisper_model, rwm);
   ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(rwm->context, ruby_whisper, rw);
   return INT2NUM(whisper_model_n_text_layer(rw->context));
 }
@ -177,9 +155,9 @@ static VALUE
 ruby_whisper_model_n_mels(VALUE self)
 {
   ruby_whisper_model *rwm;
-  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
+  Data_Get_Struct(self, ruby_whisper_model, rwm);
   ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(rwm->context, ruby_whisper, rw);
   return INT2NUM(whisper_model_n_mels(rw->context));
 }
@ -191,9 +169,9 @@ static VALUE
 ruby_whisper_model_ftype(VALUE self)
 {
   ruby_whisper_model *rwm;
-  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
+  Data_Get_Struct(self, ruby_whisper_model, rwm);
   ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(rwm->context, ruby_whisper, rw);
   return INT2NUM(whisper_model_ftype(rw->context));
 }
@ -205,9 +183,9 @@ static VALUE
 ruby_whisper_model_type(VALUE self)
 {
   ruby_whisper_model *rwm;
-  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
+  Data_Get_Struct(self, ruby_whisper_model, rwm);
   ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  Data_Get_Struct(rwm->context, ruby_whisper, rw);
   return rb_str_new2(whisper_model_type_readable(rw->context));
 }

View File

@ -3,7 +3,7 @@
 #define BOOL_PARAMS_SETTER(self, prop, value) \
   ruby_whisper_params *rwp; \
-  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); \
+  Data_Get_Struct(self, ruby_whisper_params, rwp); \
   if (value == Qfalse || value == Qnil) { \
     rwp->params.prop = false; \
   } else { \
@ -13,7 +13,7 @@
 #define BOOL_PARAMS_GETTER(self, prop) \
   ruby_whisper_params *rwp; \
-  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); \
+  Data_Get_Struct(self, ruby_whisper_params, rwp); \
   if (rwp->params.prop) { \
     return Qtrue; \
   } else { \
@ -26,16 +26,13 @@
   rb_define_method(cParams, #param_name, ruby_whisper_params_get_ ## param_name, 0); \
   rb_define_method(cParams, #param_name "=", ruby_whisper_params_set_ ## param_name, 1);
-#define RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT 35
+#define RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT 32
 extern VALUE cParams;
-extern VALUE cVADParams;
 extern ID id_call;
-extern VALUE ruby_whisper_normalize_model_path(VALUE model_path);
-extern VALUE rb_whisper_segment_s_new(VALUE context, int index);
-extern const rb_data_type_t ruby_whisper_vad_params_type;
+extern VALUE rb_whisper_segment_initialize(VALUE context, int index);
 static ID param_names[RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT];
 static ID id_language;
@ -70,15 +67,10 @@ static ID id_encoder_begin_callback;
 static ID id_encoder_begin_callback_user_data;
 static ID id_abort_callback;
 static ID id_abort_callback_user_data;
-static ID id_vad;
-static ID id_vad_model_path;
-static ID id_vad_params;
 static void
 rb_whisper_callbcack_container_mark(ruby_whisper_callback_container *rwc)
 {
-  if (rwc == NULL) return;
   rb_gc_mark(rwc->user_data);
   rb_gc_mark(rwc->callback);
   rb_gc_mark(rwc->callbacks);
@ -110,7 +102,7 @@ static void new_segment_callback(struct whisper_context *ctx, struct whisper_sta
   const int n_segments = whisper_full_n_segments_from_state(state);
   for (int i = n_new; i > 0; i--) {
     int i_segment = n_segments - i;
-    VALUE segment = rb_whisper_segment_s_new(*container->context, i_segment);
+    VALUE segment = rb_whisper_segment_initialize(*container->context, i_segment);
     for (int j = 0; j < callbacks_len; j++) {
       VALUE cb = rb_ary_entry(container->callbacks, j);
       rb_funcall(cb, id_call, 1, segment);
@ -185,7 +177,7 @@ static bool abort_callback(void * user_data) {
   return false;
 }
-static void register_callbacks(ruby_whisper_params * rwp, VALUE * context) {
+void register_callbacks(ruby_whisper_params * rwp, VALUE * context) {
   if (!NIL_P(rwp->new_segment_callback_container->callback) || 0 != RARRAY_LEN(rwp->new_segment_callback_container->callbacks)) {
     rwp->new_segment_callback_container->context = context;
     rwp->params.new_segment_callback = new_segment_callback;
@ -211,29 +203,13 @@ static void register_callbacks(ruby_whisper_params * rwp, VALUE * context) {
   }
 }
-static void set_vad_params(ruby_whisper_params *rwp)
-{
-  ruby_whisper_vad_params * rwvp;
-  TypedData_Get_Struct(rwp->vad_params, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
-  rwp->params.vad_params = rwvp->params;
-}
 void
-prepare_transcription(ruby_whisper_params *rwp, VALUE *context)
+rb_whisper_params_mark(ruby_whisper_params *rwp)
 {
-  register_callbacks(rwp, context);
-  set_vad_params(rwp);
-}
-void
-rb_whisper_params_mark(void *p)
-{
-  ruby_whisper_params *rwp = (ruby_whisper_params *)p;
   rb_whisper_callbcack_container_mark(rwp->new_segment_callback_container);
   rb_whisper_callbcack_container_mark(rwp->progress_callback_container);
   rb_whisper_callbcack_container_mark(rwp->encoder_begin_callback_container);
   rb_whisper_callbcack_container_mark(rwp->abort_callback_container);
-  rb_gc_mark(rwp->vad_params);
 }
 void
@ -242,46 +218,25 @@ ruby_whisper_params_free(ruby_whisper_params *rwp)
 }
 void
-rb_whisper_params_free(void *p)
+rb_whisper_params_free(ruby_whisper_params *rwp)
 {
-  ruby_whisper_params *rwp = (ruby_whisper_params *)p;
   // How to free user_data and callback only when not referred to by others?
   ruby_whisper_params_free(rwp);
   free(rwp);
 }
-static size_t
-ruby_whisper_params_memsize(const void *p)
-{
-  const ruby_whisper_params *rwp = (const ruby_whisper_params *)p;
-  return sizeof(ruby_whisper_params) + sizeof(rwp->params) + sizeof(rwp->vad_params);
-}
-const rb_data_type_t ruby_whisper_params_type = {
-  "ruby_whisper_params",
-  {
-    rb_whisper_params_mark,
-    rb_whisper_params_free,
-    ruby_whisper_params_memsize,
-  },
-  0, 0,
-  0
-};
 static VALUE
 ruby_whisper_params_allocate(VALUE klass)
 {
   ruby_whisper_params *rwp;
-  VALUE obj = TypedData_Make_Struct(klass, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  rwp = ALLOC(ruby_whisper_params);
   rwp->params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
   rwp->diarize = false;
-  rwp->vad_params = TypedData_Wrap_Struct(cVADParams, &ruby_whisper_vad_params_type, (void *)&rwp->params.vad_params);
   rwp->new_segment_callback_container = rb_whisper_callback_container_allocate();
   rwp->progress_callback_container = rb_whisper_callback_container_allocate();
   rwp->encoder_begin_callback_container = rb_whisper_callback_container_allocate();
   rwp->abort_callback_container = rb_whisper_callback_container_allocate();
-  return obj;
+  return Data_Wrap_Struct(klass, rb_whisper_params_mark, rb_whisper_params_free, rwp);
 }
 /*
@ -294,7 +249,7 @@ static VALUE
 ruby_whisper_params_set_language(VALUE self, VALUE value)
 {
   ruby_whisper_params *rwp;
-  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  Data_Get_Struct(self, ruby_whisper_params, rwp);
   if (value == Qfalse || value == Qnil) {
     rwp->params.language = "auto";
   } else {
@ -310,7 +265,7 @@ static VALUE
 ruby_whisper_params_get_language(VALUE self)
 {
   ruby_whisper_params *rwp;
-  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  Data_Get_Struct(self, ruby_whisper_params, rwp);
   if (rwp->params.language) {
     return rb_str_new2(rwp->params.language);
   } else {
@ -547,7 +502,7 @@ static VALUE
 ruby_whisper_params_get_initial_prompt(VALUE self)
 {
   ruby_whisper_params *rwp;
-  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  Data_Get_Struct(self, ruby_whisper_params, rwp);
   return rwp->params.initial_prompt == NULL ? Qnil : rb_str_new2(rwp->params.initial_prompt);
 }
 /*
@ -558,7 +513,7 @@ static VALUE
 ruby_whisper_params_set_initial_prompt(VALUE self, VALUE value)
 {
   ruby_whisper_params *rwp;
-  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  Data_Get_Struct(self, ruby_whisper_params, rwp);
   rwp->params.initial_prompt = StringValueCStr(value);
   return value;
 }
@ -572,7 +527,7 @@ static VALUE
 ruby_whisper_params_get_diarize(VALUE self)
 {
   ruby_whisper_params *rwp;
-  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  Data_Get_Struct(self, ruby_whisper_params, rwp);
   if (rwp->diarize) {
     return Qtrue;
   } else {
@ -587,7 +542,7 @@ static VALUE
 ruby_whisper_params_set_diarize(VALUE self, VALUE value)
 {
   ruby_whisper_params *rwp;
-  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  Data_Get_Struct(self, ruby_whisper_params, rwp);
   if (value == Qfalse || value == Qnil) {
     rwp->diarize = false;
   } else {
@ -606,7 +561,7 @@ static VALUE
 ruby_whisper_params_get_offset(VALUE self)
 {
   ruby_whisper_params *rwp;
-  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  Data_Get_Struct(self, ruby_whisper_params, rwp);
   return INT2NUM(rwp->params.offset_ms);
 }
 /*
@ -617,7 +572,7 @@ static VALUE
 ruby_whisper_params_set_offset(VALUE self, VALUE value)
 {
   ruby_whisper_params *rwp;
-  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  Data_Get_Struct(self, ruby_whisper_params, rwp);
   rwp->params.offset_ms = NUM2INT(value);
   return value;
 }
@ -631,7 +586,7 @@ static VALUE
 ruby_whisper_params_get_duration(VALUE self)
 {
   ruby_whisper_params *rwp;
-  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  Data_Get_Struct(self, ruby_whisper_params, rwp);
   return INT2NUM(rwp->params.duration_ms);
 }
 /*
@ -642,7 +597,7 @@ static VALUE
 ruby_whisper_params_set_duration(VALUE self, VALUE value)
 {
   ruby_whisper_params *rwp;
-  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  Data_Get_Struct(self, ruby_whisper_params, rwp);
   rwp->params.duration_ms = NUM2INT(value);
   return value;
 }
@ -657,7 +612,7 @@ static VALUE
 ruby_whisper_params_get_max_text_tokens(VALUE self)
 {
   ruby_whisper_params *rwp;
-  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  Data_Get_Struct(self, ruby_whisper_params, rwp);
   return INT2NUM(rwp->params.n_max_text_ctx);
 }
 /*
@ -668,7 +623,7 @@ static VALUE
 ruby_whisper_params_set_max_text_tokens(VALUE self, VALUE value)
 {
   ruby_whisper_params *rwp;
-  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  Data_Get_Struct(self, ruby_whisper_params, rwp);
   rwp->params.n_max_text_ctx = NUM2INT(value);
   return value;
 }
@ -680,7 +635,7 @@ static VALUE
 ruby_whisper_params_get_temperature(VALUE self)
 {
   ruby_whisper_params *rwp;
-  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  Data_Get_Struct(self, ruby_whisper_params, rwp);
   return DBL2NUM(rwp->params.temperature);
 }
 /*
@ -691,7 +646,7 @@ static VALUE
 ruby_whisper_params_set_temperature(VALUE self, VALUE value)
 {
   ruby_whisper_params *rwp;
-  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  Data_Get_Struct(self, ruby_whisper_params, rwp);
   rwp->params.temperature = RFLOAT_VALUE(value);
   return value;
 }
@ -705,7 +660,7 @@ static VALUE
 ruby_whisper_params_get_max_initial_ts(VALUE self)
 {
   ruby_whisper_params *rwp;
-  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  Data_Get_Struct(self, ruby_whisper_params, rwp);
   return DBL2NUM(rwp->params.max_initial_ts);
 }
 /*
@ -716,7 +671,7 @@ static VALUE
 ruby_whisper_params_set_max_initial_ts(VALUE self, VALUE value)
 {
   ruby_whisper_params *rwp;
-  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  Data_Get_Struct(self, ruby_whisper_params, rwp);
   rwp->params.max_initial_ts = RFLOAT_VALUE(value);
   return value;
 }
@ -728,7 +683,7 @@ static VALUE
 ruby_whisper_params_get_length_penalty(VALUE self)
 {
   ruby_whisper_params *rwp;
-  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  Data_Get_Struct(self, ruby_whisper_params, rwp);
   return DBL2NUM(rwp->params.length_penalty);
 }
 /*
@ -739,7 +694,7 @@ static VALUE
 ruby_whisper_params_set_length_penalty(VALUE self, VALUE value)
 {
   ruby_whisper_params *rwp;
-  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  Data_Get_Struct(self, ruby_whisper_params, rwp);
   rwp->params.length_penalty = RFLOAT_VALUE(value);
   return value;
 }
@ -751,7 +706,7 @@ static VALUE
 ruby_whisper_params_get_temperature_inc(VALUE self)
 {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
return DBL2NUM(rwp->params.temperature_inc); return DBL2NUM(rwp->params.temperature_inc);
} }
/* /*
@ -762,7 +717,7 @@ static VALUE
ruby_whisper_params_set_temperature_inc(VALUE self, VALUE value) ruby_whisper_params_set_temperature_inc(VALUE self, VALUE value)
{ {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
rwp->params.temperature_inc = RFLOAT_VALUE(value); rwp->params.temperature_inc = RFLOAT_VALUE(value);
return value; return value;
} }
@ -776,7 +731,7 @@ static VALUE
ruby_whisper_params_get_entropy_thold(VALUE self) ruby_whisper_params_get_entropy_thold(VALUE self)
{ {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
return DBL2NUM(rwp->params.entropy_thold); return DBL2NUM(rwp->params.entropy_thold);
} }
/* /*
@ -787,7 +742,7 @@ static VALUE
ruby_whisper_params_set_entropy_thold(VALUE self, VALUE value) ruby_whisper_params_set_entropy_thold(VALUE self, VALUE value)
{ {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
rwp->params.entropy_thold = RFLOAT_VALUE(value); rwp->params.entropy_thold = RFLOAT_VALUE(value);
return value; return value;
} }
@ -799,7 +754,7 @@ static VALUE
ruby_whisper_params_get_logprob_thold(VALUE self) ruby_whisper_params_get_logprob_thold(VALUE self)
{ {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
return DBL2NUM(rwp->params.logprob_thold); return DBL2NUM(rwp->params.logprob_thold);
} }
/* /*
@ -810,7 +765,7 @@ static VALUE
ruby_whisper_params_set_logprob_thold(VALUE self, VALUE value) ruby_whisper_params_set_logprob_thold(VALUE self, VALUE value)
{ {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
rwp->params.logprob_thold = RFLOAT_VALUE(value); rwp->params.logprob_thold = RFLOAT_VALUE(value);
return value; return value;
} }
@ -822,7 +777,7 @@ static VALUE
ruby_whisper_params_get_no_speech_thold(VALUE self) ruby_whisper_params_get_no_speech_thold(VALUE self)
{ {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
return DBL2NUM(rwp->params.no_speech_thold); return DBL2NUM(rwp->params.no_speech_thold);
} }
/* /*
@ -833,7 +788,7 @@ static VALUE
ruby_whisper_params_set_no_speech_thold(VALUE self, VALUE value) ruby_whisper_params_set_no_speech_thold(VALUE self, VALUE value)
{ {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
rwp->params.no_speech_thold = RFLOAT_VALUE(value); rwp->params.no_speech_thold = RFLOAT_VALUE(value);
return value; return value;
} }
@ -841,7 +796,7 @@ static VALUE
ruby_whisper_params_get_new_segment_callback(VALUE self) ruby_whisper_params_get_new_segment_callback(VALUE self)
{ {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
return rwp->new_segment_callback_container->callback; return rwp->new_segment_callback_container->callback;
} }
/* /*
@ -858,7 +813,7 @@ static VALUE
ruby_whisper_params_set_new_segment_callback(VALUE self, VALUE value) ruby_whisper_params_set_new_segment_callback(VALUE self, VALUE value)
{ {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
rwp->new_segment_callback_container->callback = value; rwp->new_segment_callback_container->callback = value;
return value; return value;
} }
@ -866,7 +821,7 @@ static VALUE
ruby_whisper_params_get_new_segment_callback_user_data(VALUE self) ruby_whisper_params_get_new_segment_callback_user_data(VALUE self)
{ {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
return rwp->new_segment_callback_container->user_data; return rwp->new_segment_callback_container->user_data;
} }
/* /*
@ -879,7 +834,7 @@ static VALUE
ruby_whisper_params_set_new_segment_callback_user_data(VALUE self, VALUE value) ruby_whisper_params_set_new_segment_callback_user_data(VALUE self, VALUE value)
{ {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
rwp->new_segment_callback_container->user_data = value; rwp->new_segment_callback_container->user_data = value;
return value; return value;
} }
@ -887,7 +842,7 @@ static VALUE
ruby_whisper_params_get_progress_callback(VALUE self) ruby_whisper_params_get_progress_callback(VALUE self)
{ {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
return rwp->progress_callback_container->callback; return rwp->progress_callback_container->callback;
} }
/* /*
@ -906,7 +861,7 @@ static VALUE
ruby_whisper_params_set_progress_callback(VALUE self, VALUE value) ruby_whisper_params_set_progress_callback(VALUE self, VALUE value)
{ {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
rwp->progress_callback_container->callback = value; rwp->progress_callback_container->callback = value;
return value; return value;
} }
@ -914,7 +869,7 @@ static VALUE
ruby_whisper_params_get_progress_callback_user_data(VALUE self) ruby_whisper_params_get_progress_callback_user_data(VALUE self)
{ {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
return rwp->progress_callback_container->user_data; return rwp->progress_callback_container->user_data;
} }
/* /*
@ -927,7 +882,7 @@ static VALUE
ruby_whisper_params_set_progress_callback_user_data(VALUE self, VALUE value) ruby_whisper_params_set_progress_callback_user_data(VALUE self, VALUE value)
{ {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
rwp->progress_callback_container->user_data = value; rwp->progress_callback_container->user_data = value;
return value; return value;
} }
@ -936,7 +891,7 @@ static VALUE
ruby_whisper_params_get_encoder_begin_callback(VALUE self) ruby_whisper_params_get_encoder_begin_callback(VALUE self)
{ {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
return rwp->encoder_begin_callback_container->callback; return rwp->encoder_begin_callback_container->callback;
} }
@ -954,7 +909,7 @@ static VALUE
ruby_whisper_params_set_encoder_begin_callback(VALUE self, VALUE value) ruby_whisper_params_set_encoder_begin_callback(VALUE self, VALUE value)
{ {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
rwp->encoder_begin_callback_container->callback = value; rwp->encoder_begin_callback_container->callback = value;
return value; return value;
} }
@ -963,7 +918,7 @@ static VALUE
ruby_whisper_params_get_encoder_begin_callback_user_data(VALUE self) ruby_whisper_params_get_encoder_begin_callback_user_data(VALUE self)
{ {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
return rwp->encoder_begin_callback_container->user_data; return rwp->encoder_begin_callback_container->user_data;
} }
@ -977,7 +932,7 @@ static VALUE
ruby_whisper_params_set_encoder_begin_callback_user_data(VALUE self, VALUE value) ruby_whisper_params_set_encoder_begin_callback_user_data(VALUE self, VALUE value)
{ {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
rwp->encoder_begin_callback_container->user_data = value; rwp->encoder_begin_callback_container->user_data = value;
return value; return value;
} }
@ -986,7 +941,7 @@ static VALUE
ruby_whisper_params_get_abort_callback(VALUE self) ruby_whisper_params_get_abort_callback(VALUE self)
{ {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
return rwp->abort_callback_container->callback; return rwp->abort_callback_container->callback;
} }
/* /*
@ -1003,7 +958,7 @@ static VALUE
ruby_whisper_params_set_abort_callback(VALUE self, VALUE value) ruby_whisper_params_set_abort_callback(VALUE self, VALUE value)
{ {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
rwp->abort_callback_container->callback = value; rwp->abort_callback_container->callback = value;
return value; return value;
} }
@ -1011,7 +966,7 @@ static VALUE
ruby_whisper_params_get_abort_callback_user_data(VALUE self) ruby_whisper_params_get_abort_callback_user_data(VALUE self)
{ {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
return rwp->abort_callback_container->user_data; return rwp->abort_callback_container->user_data;
} }
/* /*
@ -1024,74 +979,11 @@ static VALUE
ruby_whisper_params_set_abort_callback_user_data(VALUE self, VALUE value) ruby_whisper_params_set_abort_callback_user_data(VALUE self, VALUE value)
{ {
ruby_whisper_params *rwp; ruby_whisper_params *rwp;
TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); Data_Get_Struct(self, ruby_whisper_params, rwp);
rwp->abort_callback_container->user_data = value; rwp->abort_callback_container->user_data = value;
return value; return value;
} }
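The change repeated throughout this file swaps Ruby's typed-data API for the older untyped Data_* API. For readers unfamiliar with the distinction, a minimal sketch of what the typed side involves (illustrative names, not the gem's actual code):

#include <ruby.h>

typedef struct { int n; } my_struct;

/* The type tag: its name shows up in error messages, and the registered
 * memsize callback lets ObjectSpace.memsize_of report native allocations. */
static const rb_data_type_t my_struct_type = {
    "my_struct",
    {0 /* mark */, RUBY_DEFAULT_FREE, 0 /* memsize */,},
    0, 0,
    0
};

static VALUE
my_struct_alloc(VALUE klass)
{
    my_struct *p;
    /* typed: allocate and wrap in one call, tied to the type tag above */
    return TypedData_Make_Struct(klass, my_struct, &my_struct_type, p);
}

static VALUE
my_struct_n(VALUE self)
{
    my_struct *p;
    /* typed unwrap raises TypeError if self wraps something else; the
     * untyped equivalent would be Data_Get_Struct(self, my_struct, p) */
    TypedData_Get_Struct(self, my_struct, &my_struct_type, p);
    return INT2NUM(p->n);
}

TypedData_Get_Struct type-checks every unwrap; Data_Get_Struct trusts the caller, which is why it needs no rb_data_type_t argument.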
-/*
-* call-seq:
-* vad = use_vad -> use_vad
-*/
-static VALUE
-ruby_whisper_params_get_vad(VALUE self)
-{
-BOOL_PARAMS_GETTER(self, vad)
-}
-static VALUE
-ruby_whisper_params_set_vad(VALUE self, VALUE value)
-{
-BOOL_PARAMS_SETTER(self, vad, value)
-}
-/*
-* call-seq:
-* vad_model_path = model_path -> model_path
-*/
-static VALUE
-ruby_whisper_params_set_vad_model_path(VALUE self, VALUE value)
-{
-ruby_whisper_params *rwp;
-TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
-if (NIL_P(value)) {
-rwp->params.vad_model_path = NULL;
-return value;
-}
-VALUE path = ruby_whisper_normalize_model_path(value);
-rwp->params.vad_model_path = StringValueCStr(path);
-return value;
-}
-static VALUE
-ruby_whisper_params_get_vad_model_path(VALUE self)
-{
-ruby_whisper_params *rwp;
-TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
-return rwp->params.vad_model_path == NULL ? Qnil : rb_str_new2(rwp->params.vad_model_path);
-}
-/*
-* call-seq:
-* vad_params = params -> params
-*/
-static VALUE
-ruby_whisper_params_set_vad_params(VALUE self, VALUE value)
-{
-ruby_whisper_params *rwp;
-TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
-rwp->vad_params = value;
-return value;
-}
-static VALUE
-ruby_whisper_params_get_vad_params(VALUE self)
-{
-ruby_whisper_params *rwp;
-TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
-return rwp->vad_params;
-}
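The vad accessors above lean on BOOL_PARAMS_GETTER/BOOL_PARAMS_SETTER, macros defined earlier in this file and not shown in the diff. A plausible reconstruction, assuming they follow the same unwrap-then-access pattern as the hand-written accessors around them:

/* Hypothetical reconstruction -- the real macros are defined outside this
 * hunk. They expand to an unwrap of the params wrapper followed by a
 * boolean read or write of the named whisper_full_params field. */
#define BOOL_PARAMS_GETTER(self, param_name) \
    ruby_whisper_params *rwp; \
    TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); \
    return rwp->params.param_name ? Qtrue : Qfalse;

#define BOOL_PARAMS_SETTER(self, param_name, value) \
    ruby_whisper_params *rwp; \
    TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); \
    rwp->params.param_name = !(value == Qfalse || value == Qnil); \
    return value;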
#define SET_PARAM_IF_SAME(param_name) \
if (id == id_ ## param_name) { \
ruby_whisper_params_set_ ## param_name(self, value); \
@@ -1101,6 +993,7 @@ ruby_whisper_params_get_vad_params(VALUE self)
static VALUE
ruby_whisper_params_initialize(int argc, VALUE *argv, VALUE self)
{
VALUE kw_hash;
VALUE values[RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT] = {Qundef};
VALUE value;
@@ -1114,7 +1007,7 @@ ruby_whisper_params_initialize(int argc, VALUE *argv, VALUE self)
}
rb_get_kwargs(kw_hash, param_names, 0, RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT, values);
-TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+Data_Get_Struct(self, ruby_whisper_params, rwp);
for (i = 0; i < RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT; i++) {
id = param_names[i];
@@ -1157,9 +1050,6 @@ ruby_whisper_params_initialize(int argc, VALUE *argv, VALUE self)
SET_PARAM_IF_SAME(encoder_begin_callback_user_data)
SET_PARAM_IF_SAME(abort_callback)
SET_PARAM_IF_SAME(abort_callback_user_data)
-SET_PARAM_IF_SAME(vad)
-SET_PARAM_IF_SAME(vad_model_path)
-SET_PARAM_IF_SAME(vad_params)
}
}
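For context, Params#initialize above is driven by Ruby's keyword-argument C API: rb_scan_args_kw collects the keyword hash, and rb_get_kwargs fills a values array, leaving Qundef for anything the caller omitted so the whisper defaults survive. A self-contained sketch of that flow with illustrative names:

#include <ruby.h>

static VALUE
example_initialize(int argc, VALUE *argv, VALUE self)
{
    VALUE kw_hash, value;
    VALUE values[2] = {Qundef, Qundef};
    ID names[2];
    names[0] = rb_intern("language");
    names[1] = rb_intern("translate");
    /* ":" means keywords only, no positional arguments */
    rb_scan_args_kw(RB_SCAN_ARGS_KEYWORDS, argc, argv, ":", &kw_hash);
    if (NIL_P(kw_hash)) {
        return self;
    }
    rb_get_kwargs(kw_hash, names, 0, 2, values); /* 0 required, 2 optional */
    value = values[0];
    if (value != Qundef) { /* language: was passed; apply it */ }
    value = values[1];
    if (value != Qundef) { /* translate: was passed; apply it */ }
    return self;
}

Unknown keywords make rb_get_kwargs raise ArgumentError, which is what the deleted VAD params test below relies on.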
@@ -1181,10 +1071,10 @@ ruby_whisper_params_initialize(int argc, VALUE *argv, VALUE self)
static VALUE
ruby_whisper_params_on_new_segment(VALUE self)
{
-ruby_whisper_params *rwp;
-TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+ruby_whisper_params *rws;
+Data_Get_Struct(self, ruby_whisper_params, rws);
const VALUE blk = rb_block_proc();
-rb_ary_push(rwp->new_segment_callback_container->callbacks, blk);
+rb_ary_push(rws->new_segment_callback_container->callbacks, blk);
return Qnil;
}
@@ -1201,10 +1091,10 @@ ruby_whisper_params_on_new_segment(VALUE self)
static VALUE
ruby_whisper_params_on_progress(VALUE self)
{
-ruby_whisper_params *rwp;
-TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+ruby_whisper_params *rws;
+Data_Get_Struct(self, ruby_whisper_params, rws);
const VALUE blk = rb_block_proc();
-rb_ary_push(rwp->progress_callback_container->callbacks, blk);
+rb_ary_push(rws->progress_callback_container->callbacks, blk);
return Qnil;
}
@@ -1221,10 +1111,10 @@ ruby_whisper_params_on_progress(VALUE self)
static VALUE
ruby_whisper_params_on_encoder_begin(VALUE self)
{
-ruby_whisper_params *rwp;
-TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+ruby_whisper_params *rws;
+Data_Get_Struct(self, ruby_whisper_params, rws);
const VALUE blk = rb_block_proc();
-rb_ary_push(rwp->encoder_begin_callback_container->callbacks, blk);
+rb_ary_push(rws->encoder_begin_callback_container->callbacks, blk);
return Qnil;
}
@@ -1245,10 +1135,10 @@ ruby_whisper_params_on_encoder_begin(VALUE self)
static VALUE
ruby_whisper_params_abort_on(VALUE self)
{
-ruby_whisper_params *rwp;
-TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+ruby_whisper_params *rws;
+Data_Get_Struct(self, ruby_whisper_params, rws);
const VALUE blk = rb_block_proc();
-rb_ary_push(rwp->abort_callback_container->callbacks, blk);
+rb_ary_push(rws->abort_callback_container->callbacks, blk);
return Qnil;
}
@@ -1292,9 +1182,6 @@ init_ruby_whisper_params(VALUE *mWhisper)
DEFINE_PARAM(encoder_begin_callback_user_data, 29)
DEFINE_PARAM(abort_callback, 30)
DEFINE_PARAM(abort_callback_user_data, 31)
-DEFINE_PARAM(vad, 32)
-DEFINE_PARAM(vad_model_path, 33)
-DEFINE_PARAM(vad_params, 34)
rb_define_method(cParams, "on_new_segment", ruby_whisper_params_on_new_segment, 0);
rb_define_method(cParams, "on_progress", ruby_whisper_params_on_progress, 0);


@@ -1,57 +1,28 @@
#include <ruby.h>
#include "ruby_whisper.h"
-#define N_KEY_NAMES 5
-static VALUE sym_start_time;
-static VALUE sym_end_time;
-static VALUE sym_text;
-static VALUE sym_no_speech_prob;
-static VALUE sym_speaker_turn_next;
-static VALUE key_names;
-extern const rb_data_type_t ruby_whisper_type;
extern VALUE cSegment;
static void
-rb_whisper_segment_mark(void *p)
+rb_whisper_segment_mark(ruby_whisper_segment *rws)
{
-ruby_whisper_segment *rws = (ruby_whisper_segment *)p;
rb_gc_mark(rws->context);
}
-static size_t
-ruby_whisper_segment_memsize(const void *p)
-{
-const ruby_whisper_segment *rws = (const ruby_whisper_segment *)p;
-size_t size = sizeof(rws);
-if (!rws) {
-return 0;
-}
-return size;
-}
-static const rb_data_type_t ruby_whisper_segment_type = {
-"ruby_whisper_segment",
-{rb_whisper_segment_mark, RUBY_DEFAULT_FREE, ruby_whisper_segment_memsize,},
-0, 0,
-0
-};
VALUE
ruby_whisper_segment_allocate(VALUE klass)
{
ruby_whisper_segment *rws;
-return TypedData_Make_Struct(klass, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
+rws = ALLOC(ruby_whisper_segment);
+return Data_Wrap_Struct(klass, rb_whisper_segment_mark, RUBY_DEFAULT_FREE, rws);
}
VALUE
-rb_whisper_segment_s_new(VALUE context, int index)
+rb_whisper_segment_initialize(VALUE context, int index)
{
ruby_whisper_segment *rws;
const VALUE segment = ruby_whisper_segment_allocate(cSegment);
-TypedData_Get_Struct(segment, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
+Data_Get_Struct(segment, ruby_whisper_segment, rws);
rws->context = context;
rws->index = index;
return segment;
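Two details in the hunk above are worth spelling out. The mark function exists because a Segment stores its owning Context only as a VALUE inside C memory, which the GC cannot see on its own; marking it each cycle keeps the Context alive while any Segment still references it. Also note that the removed memsize callback computes sizeof(rws), the size of a pointer, where the struct size was presumably intended; a corrected sketch:

static size_t
example_memsize(const void *p)
{
    /* report the bytes owned by the wrapped struct, not the pointer;
     * this is the number ObjectSpace.memsize_of adds for the native part */
    return p ? sizeof(ruby_whisper_segment) : 0;
}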
@@ -67,12 +38,12 @@ static VALUE
ruby_whisper_segment_get_start_time(VALUE self)
{
ruby_whisper_segment *rws;
-TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
+Data_Get_Struct(self, ruby_whisper_segment, rws);
ruby_whisper *rw;
-TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
+Data_Get_Struct(rws->context, ruby_whisper, rw);
const int64_t t0 = whisper_full_get_segment_t0(rw->context, rws->index);
// able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
-return LONG2NUM(t0 * 10);
+return INT2NUM(t0 * 10);
}
/*
@@ -85,12 +56,12 @@ static VALUE
ruby_whisper_segment_get_end_time(VALUE self)
{
ruby_whisper_segment *rws;
-TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
+Data_Get_Struct(self, ruby_whisper_segment, rws);
ruby_whisper *rw;
-TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
+Data_Get_Struct(rws->context, ruby_whisper, rw);
const int64_t t1 = whisper_full_get_segment_t1(rw->context, rws->index);
// able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
-return LONG2NUM(t1 * 10);
+return INT2NUM(t1 * 10);
}
/*
@@ -103,9 +74,9 @@ static VALUE
ruby_whisper_segment_get_speaker_turn_next(VALUE self)
{
ruby_whisper_segment *rws;
-TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
+Data_Get_Struct(self, ruby_whisper_segment, rws);
ruby_whisper *rw;
-TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
+Data_Get_Struct(rws->context, ruby_whisper, rw);
return whisper_full_get_segment_speaker_turn_next(rw->context, rws->index) ? Qtrue : Qfalse;
}
@@ -117,9 +88,9 @@ static VALUE
ruby_whisper_segment_get_text(VALUE self)
{
ruby_whisper_segment *rws;
-TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
+Data_Get_Struct(self, ruby_whisper_segment, rws);
ruby_whisper *rw;
-TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
+Data_Get_Struct(rws->context, ruby_whisper, rw);
const char * text = whisper_full_get_segment_text(rw->context, rws->index);
return rb_str_new2(text);
}
@@ -132,89 +103,21 @@ static VALUE
ruby_whisper_segment_get_no_speech_prob(VALUE self)
{
ruby_whisper_segment *rws;
-TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
+Data_Get_Struct(self, ruby_whisper_segment, rws);
ruby_whisper *rw;
-TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
+Data_Get_Struct(rws->context, ruby_whisper, rw);
return DBL2NUM(whisper_full_get_segment_no_speech_prob(rw->context, rws->index));
}
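All of the timestamp accessors above share one conversion: whisper_full_get_segment_t0/t1 return centiseconds, and multiplying by 10 yields the milliseconds the Ruby API exposes. The macro swap is the only real difference between the two sides; a sketch of the conversion:

#include <stdint.h>

static VALUE
segment_start_ms(struct whisper_context *ctx, int i)
{
    const int64_t t0 = whisper_full_get_segment_t0(ctx, i); /* centiseconds */
    /* LONG2NUM accepts a wider range than INT2NUM, which would truncate
     * once t0 * 10 exceeds INT_MAX -- roughly 25 days of audio, so the
     * risk is mostly theoretical, but the wider macro is the safer pick */
    return LONG2NUM(t0 * 10);
}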
-/*
-* call-seq:
-* deconstruct_keys(keys) -> hash
-*
-* Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
-*
-* whisper.each_segment do |segment|
-* segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
-*
-* puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
-* end
-*/
-static VALUE
-ruby_whisper_segment_deconstruct_keys(VALUE self, VALUE keys)
-{
-ruby_whisper_segment *rws;
-TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
-ruby_whisper *rw;
-TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
-VALUE hash = rb_hash_new();
-long n_keys;
-if (NIL_P(keys)) {
-keys = key_names;
-n_keys = N_KEY_NAMES;
-} else {
-n_keys = RARRAY_LEN(keys);
-if (n_keys > N_KEY_NAMES) {
-return hash;
-}
-}
-for (int i = 0; i < n_keys; i++) {
-VALUE key = rb_ary_entry(keys, i);
-if (key == sym_start_time) {
-rb_hash_aset(hash, key, ruby_whisper_segment_get_start_time(self));
-}
-if (key == sym_end_time) {
-rb_hash_aset(hash, key, ruby_whisper_segment_get_end_time(self));
-}
-if (key == sym_text) {
-rb_hash_aset(hash, key, ruby_whisper_segment_get_text(self));
-}
-if (key == sym_no_speech_prob) {
-rb_hash_aset(hash, key, ruby_whisper_segment_get_no_speech_prob(self));
-}
-if (key == sym_speaker_turn_next) {
-rb_hash_aset(hash, key, ruby_whisper_segment_get_speaker_turn_next(self));
-}
-}
-return hash;
-}
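The removed method above implements Ruby's deconstruct_keys protocol for hash pattern matching: nil means the caller wants every supported key, and returning a hash that lacks a requested key makes the match fail cleanly. A generic sketch of that shape (example_text is a hypothetical stand-in for a real reader function):

#include <ruby.h>

static VALUE example_text(VALUE self); /* hypothetical reader */

static VALUE
example_deconstruct_keys(VALUE self, VALUE keys)
{
    VALUE hash = rb_hash_new();
    if (NIL_P(keys)) {
        /* nil: the pattern wants all keys, e.g. `segment => {text:}` */
        rb_hash_aset(hash, ID2SYM(rb_intern("text")), example_text(self));
        return hash;
    }
    for (long i = 0; i < RARRAY_LEN(keys); i++) {
        VALUE key = rb_ary_entry(keys, i);
        /* unknown keys are simply not set, so the match fails */
        if (key == ID2SYM(rb_intern("text"))) {
            rb_hash_aset(hash, key, example_text(self));
        }
    }
    return hash;
}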
void
init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cContext)
{
cSegment = rb_define_class_under(*mWhisper, "Segment", rb_cObject);
-sym_start_time = ID2SYM(rb_intern("start_time"));
-sym_end_time = ID2SYM(rb_intern("end_time"));
-sym_text = ID2SYM(rb_intern("text"));
-sym_no_speech_prob = ID2SYM(rb_intern("no_speech_prob"));
-sym_speaker_turn_next = ID2SYM(rb_intern("speaker_turn_next"));
-key_names = rb_ary_new3(
-N_KEY_NAMES,
-sym_start_time,
-sym_end_time,
-sym_text,
-sym_no_speech_prob,
-sym_speaker_turn_next
-);
rb_define_alloc_func(cSegment, ruby_whisper_segment_allocate);
rb_define_method(cSegment, "start_time", ruby_whisper_segment_get_start_time, 0);
rb_define_method(cSegment, "end_time", ruby_whisper_segment_get_end_time, 0);
-rb_define_method(cSegment, "speaker_turn_next?", ruby_whisper_segment_get_speaker_turn_next, 0);
+rb_define_method(cSegment, "speaker_next_turn?", ruby_whisper_segment_get_speaker_turn_next, 0);
rb_define_method(cSegment, "text", ruby_whisper_segment_get_text, 0);
rb_define_method(cSegment, "no_speech_prob", ruby_whisper_segment_get_no_speech_prob, 0);
-rb_define_method(cSegment, "deconstruct_keys", ruby_whisper_segment_deconstruct_keys, 1);
}


@@ -8,15 +8,11 @@
extern "C" {
#endif
-extern const rb_data_type_t ruby_whisper_type;
-extern const rb_data_type_t ruby_whisper_params_type;
extern ID id_to_s;
extern ID id_call;
-extern ID transcribe_option_names[1];
extern void
-prepare_transcription(ruby_whisper_params * rwp, VALUE * self);
+register_callbacks(ruby_whisper_params * rwp, VALUE * self);
/*
* transcribe a single file
@@ -35,16 +31,11 @@ VALUE
ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
ruby_whisper *rw;
ruby_whisper_params *rwp;
-VALUE wave_file_path, blk, params, kws;
-VALUE opts[1];
-rb_scan_args_kw(RB_SCAN_ARGS_LAST_HASH_KEYWORDS, argc, argv, "2:&", &wave_file_path, &params, &kws, &blk);
-rb_get_kwargs(kws, transcribe_option_names, 0, 1, opts);
-int n_processors = opts[0] == Qundef ? 1 : NUM2INT(opts[0]);
-TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
-TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+VALUE wave_file_path, blk, params;
+rb_scan_args(argc, argv, "02&", &wave_file_path, &params, &blk);
+Data_Get_Struct(self, ruby_whisper, rw);
+Data_Get_Struct(params, ruby_whisper_params, rwp);
if (!rb_respond_to(wave_file_path, id_to_s)) {
rb_raise(rb_eRuntimeError, "Expected file path to wave file");
@@ -70,22 +61,22 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
// rwp->params.encoder_begin_callback_user_data = &is_aborted;
// }
-prepare_transcription(rwp, &self);
-if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), n_processors) != 0) {
+register_callbacks(rwp, &self);
+if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) {
fprintf(stderr, "failed to process audio\n");
return self;
}
-if (NIL_P(blk)) {
-return self;
-}
const int n_segments = whisper_full_n_segments(rw->context);
VALUE output = rb_str_new2("");
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(rw->context, i);
output = rb_str_concat(output, rb_str_new2(text));
}
-rb_funcall(blk, id_call, 1, output);
+VALUE idCall = id_call;
+if (blk != Qnil) {
+rb_funcall(blk, idCall, 1, output);
+}
return self;
}
#ifdef __cplusplus


@@ -1,288 +0,0 @@
#include <ruby.h>
#include "ruby_whisper.h"
#define DEFINE_PARAM(param_name, nth) \
id_ ## param_name = rb_intern(#param_name); \
param_names[nth] = id_ ## param_name; \
rb_define_method(cVADParams, #param_name, ruby_whisper_vad_params_get_ ## param_name, 0); \
rb_define_method(cVADParams, #param_name "=", ruby_whisper_vad_params_set_ ## param_name, 1);
#define NUM_PARAMS 6
extern VALUE cVADParams;
static size_t
ruby_whisper_vad_params_memsize(const void *p)
{
const struct ruby_whisper_vad_params *params = p;
size_t size = sizeof(params);
if (!params) {
return 0;
}
return size;
}
static ID param_names[NUM_PARAMS];
static ID id_threshold;
static ID id_min_speech_duration_ms;
static ID id_min_silence_duration_ms;
static ID id_max_speech_duration_s;
static ID id_speech_pad_ms;
static ID id_samples_overlap;
const rb_data_type_t ruby_whisper_vad_params_type = {
"ruby_whisper_vad_params",
{0, 0, ruby_whisper_vad_params_memsize,},
0, 0,
0
};
static VALUE
ruby_whisper_vad_params_s_allocate(VALUE klass)
{
ruby_whisper_vad_params *rwvp;
VALUE obj = TypedData_Make_Struct(klass, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
rwvp->params = whisper_vad_default_params();
return obj;
}
/*
* Probability threshold to consider as speech.
*
* call-seq:
* threshold = th -> th
*/
static VALUE
ruby_whisper_vad_params_set_threshold(VALUE self, VALUE value)
{
ruby_whisper_vad_params *rwvp;
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
rwvp->params.threshold = RFLOAT_VALUE(value);
return value;
}
static VALUE
ruby_whisper_vad_params_get_threshold(VALUE self)
{
ruby_whisper_vad_params *rwvp;
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
return DBL2NUM(rwvp->params.threshold);
}
/*
* Min duration for a valid speech segment.
*
* call-seq:
* min_speech_duration_ms = duration_ms -> duration_ms
*/
static VALUE
ruby_whisper_vad_params_set_min_speech_duration_ms(VALUE self, VALUE value)
{
ruby_whisper_vad_params *rwvp;
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
rwvp->params.min_speech_duration_ms = NUM2INT(value);
return value;
}
static VALUE
ruby_whisper_vad_params_get_min_speech_duration_ms(VALUE self)
{
ruby_whisper_vad_params *rwvp;
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
return INT2NUM(rwvp->params.min_speech_duration_ms);
}
/*
* Min silence duration to consider speech as ended.
*
* call-seq:
* min_silence_duration_ms = duration_ms -> duration_ms
*/
static VALUE
ruby_whisper_vad_params_set_min_silence_duration_ms(VALUE self, VALUE value)
{
ruby_whisper_vad_params *rwvp;
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
rwvp->params.min_silence_duration_ms = NUM2INT(value);
return value;
}
static VALUE
ruby_whisper_vad_params_get_min_silence_duration_ms(VALUE self)
{
ruby_whisper_vad_params *rwvp;
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
return INT2NUM(rwvp->params.min_silence_duration_ms);
}
/*
* Max duration of a speech segment before forcing a new segment.
*
* call-seq:
* max_speech_duration_s = duration_s -> duration_s
*/
static VALUE
ruby_whisper_vad_params_set_max_speech_duration_s(VALUE self, VALUE value)
{
ruby_whisper_vad_params *rwvp;
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
rwvp->params.max_speech_duration_s = RFLOAT_VALUE(value);
return value;
}
static VALUE
ruby_whisper_vad_params_get_max_speech_duration_s(VALUE self)
{
ruby_whisper_vad_params *rwvp;
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
return DBL2NUM(rwvp->params.max_speech_duration_s);
}
/*
* Padding added before and after speech segments.
*
* call-seq:
* speech_pad_ms = pad_ms -> pad_ms
*/
static VALUE
ruby_whisper_vad_params_set_speech_pad_ms(VALUE self, VALUE value)
{
ruby_whisper_vad_params *rwvp;
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
rwvp->params.speech_pad_ms = NUM2INT(value);
return value;
}
static VALUE
ruby_whisper_vad_params_get_speech_pad_ms(VALUE self)
{
ruby_whisper_vad_params *rwvp;
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
return INT2NUM(rwvp->params.speech_pad_ms);
}
/*
* Overlap in seconds when copying audio samples from speech segment.
*
* call-seq:
* samples_overlap = overlap -> overlap
*/
static VALUE
ruby_whisper_vad_params_set_samples_overlap(VALUE self, VALUE value)
{
ruby_whisper_vad_params *rwvp;
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
rwvp->params.samples_overlap = RFLOAT_VALUE(value);
return value;
}
static VALUE
ruby_whisper_vad_params_get_samples_overlap(VALUE self)
{
ruby_whisper_vad_params *rwvp;
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
return DBL2NUM(rwvp->params.samples_overlap);
}
static VALUE
ruby_whisper_vad_params_equal(VALUE self, VALUE other)
{
ruby_whisper_vad_params *rwvp1;
ruby_whisper_vad_params *rwvp2;
if (self == other) {
return Qtrue;
}
if (!rb_obj_is_kind_of(other, cVADParams)) {
return Qfalse;
}
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp1);
TypedData_Get_Struct(other, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp2);
if (rwvp1->params.threshold != rwvp2->params.threshold) {
return Qfalse;
}
if (rwvp1->params.min_speech_duration_ms != rwvp2->params.min_speech_duration_ms) {
return Qfalse;
}
if (rwvp1->params.min_silence_duration_ms != rwvp2->params.min_silence_duration_ms) {
return Qfalse;
}
if (rwvp1->params.max_speech_duration_s != rwvp2->params.max_speech_duration_s) {
return Qfalse;
}
if (rwvp1->params.speech_pad_ms != rwvp2->params.speech_pad_ms) {
return Qfalse;
}
if (rwvp1->params.samples_overlap != rwvp2->params.samples_overlap) {
return Qfalse;
}
return Qtrue;
}
#define SET_PARAM_IF_SAME(param_name) \
if (id == id_ ## param_name) { \
ruby_whisper_vad_params_set_ ## param_name(self, value); \
continue; \
}
VALUE
ruby_whisper_vad_params_initialize(int argc, VALUE *argv, VALUE self)
{
VALUE kw_hash;
VALUE values[NUM_PARAMS] = {Qundef};
VALUE value;
ruby_whisper_vad_params *rwvp;
ID id;
int i;
TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
rb_scan_args_kw(RB_SCAN_ARGS_KEYWORDS, argc, argv, ":", &kw_hash);
if (NIL_P(kw_hash)) {
return self;
}
rb_get_kwargs(kw_hash, param_names, 0, NUM_PARAMS, values);
for (i = 0; i < NUM_PARAMS; i++) {
id = param_names[i];
value = values[i];
if (value == Qundef) {
continue;
}
SET_PARAM_IF_SAME(threshold)
SET_PARAM_IF_SAME(min_speech_duration_ms)
SET_PARAM_IF_SAME(min_silence_duration_ms)
SET_PARAM_IF_SAME(max_speech_duration_s)
SET_PARAM_IF_SAME(speech_pad_ms)
SET_PARAM_IF_SAME(samples_overlap)
}
return self;
}
#undef SET_PARAM_IF_SAME
void
init_ruby_whisper_vad_params(VALUE *mVAD)
{
cVADParams = rb_define_class_under(*mVAD, "Params", rb_cObject);
rb_define_alloc_func(cVADParams, ruby_whisper_vad_params_s_allocate);
rb_define_method(cVADParams, "initialize", ruby_whisper_vad_params_initialize, -1);
DEFINE_PARAM(threshold, 0)
DEFINE_PARAM(min_speech_duration_ms, 1)
DEFINE_PARAM(min_silence_duration_ms, 2)
DEFINE_PARAM(max_speech_duration_s, 3)
DEFINE_PARAM(speech_pad_ms, 4)
DEFINE_PARAM(samples_overlap, 5)
rb_define_method(cVADParams, "==", ruby_whisper_vad_params_equal, 1);
}
#undef DEFINE_PARAM
#undef NUM_PARAMS
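The removed file above is a thin wrapper over whisper.cpp's VAD parameter struct. Assuming the whisper.h API these bindings sit on (whisper_vad_default_params and struct whisper_vad_params), the C-side equivalent of constructing and tweaking a Params object looks roughly like this:

#include <whisper.h>

static struct whisper_vad_params
make_vad_params(void)
{
    /* the defaults are what the removed tests below assert on:
     * threshold 0.5, min_speech_duration_ms 250, min_silence_duration_ms 100, ... */
    struct whisper_vad_params vparams = whisper_vad_default_params();
    vparams.threshold               = 0.7f; /* probability needed to count as speech */
    vparams.min_silence_duration_ms = 200;  /* silence that ends a speech segment */
    return vparams;
}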


@@ -1,10 +1,5 @@
-require "pathname"
-root = Pathname("..")/".."
ignored_dirs = %w[
.devops
-.github
-ci
examples/wchess/wchess.wasm
examples/whisper.android
examples/whisper.android.java
@@ -14,7 +9,7 @@ ignored_dirs = %w[
models
samples
scripts
-].collect {|dir| root/dir}
+]
ignored_files = %w[
AUTHORS
Makefile
@@ -22,19 +17,18 @@ ignored_files = %w[
README_sycl.md
.gitignore
.gitmodules
-.dockerignore
whisper.nvim
twitch.sh
yt-wsp.sh
-close-issue.yml
]
EXTSOURCES =
-`git ls-files -z #{root}`.split("\x0")
-.collect {|file| Pathname(file)}
-.reject {|file|
-ignored_dirs.any? {|dir| file.descend.any? {|desc| desc == dir}} ||
-ignored_files.include?(file.basename.to_path) ||
-(file.descend.to_a[1] != root && file.descend.to_a[1] != Pathname("..")/"javascript")
+`git ls-files -z ../..`.split("\x0")
+.select {|file|
+basename = File.basename(file)
+ignored_dirs.all? {|dir| !file.start_with?("../../#{dir}")} &&
+!ignored_files.include?(basename) &&
+(file.start_with?("../..") || file.start_with?("../javascript")) &&
+(!file.start_with?("../../.github/") || basename == "bindings-ruby.yml")
}
-.collect(&:to_path)


@@ -1,15 +0,0 @@
module Whisper
class Context
def to_srt
each_segment.with_index.reduce("") {|srt, (segment, index)|
srt << "#{index + 1}\n#{segment.to_srt_cue}\n"
}
end
def to_webvtt
each_segment.with_index.reduce("WEBVTT\n\n") {|webvtt, (segment, index)|
webvtt << "#{index + 1}\n#{segment.to_webvtt_cue}\n"
}
end
end
end


@@ -130,44 +130,6 @@ module Whisper
end
end
-class ZipURI < URI
-def cache
-zip_path = super
-dest = unzipped_path
-return if dest.exist? && dest.mtime >= zip_path.mtime
-escaping dest do
-system "unzip", "-q", "-d", zip_path.dirname.to_path, zip_path.to_path, exception: true
-end
-zip_path
-end
-def clear_cache
-super
-unzipped_path.rmtree if unzipped_path.exist?
-end
-private
-def unzipped_path
-cache_path.sub_ext("")
-end
-def escaping(path)
-escaped = Pathname("#{path}.removing")
-if path.exist?
-escaped.rmtree if escaped.exist?
-path.rename escaped
-end
-yield
-ensure
-if path.exist?
-escaped.rmtree if escaped.exist?
-else
-escaped.rename path if escaped.exist?
-end
-end
-end
@pre_converted_models = %w[
tiny
tiny.en
@@ -203,31 +165,8 @@ module Whisper
models[name] = URI.new("https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-#{name}.bin")
}
-%w[
-silero-v5.1.2
-].each do |name|
-@pre_converted_models[name] = URI.new("https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-#{name}.bin")
-end
-@coreml_compiled_models = %w[
-tiny
-tiny.en
-base
-base.en
-small
-small.en
-medium
-medium.en
-large-v1
-large-v2
-large-v3
-large-v3-turbo
-].each_with_object({}) do |name, models|
-models[@pre_converted_models[name]] = ZipURI.new("https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-#{name}-encoder.mlmodelc.zip")
-end
class << self
-attr_reader :pre_converted_models, :coreml_compiled_models
+attr_reader :pre_converted_models
end
end
end


@@ -1,58 +0,0 @@
module Whisper
class Segment
SRT_ESCAPES = {
"&" => "&amp;",
"<" => "&lt;",
">" => "&gt;",
}
SRT_ESCAPES_RE = Regexp.union(SRT_ESCAPES.keys)
private_constant :SRT_ESCAPES, :SRT_ESCAPES_RE
def to_srt_cue
"#{srt_start_time} --> #{srt_end_time}\n#{srt_text}\n"
end
def to_webvtt_cue
"#{webvtt_start_time} --> #{webvtt_end_time}\n#{webvtt_text}\n"
end
private
def time_to_a(time)
sec, decimal_part = time.divmod(1000)
min, sec = sec.divmod(60)
hour, min = min.divmod(60)
[hour, min, sec, decimal_part]
end
def srt_time(time)
"%02d:%02d:%02d,%03d" % time_to_a(time)
end
def srt_start_time
srt_time(start_time)
end
def srt_end_time
srt_time(end_time)
end
def srt_text
text.gsub(SRT_ESCAPES_RE, SRT_ESCAPES)
end
def webvtt_time(time)
"%02d:%02d:%02d.%03d" % time_to_a(time)
end
def webvtt_start_time
webvtt_time(start_time)
end
def webvtt_end_time
webvtt_time(end_time)
end
alias webvtt_text srt_text
end
end
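The divmod chain in the removed helpers above converts a millisecond timestamp into hour/minute/second fields; SRT then joins them with a comma before the milliseconds, WebVTT with a period. The same arithmetic in C, for reference:

#include <stdint.h>
#include <stdio.h>

/* Restates the removed time_to_a/srt_time logic: split milliseconds into
 * hours, minutes, seconds and the millisecond remainder, then format with
 * a comma separator for SRT (WebVTT would use a period instead). */
static void
format_srt_time(int64_t ms, char out[16])
{
    const int64_t msec = ms % 1000;
    const int64_t sec  = (ms / 1000) % 60;
    const int64_t min  = (ms / 60000) % 60;
    const int64_t hour =  ms / 3600000;
    snprintf(out, 16, "%02lld:%02lld:%02lld,%03lld",
             (long long)hour, (long long)min, (long long)sec, (long long)msec);
}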


@@ -22,22 +22,21 @@ module Whisper
def self.lang_str: (Integer id) -> String
def self.lang_str_full: (Integer id) -> String
def self.log_set: (log_callback, Object? user_data) -> log_callback
-def self.system_info_str: () -> String
class Context
-def self.new: (String | path | ::URI::HTTP) -> instance
+def self.new: (path | ::URI::HTTP) -> instance
# transcribe a single file
# can emit to a block results
#
# params = Whisper::Params.new
# params.duration = 60_000
# whisper.transcribe "path/to/audio.wav", params do |text|
# puts text
# end
#
-def transcribe: (string, Params, ?n_processors: Integer) -> self
-| (string, Params, ?n_processors: Integer) { (String) -> void } -> self
+def transcribe: (string, Params) -> self
+| (string, Params) { (String) -> void } -> self
def model_n_vocab: () -> Integer
def model_n_audio_ctx: () -> Integer
@@ -50,16 +49,16 @@ module Whisper
# Yields each Whisper::Segment:
#
# whisper.transcribe("path/to/audio.wav", params)
# whisper.each_segment do |segment|
# puts segment.text
# end
#
# Returns an Enumerator if no block given:
#
# whisper.transcribe("path/to/audio.wav", params)
# enum = whisper.each_segment
# enum.to_a # => [#<Whisper::Segment>, ...]
#
def each_segment: { (Segment) -> void } -> void
| () -> Enumerator[Segment]
@@ -74,25 +73,25 @@ module Whisper
# Start time of a segment indexed by +segment_index+ in centiseconds (10 times milliseconds).
#
# full_get_segment_t0(3) # => 1668 (16680 ms)
#
def full_get_segment_t0: (Integer) -> Integer
# End time of a segment indexed by +segment_index+ in centiseconds (10 times milliseconds).
#
# full_get_segment_t1(3) # => 1668 (16680 ms)
#
def full_get_segment_t1: (Integer) -> Integer
# Whether the next segment indexed by +segment_index+ is predicted as a speaker turn.
#
# full_get_segment_speaker_turn_next(3) # => true
#
def full_get_segment_speaker_turn_next: (Integer) -> (true | false)
# Text of a segment indexed by +segment_index+.
#
# full_get_segment_text(3) # => "ask not what your country can do for you, ..."
#
def full_get_segment_text: (Integer) -> String
@@ -116,9 +115,6 @@ module Whisper
def full_parallel: (Params, Array[Float], ?Integer n_samples) -> self
| (Params, _Samples, ?Integer n_samples) -> self
| (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self
-def to_srt: () -> String
-def to_webvtt: () -> String
end
class Params
@@ -154,10 +150,7 @@ module Whisper
?encoder_begin_callback: encoder_begin_callback,
?encoder_begin_callback_user_data: Object,
?abort_callback: abort_callback,
-?abort_callback_user_data: Object,
-?vad: boolish,
-?vad_model_path: path | URI,
-?vad_params: Whisper::VAD::Params
+?abort_callback_user_data: Object
) -> instance
# params.language = "auto" | "en", etc...
@@ -285,9 +278,9 @@ module Whisper
# Sets new segment callback, called for every newly generated text segment.
#
# params.new_segment_callback = ->(context, _, n_new, user_data) {
# # ...
# }
#
def new_segment_callback=: (new_segment_callback) -> new_segment_callback
def new_segment_callback: () -> (new_segment_callback | nil)
@@ -300,9 +293,9 @@ module Whisper
# Sets progress callback, called on each progress update.
#
# params.progress_callback = ->(context, _, progress, user_data) {
# # ...
# }
#
# +progress+ is an Integer between 0 and 100.
#
@@ -330,9 +323,9 @@ module Whisper
# Sets abort callback, called to check if the process should be aborted.
#
# params.abort_callback = ->(user_data) {
# # ...
# }
#
#
def abort_callback=: (abort_callback) -> abort_callback
@@ -345,25 +338,11 @@ module Whisper
def abort_callback_user_data: () -> Object
-# Enable VAD
-#
-def vad=: (boolish) -> boolish
-def vad: () -> (true | false)
-# Path to the VAD model
-def vad_model_path=: (path | URI | nil) -> (path | URI | nil)
-def vad_model_path: () -> (String | nil)
-def vad_params=: (Whisper::VAD::Params) -> Whisper::VAD::Params
-def vad_params: () -> (Whisper::VAD::Params)
# Hook called on new segment. Yields each Whisper::Segment.
#
# whisper.on_new_segment do |segment|
# # ...
# end
#
def on_new_segment: { (Segment) -> void } -> void
@@ -377,20 +356,19 @@ module Whisper
# Call block to determine whether abort or not. Return +true+ when you want to abort.
#
# params.abort_on do
# if some_condition
# true # abort
# else
# false # continue
-# end
# end
+# end
#
def abort_on: { (Object user_data) -> boolish } -> void
end
class Model
def self.pre_converted_models: () -> Hash[String, Model::URI]
-def self.coreml_compiled_models: () -> Hash[Model::URI, Model::ZipURI]
def self.new: () -> instance
def n_vocab: () -> Integer
def n_audio_ctx: () -> Integer
@@ -410,22 +388,9 @@ module Whisper
def to_path: -> String
def clear_cache: -> void
end
-class ZipURI < URI
-def cache: () -> Pathname
-def clear_cache: () -> void
-end
end
class Segment
-type deconstructed_keys = {
-start_time: (Integer | nil),
-end_time: (Integer | nil),
-text: (String | nil),
-no_speech_prob: (Float | nil),
-speaker_turn_next: (true | false | nil)
-}
# Start time in milliseconds.
#
def start_time: () -> Integer
@@ -435,70 +400,10 @@ module Whisper
def end_time: () -> Integer
# Whether the next segment is predicted as a speaker turn.
-def speaker_turn_next?: () -> (true | false)
+def speaker_next_turn?: () -> (true | false)
def text: () -> String
def no_speech_prob: () -> Float
-def to_srt_cue: () -> String
-def to_webvtt_cue: () -> String
-# Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
-#
-# whisper.each_segment do |segment|
-# segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
-#
-# puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
-# end
-def deconstruct_keys: (Array[:start_time | :end_time | :text | :no_speech_prob | :speaker_turn_next] | nil) -> deconstructed_keys
-end
-module VAD
-class Params
-def self.new: (
-?threshold: Float,
-?min_speech_duration_ms: Integer,
-?min_silence_duration_ms: Integer,
-?max_speech_duration_s: Float,
-?speech_pad_ms: Integer,
-?samples_overlap: Float
-) -> instance
-# Probability threshold to consider as speech.
-#
-def threshold=: (Float) -> Float
-def threshold: () -> Float
-# Min duration for a valid speech segment.
-#
-def min_speech_duration_ms=: (Integer) -> Integer
-def min_speech_duration_ms: () -> Integer
-# Min silence duration to consider speech as ended.
-#
-def min_silence_duration_ms=: (Integer) -> Integer
-def min_silence_duration_ms: () -> Integer
-# Max duration of a speech segment before forcing a new segment.
-def max_speech_duration_s=: (Float) -> Float
-def max_speech_duration_s: () -> Float
-# Padding added before and after speech segments.
-#
-def speech_pad_ms=: (Integer) -> Integer
-def speech_pad_ms: () -> Integer
-# Overlap in seconds when copying audio samples from speech segment.
-#
-def samples_overlap=: (Float) -> Float
-def samples_overlap: () -> Float
-def ==: (Params) -> (true | false)
-end
-end
end
class Error < StandardError


@@ -1,136 +0,0 @@
require_relative "helper"
class TestSegment < TestBase
def test_iteration
whisper.each_segment do |segment|
assert_instance_of Whisper::Segment, segment
end
end
def test_enumerator
enum = whisper.each_segment
assert_instance_of Enumerator, enum
enum.to_a.each_with_index do |segment, index|
assert_instance_of Whisper::Segment, segment
assert_kind_of Integer, index
end
end
def test_start_time
i = 0
whisper.each_segment do |segment|
assert_equal 0, segment.start_time if i == 0
i += 1
end
end
def test_end_time
i = 0
whisper.each_segment do |segment|
assert_equal whisper.full_get_segment_t1(i) * 10, segment.end_time
i += 1
end
end
def test_no_speech_prob
no_speech_prob = nil
whisper.each_segment do |segment|
no_speech_prob = segment.no_speech_prob
end
assert no_speech_prob > 0.0
end
def test_on_new_segment
params = Whisper::Params.new
seg = nil
index = 0
params.on_new_segment do |segment|
assert_instance_of Whisper::Segment, segment
if index == 0
seg = segment
assert_equal 0, segment.start_time
assert_match(/ask not what your country can do for you, ask what you can do for your country/, segment.text)
end
index += 1
end
whisper.transcribe(AUDIO, params)
assert_equal 0, seg.start_time
assert_match(/ask not what your country can do for you, ask what you can do for your country/, seg.text)
end
def test_on_new_segment_twice
params = Whisper::Params.new
seg = nil
params.on_new_segment do |segment|
seg = segment
return
end
params.on_new_segment do |segment|
assert_same seg, segment
return
end
whisper.transcribe(AUDIO, params)
end
def test_pattern_matching
segment = whisper.each_segment.first
segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
assert_equal segment.start_time, start_time
assert_equal segment.end_time, end_time
assert_equal segment.text, text
assert_equal segment.no_speech_prob, no_speech_prob
assert_equal segment.speaker_turn_next?, speaker_turn_next
end
def test_pattern_matching_partial
segment = whisper.each_segment.first
segment => {start_time:, end_time:, text:}
assert_equal segment.start_time, start_time
assert_equal segment.end_time, end_time
assert_equal segment.text, text
end
def test_deconstruct_keys
segment = whisper.each_segment.first
expected = {
start_time: segment.start_time,
end_time: segment.end_time,
text: segment.text,
no_speech_prob: segment.no_speech_prob,
speaker_turn_next: segment.speaker_turn_next?
}
assert_equal expected, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next])
end
def test_deconstruct_keys_non_existent
omit "Undefined behavior"
segment = whisper.each_segment.first
assert_equal({}, segment.deconstruct_keys([:non_existent]))
end
def test_deconstruct_keys_too_many_keys
omit "Undefined behavior"
segment = whisper.each_segment.first
assert_equal({}, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next, :extra_key]))
end
def test_deconstruct_keys_includes_non_existent_keys_not_too_many
omit "Undefined behavior"
segment = whisper.each_segment.first
expected = {
start_time: segment.start_time,
end_time: segment.end_time,
text: segment.text,
no_speech_prob: segment.no_speech_prob
}
assert_equal(expected, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :non_existent]))
end
end
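
The deleted tests above exercise `Segment#deconstruct_keys`; a minimal sketch of the pattern-matching usage they cover (assuming a transcribed `whisper` context, as in the tests):

```ruby
segment = whisper.each_segment.first

# Hash-style rightward pattern matching, backed by Segment#deconstruct_keys.
segment => {start_time:, end_time:, text:}
puts "[#{start_time} --> #{end_time}] #{text}"
```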

View File

@@ -1,19 +0,0 @@
require_relative "helper"
class TestVAD < TestBase
def setup
@whisper = Whisper::Context.new("base.en")
vad_params = Whisper::VAD::Params.new
@params = Whisper::Params.new(
vad: true,
vad_model_path: "silero-v5.1.2",
vad_params:
)
end
def test_transcribe
@whisper.transcribe(TestBase::AUDIO, @params) do |text|
assert_match(/ask not what your country can do for you[,.] ask what you can do for your country/i, text)
end
end
end
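
A condensed sketch of the VAD setup this deleted test performs; `"base.en"` and `"silero-v5.1.2"` are the pre-converted model names used in the test, and the audio path is illustrative:

```ruby
require "whisper"

whisper = Whisper::Context.new("base.en")
params = Whisper::Params.new(
  vad: true,
  vad_model_path: "silero-v5.1.2", # resolved through the pre-converted model registry
  vad_params: Whisper::VAD::Params.new(threshold: 0.5)
)

whisper.transcribe("samples/jfk.wav", params) do |text|
  puts text
end
```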

View File

@@ -1,103 +0,0 @@
require_relative "helper"
class TestVADParams < TestBase
PARAM_NAMES = [
:threshold,
:min_speech_duration_ms,
:min_silence_duration_ms,
:max_speech_duration_s,
:speech_pad_ms,
:samples_overlap
]
def setup
@params = Whisper::VAD::Params.new
end
def test_new
params = Whisper::VAD::Params.new
assert_kind_of Whisper::VAD::Params, params
end
def test_threshold
assert_in_delta @params.threshold, 0.5
@params.threshold = 0.7
assert_in_delta @params.threshold, 0.7
end
def test_min_speech_duration
pend
end
def test_min_speech_duration_ms
assert_equal 250, @params.min_speech_duration_ms
@params.min_speech_duration_ms = 500
assert_equal 500, @params.min_speech_duration_ms
end
def test_min_silence_duration_ms
assert_equal 100, @params.min_silence_duration_ms
@params.min_silence_duration_ms = 200
assert_equal 200, @params.min_silence_duration_ms
end
def test_max_speech_duration
pend
end
def test_max_speech_duration_s
assert @params.max_speech_duration_s >= 10e37 # Defaults to FLT_MAX
@params.max_speech_duration_s = 60.0
assert_equal 60.0, @params.max_speech_duration_s
end
def test_speech_pad_ms
assert_equal 30, @params.speech_pad_ms
@params.speech_pad_ms = 50
assert_equal 50, @params.speech_pad_ms
end
def test_samples_overlap
assert_in_delta @params.samples_overlap, 0.1
@params.samples_overlap = 0.5
assert_in_delta @params.samples_overlap, 0.5
end
def test_equal
assert_equal @params, Whisper::VAD::Params.new
end
def test_new_with_kw_args
params = Whisper::VAD::Params.new(threshold: 0.7)
assert_in_delta params.threshold, 0.7
assert_equal 250, params.min_speech_duration_ms
end
def test_new_with_kw_args_non_existent
assert_raise ArgumentError do
Whisper::VAD::Params.new(non_existent: "value")
end
end
data(PARAM_NAMES.collect {|param| [param, param]}.to_h)
def test_new_with_kw_args_default_values(param)
default_value = @params.send(param)
value = default_value + 1
params = Whisper::VAD::Params.new(param => value)
if Float === value
assert_in_delta value, params.send(param)
else
assert_equal value, params.send(param)
end
PARAM_NAMES.reject {|name| name == param}.each do |name|
expected = @params.send(name)
actual = params.send(name)
if Float === expected
assert_in_delta expected, actual
else
assert_equal expected, actual
end
end
end
end

View File

@@ -3,7 +3,7 @@ require "whisper"
require_relative "jfk_reader/jfk_reader"

class TestBase < Test::Unit::TestCase
-  AUDIO = File.join(__dir__, "fixtures", "jfk.wav")
+  AUDIO = File.join(__dir__, "..", "..", "..", "samples", "jfk.wav")

  class << self
    def whisper
@@ -21,4 +21,15 @@ class TestBase < Test::Unit::TestCase
  def whisper
    self.class.whisper
  end
+
+  module BuildOptions
+    load "ext/options.rb", self
+    Options.include self
+
+    def enable_config(name)
+    end
+
+    def arg_config(name)
+    end
+  end
end

View File

@@ -106,13 +106,4 @@ class TestModel < TestBase
    assert_equal 1, model.ftype
    assert_equal "base", model.type
  end
-
-  def test_coreml_model_auto_download
-    uri = Whisper::Model.coreml_compiled_models[Whisper::Model.pre_converted_models["tiny"]]
-    model_path = Pathname(uri.to_path).sub_ext("")
-    model_path.rmtree if model_path.exist?
-    uri.cache
-    assert_path_exist model_path
-  end
end

View File

@@ -18,24 +18,12 @@ class TestPackage < TestBase
  end

  def test_install
-    gemspec = Gem::Specification.load("whispercpp.gemspec")
+    match_data = `rake -Tbuild`.match(/(whispercpp-(.+)\.gem)/)
+    filename = match_data[1]
+    version = match_data[2]
    Dir.mktmpdir do |dir|
-      system "gem", "install", "--install-dir", dir.shellescape, "--no-document", "pkg/#{gemspec.file_name.shellescape}", exception: true
-      assert_installed dir, gemspec.version
+      system "gem", "install", "--install-dir", dir.shellescape, "--no-document", "pkg/#{filename.shellescape}", exception: true
+      assert_installed dir, version
    end
  end
-
-  def test_install_with_coreml
-    omit_unless RUBY_PLATFORM.match?(/darwin/) do
-      gemspec = Gem::Specification.load("whispercpp.gemspec")
-      Dir.mktmpdir do |dir|
-        system "gem", "install", "--install-dir", dir.shellescape, "--no-document", "pkg/#{gemspec.file_name.shellescape}", "--", "--enable-whisper-coreml", exception: true
-        assert_installed dir, gemspec.version
-        assert_nothing_raised do
-          libdir = File.join(dir, "gems", "#{gemspec.name}-#{gemspec.version}", "lib")
-          system "ruby", "-I", libdir, "-r", "whisper", "-e", "Whisper::Context.new('tiny')", exception: true
-        end
-      end
-    end
-  end
@@ -47,4 +35,12 @@ class TestPackage < TestBase
      assert_path_not_exist File.join(dir, "gems/whispercpp-#{version}/ext/build")
    end
  end
+
+  def test_build_options
+    options = BuildOptions::Options.new
+    assert_empty options.missing_options
+    if ENV["TEST_EXTRA_OPTIONS"] == "1"
+      assert_empty options.extra_options
+    end
+  end
end

View File

@@ -32,9 +32,6 @@ class TestParams < TestBase
    :progress_callback_user_data,
    :abort_callback,
    :abort_callback_user_data,
-    :vad,
-    :vad_model_path,
-    :vad_params,
  ]

  def setup
@@ -194,50 +191,6 @@ class TestParams < TestBase
    assert_in_delta 0.2, @params.no_speech_thold
  end
-
-  def test_vad
-    assert_false @params.vad
-    @params.vad = true
-    assert_true @params.vad
-  end
-
-  def test_vad_model_path
-    assert_nil @params.vad_model_path
-    @params.vad_model_path = "silero-v5.1.2"
-    assert_equal Whisper::Model.pre_converted_models["silero-v5.1.2"].to_path, @params.vad_model_path
-  end
-
-  def test_vad_model_path_with_nil
-    @params.vad_model_path = "silero-v5.1.2"
-    @params.vad_model_path = nil
-    assert_nil @params.vad_model_path
-  end
-
-  def test_vad_model_path_with_invalid
-    assert_raise TypeError do
-      @params.vad_model_path = Object.new
-    end
-  end
-
-  def test_vad_model_path_with_URI_string
-    @params.vad_model_path = "https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v5.1.2.bin"
-    assert_equal @params.vad_model_path, Whisper::Model.pre_converted_models["silero-v5.1.2"].to_path
-  end
-
-  def test_vad_model_path_with_URI
-    @params.vad_model_path = URI("https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v5.1.2.bin")
-    assert_equal @params.vad_model_path, Whisper::Model.pre_converted_models["silero-v5.1.2"].to_path
-  end
-
-  def test_vad_params
-    assert_kind_of Whisper::VAD::Params, @params.vad_params
-    default_params = @params.vad_params
-    assert_same default_params, @params.vad_params
-    assert_equal 0.5, default_params.threshold
-    new_params = Whisper::VAD::Params.new
-    @params.vad_params = new_params
-    assert_same new_params, @params.vad_params
-  end

  def test_new_with_kw_args
    params = Whisper::Params.new(language: "es")
    assert_equal "es", params.language
@@ -272,10 +225,6 @@ class TestParams < TestBase
      proc {}
    in [/_user_data\Z/, *]
      Object.new
-    in [:vad_model_path, *]
-      Whisper::Model.pre_converted_models["silero-v5.1.2"].to_path
-    in [:vad_params, *]
-      Whisper::VAD::Params.new
    end
    params = Whisper::Params.new(param => value)
    if Float === value
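
The removed tests also document that `vad_model_path` accepts a URL (string or `URI`) and resolves it to a cached local file. A short sketch of that behavior, as asserted above:

```ruby
params = Whisper::Params.new
params.vad_model_path = URI("https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v5.1.2.bin")

# The setter downloads/caches the model and stores the local path,
# equal to Whisper::Model.pre_converted_models["silero-v5.1.2"].to_path.
puts params.vad_model_path
```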

View File

@@ -0,0 +1,74 @@
require_relative "helper"
class TestSegment < TestBase
def test_iteration
whisper.each_segment do |segment|
assert_instance_of Whisper::Segment, segment
end
end
def test_enumerator
enum = whisper.each_segment
assert_instance_of Enumerator, enum
enum.to_a.each_with_index do |segment, index|
assert_instance_of Whisper::Segment, segment
assert_kind_of Integer, index
end
end
def test_start_time
i = 0
whisper.each_segment do |segment|
assert_equal 0, segment.start_time if i == 0
i += 1
end
end
def test_end_time
i = 0
whisper.each_segment do |segment|
assert_equal whisper.full_get_segment_t1(i) * 10, segment.end_time
i += 1
end
end
def test_no_speech_prob
no_speech_prob = nil
whisper.each_segment do |segment|
no_speech_prob = segment.no_speech_prob
end
assert no_speech_prob > 0.0
end
def test_on_new_segment
params = Whisper::Params.new
seg = nil
index = 0
params.on_new_segment do |segment|
assert_instance_of Whisper::Segment, segment
if index == 0
seg = segment
assert_equal 0, segment.start_time
assert_match(/ask not what your country can do for you, ask what you can do for your country/, segment.text)
end
index += 1
end
whisper.transcribe(AUDIO, params)
assert_equal 0, seg.start_time
assert_match(/ask not what your country can do for you, ask what you can do for your country/, seg.text)
end
def test_on_new_segment_twice
params = Whisper::Params.new
seg = nil
params.on_new_segment do |segment|
seg = segment
return
end
params.on_new_segment do |segment|
assert_same seg, segment
return
end
whisper.transcribe(AUDIO, params)
end
end

View File

@@ -20,24 +20,6 @@ class TestWhisper < TestBase
    }
  end
-
-  def test_transcribe_non_parallel
-    @whisper = Whisper::Context.new("base.en")
-    params = Whisper::Params.new
-    @whisper.transcribe(AUDIO, params, n_processors: 1) {|text|
-      assert_match(/ask not what your country can do for you, ask what you can do for your country/, text)
-    }
-  end
-
-  def test_transcribe_n_processors
-    @whisper = Whisper::Context.new("base.en")
-    params = Whisper::Params.new
-    @whisper.transcribe(AUDIO, params, n_processors: 4) {|text|
-      assert_match(/ask not what your country can do for you[,.] ask what you can do for your country/i, text)
-    }
-  end

  sub_test_case "After transcription" do
    def test_full_n_segments
      assert_equal 1, whisper.full_n_segments
@@ -112,10 +94,6 @@ class TestWhisper < TestBase
    end
  end
-
-  def test_system_info_str
-    assert_match(/\AWHISPER : COREML = \d | OPENVINO = \d |/, Whisper.system_info_str)
-  end

  def test_log_set
    user_data = Object.new
    logs = []
@@ -245,48 +223,4 @@ class TestWhisper < TestBase
      assert_match(/for your country/i, text)
    end
  end
-
-  def test_to_srt
-    whisper = Whisper::Context.new("base.en")
-    whisper.transcribe AUDIO, @params
-    lines = whisper.to_srt.lines
-    assert_match(/\A\d+\n/, lines[0])
-    assert_match(/\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\n/, lines[1])
-    assert_match(/ask not what your country can do for you, ask what you can do for your country/, lines[2])
-  end
-
-  def test_to_webvtt
-    whisper = Whisper::Context.new("base.en")
-    whisper.transcribe AUDIO, @params
-    lines = whisper.to_webvtt.lines
-    assert_equal "WEBVTT\n", lines[0]
-    assert_equal "\n", lines[1]
-    assert_match(/\A\d+\n/, lines[2])
-    assert_match(/\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}\n/, lines[3])
-    assert_match(/ask not what your country can do for you, ask what you can do for your country/, lines[4])
-  end
-
-  sub_test_case "Format needs escape" do
-    def setup
-      @whisper = Whisper::Context.new("base.en")
-      @whisper.transcribe AUDIO, Whisper::Params.new
-      segment = @whisper.each_segment.first
-      segment.define_singleton_method :text do
-        "& so my fellow Americans --> ask not what your country can do for you <-- ask what you can do for your country."
-      end
-      @whisper.define_singleton_method :each_segment do
-        Enumerator.new(3) {|yielder| 3.times {yielder << segment}}
-      end
-    end
-
-    def test_to_srt_escape
-      assert_equal "&amp; so my fellow Americans --&gt; ask not what your country can do for you &lt;-- ask what you can do for your country.\n", @whisper.to_srt.lines[2]
-    end
-
-    def test_to_webvtt_escape
-      assert_equal "&amp; so my fellow Americans --&gt; ask not what your country can do for you &lt;-- ask what you can do for your country.\n", @whisper.to_webvtt.lines[4]
-    end
-  end
end
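
The removed tests document the `to_srt`/`to_webvtt` helpers: numbered cues, `HH:MM:SS,mmm` (SRT) or `HH:MM:SS.mmm` (WebVTT) timestamps, and HTML-escaped text. A minimal usage sketch (the output path and sample file are illustrative):

```ruby
whisper = Whisper::Context.new("base.en")
whisper.transcribe("samples/jfk.wav", Whisper::Params.new)

File.write("out.srt", whisper.to_srt)    # "1\n00:00:00,000 --> ...\n<text>\n"
File.write("out.vtt", whisper.to_webvtt) # starts with "WEBVTT\n\n"
```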

View File

@@ -3,7 +3,8 @@ require_relative "extsources"
Gem::Specification.new do |s|
  s.name = "whispercpp"
  s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
-  s.version = '1.3.3'
+  s.version = '1.3.2'
+  s.date = '2025-05-11'
  s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby}
  s.email = 'todd.fisher@gmail.com'
  s.extra_rdoc_files = ['LICENSE', 'README.md']
@@ -20,7 +21,7 @@ Gem::Specification.new do |s|
  }
  s.summary = %q{Ruby whisper.cpp bindings}
-  s.test_files = s.files.select {|file| file.start_with? "test/"}
+  s.test_files = s.files.select {|file| file.start_with? "tests/"}
  s.extensions << 'ext/extconf.rb'
  s.required_ruby_version = '>= 3.1.0'

View File

@@ -105,7 +105,6 @@ else()
    add_subdirectory(bench)
    add_subdirectory(server)
    add_subdirectory(quantize)
-    add_subdirectory(vad-speech-segments)
    if (WHISPER_SDL2)
        add_subdirectory(stream)
        add_subdirectory(command)

View File

@@ -17,7 +17,6 @@ const whisperParamsMock = {
  comma_in_time: false,
  translate: true,
  no_timestamps: false,
-  detect_language: false,
  audio_ctx: 0,
  max_len: 0,
  prompt: "",
@@ -31,9 +30,8 @@ const whisperParamsMock = {
describe("Run whisper.node", () => {
  test("it should receive a non-empty value", async () => {
    let result = await whisperAsync(whisperParamsMock);
-    console.log(result);
-    expect(result['transcription'].length).toBeGreaterThan(0);
+    expect(result.length).toBeGreaterThan(0);
  }, 10000);
});

View File

@@ -38,7 +38,6 @@ struct whisper_params {
    bool print_progress = false;
    bool no_timestamps = false;
    bool no_prints = false;
-    bool detect_language= false;
    bool use_gpu = true;
    bool flash_attn = false;
    bool comma_in_time = true;
@@ -83,7 +82,7 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
        t1 = whisper_full_get_segment_t1(ctx, i);
    }

-    if (!params.no_timestamps && !params.no_prints) {
+    if (!params.no_timestamps) {
        printf("[%s --> %s] ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
    }
@@ -114,14 +113,12 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
    // colorful print bug
    //
-    if (!params.no_prints) {
-        const char * text = whisper_full_get_segment_text(ctx, i);
-        printf("%s%s", speaker.c_str(), text);
-    }
+    const char * text = whisper_full_get_segment_text(ctx, i);
+    printf("%s%s", speaker.c_str(), text);

    // with timestamps or speakers: each segment on new line
-    if ((!params.no_timestamps || params.diarize) && !params.no_prints) {
+    if (!params.no_timestamps || params.diarize) {
        printf("\n");
    }
@@ -131,11 +128,6 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
void cb_log_disable(enum ggml_log_level, const char *, void *) {}

-struct whisper_result {
-    std::vector<std::vector<std::string>> segments;
-    std::string language;
-};

class ProgressWorker : public Napi::AsyncWorker {
    public:
    ProgressWorker(Napi::Function& callback, whisper_params params, Napi::Function progress_callback, Napi::Env env)
@@ -166,27 +158,15 @@ class ProgressWorker : public Napi::AsyncWorker {
    void OnOK() override {
        Napi::HandleScope scope(Env());
-        if (params.detect_language) {
-            Napi::Object resultObj = Napi::Object::New(Env());
-            resultObj.Set("language", Napi::String::New(Env(), result.language));
-            Callback().Call({Env().Null(), resultObj});
-        }
-        Napi::Object returnObj = Napi::Object::New(Env());
-        if (!result.language.empty()) {
-            returnObj.Set("language", Napi::String::New(Env(), result.language));
-        }
-        Napi::Array transcriptionArray = Napi::Array::New(Env(), result.segments.size());
-        for (uint64_t i = 0; i < result.segments.size(); ++i) {
+        Napi::Object res = Napi::Array::New(Env(), result.size());
+        for (uint64_t i = 0; i < result.size(); ++i) {
            Napi::Object tmp = Napi::Array::New(Env(), 3);
            for (uint64_t j = 0; j < 3; ++j) {
-                tmp[j] = Napi::String::New(Env(), result.segments[i][j]);
+                tmp[j] = Napi::String::New(Env(), result[i][j]);
            }
-            transcriptionArray[i] = tmp;
+            res[i] = tmp;
        }
-        returnObj.Set("transcription", transcriptionArray);
-        Callback().Call({Env().Null(), returnObj});
+        Callback().Call({Env().Null(), res});
    }

// Progress callback function - using thread-safe function
@@ -203,12 +183,12 @@ class ProgressWorker : public Napi::AsyncWorker {
    private:
        whisper_params params;
-        whisper_result result;
+        std::vector<std::vector<std::string>> result;
        Napi::Env env;
        Napi::ThreadSafeFunction tsfn;

        // Custom run function with progress callback support
-        int run_with_progress(whisper_params &params, whisper_result & result) {
+        int run_with_progress(whisper_params &params, std::vector<std::vector<std::string>> &result) {
            if (params.no_prints) {
                whisper_log_set(cb_log_disable, NULL);
            }
@@ -297,8 +277,7 @@ class ProgressWorker : public Napi::AsyncWorker {
            wparams.print_timestamps = !params.no_timestamps;
            wparams.print_special = params.print_special;
            wparams.translate = params.translate;
-            wparams.language = params.detect_language ? "auto" : params.language.c_str();
-            wparams.detect_language = params.detect_language;
+            wparams.language = params.language.c_str();
            wparams.n_threads = params.n_threads;
            wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
            wparams.offset_ms = params.offset_t_ms;
@@ -349,22 +328,18 @@ class ProgressWorker : public Napi::AsyncWorker {
                        return 10;
                    }
                }
            }

-            if (params.detect_language || params.language == "auto") {
-                result.language = whisper_lang_str(whisper_full_lang_id(ctx));
-            }
            const int n_segments = whisper_full_n_segments(ctx);
-            result.segments.resize(n_segments);
+            result.resize(n_segments);
            for (int i = 0; i < n_segments; ++i) {
                const char * text = whisper_full_get_segment_text(ctx, i);
                const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
                const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-                result.segments[i].emplace_back(to_timestamp(t0, params.comma_in_time));
-                result.segments[i].emplace_back(to_timestamp(t1, params.comma_in_time));
-                result.segments[i].emplace_back(text);
+                result[i].emplace_back(to_timestamp(t0, params.comma_in_time));
+                result[i].emplace_back(to_timestamp(t1, params.comma_in_time));
+                result[i].emplace_back(text);
            }

            whisper_print_timings(ctx);
@@ -389,7 +364,6 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
    bool flash_attn = whisper_params.Get("flash_attn").As<Napi::Boolean>();
    bool no_prints = whisper_params.Get("no_prints").As<Napi::Boolean>();
    bool no_timestamps = whisper_params.Get("no_timestamps").As<Napi::Boolean>();
-    bool detect_language = whisper_params.Get("detect_language").As<Napi::Boolean>();
    int32_t audio_ctx = whisper_params.Get("audio_ctx").As<Napi::Number>();
    bool comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
    int32_t max_len = whisper_params.Get("max_len").As<Napi::Number>();
@@ -442,7 +416,6 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
    params.max_context = max_context;
    params.print_progress = print_progress;
    params.prompt = prompt;
-    params.detect_language = detect_language;

    Napi::Function callback = info[1].As<Napi::Function>();
    // Create a new Worker class with progress callback support

View File

@@ -17,7 +17,6 @@ const whisperParams = {
  comma_in_time: false,
  translate: true,
  no_timestamps: false,
-  detect_language: false,
  audio_ctx: 0,
  max_len: 0,
  progress_callback: (progress) => {
@@ -32,8 +31,6 @@ const params = Object.fromEntries(
    const [key, value] = item.slice(2).split("=");
    if (key === "audio_ctx") {
      whisperParams[key] = parseInt(value);
-    } else if (key === "detect_language") {
-      whisperParams[key] = value === "true";
    } else {
      whisperParams[key] = value;
    }

View File

@@ -66,12 +66,13 @@ static int whisper_bench_full(const whisper_params & params) {
    cparams.use_gpu    = params.use_gpu;
    cparams.flash_attn = params.flash_attn;

-    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
    {
        fprintf(stderr, "\n");
        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", params.n_threads, std::thread::hardware_concurrency(), whisper_print_system_info());
    }

+    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
+
    if (ctx == nullptr) {
        fprintf(stderr, "error: failed to initialize whisper context\n");
        return 2;
@@ -155,8 +156,6 @@ static int whisper_bench_full(const whisper_params & params) {
}

int main(int argc, char ** argv) {
-    ggml_backend_load_all();
-
    whisper_params params;

    if (whisper_params_parse(argc, argv, params) == false) {

View File

@@ -70,7 +70,6 @@ struct whisper_params {
    bool no_prints       = false;
    bool print_special   = false;
    bool print_colors    = false;
-    bool print_confidence= false;
    bool print_progress  = false;
    bool no_timestamps   = false;
    bool log_score       = false;
@@ -180,7 +179,6 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
        else if (arg == "-np" || arg == "--no-prints") { params.no_prints = true; }
        else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
        else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
-        else if (                arg == "--print-confidence"){ params.print_confidence= true; }
        else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
        else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; }
        else if (arg == "-l"  || arg == "--language") { params.language = whisper_param_turn_lowercase(ARGV_NEXT); }
@@ -202,7 +200,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
        else if (                  arg == "--vad") { params.vad = true; }
        else if (arg == "-vm"   || arg == "--vad-model") { params.vad_model = ARGV_NEXT; }
        else if (arg == "-vt"   || arg == "--vad-threshold") { params.vad_threshold = std::stof(ARGV_NEXT); }
-        else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(ARGV_NEXT); }
+        else if (arg == "-vsd"  || arg == "--vad-min-speech-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(ARGV_NEXT); }
        else if (arg == "-vsd"  || arg == "--vad-min-silence-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(ARGV_NEXT); }
        else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s") { params.vad_max_speech_duration_s = std::stof(ARGV_NEXT); }
        else if (arg == "-vp"   || arg == "--vad-speech-pad-ms") { params.vad_speech_pad_ms = std::stoi(ARGV_NEXT); }
@@ -259,7 +257,6 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
    fprintf(stderr, "  -np,       --no-prints         [%-7s] do not print anything other than the results\n", params.no_prints ? "true" : "false");
    fprintf(stderr, "  -ps,       --print-special     [%-7s] print special tokens\n", params.print_special ? "true" : "false");
    fprintf(stderr, "  -pc,       --print-colors      [%-7s] print colors\n", params.print_colors ? "true" : "false");
-    fprintf(stderr, "             --print-confidence [%-7s] print confidence\n", params.print_confidence ? "true" : "false");
    fprintf(stderr, "  -pp,       --print-progress    [%-7s] print progress\n", params.print_progress ? "true" : "false");
    fprintf(stderr, "  -nt,       --no-timestamps     [%-7s] do not print timestamps\n", params.no_timestamps ? "true" : "false");
    fprintf(stderr, "  -l LANG,   --language LANG     [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
@@ -389,26 +386,6 @@ static void whisper_print_segment_callback(struct whisper_context * ctx, struct
                printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
            }
-        } else if (params.print_confidence) {
-            for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
-                if (params.print_special == false) {
-                    const whisper_token id = whisper_full_get_token_id(ctx, i, j);
-                    if (id >= whisper_token_eot(ctx)) {
-                        continue;
-                    }
-                }
-
-                const char * text = whisper_full_get_token_text(ctx, i, j);
-                const float p = whisper_full_get_token_p(ctx, i, j);
-                int style_idx = 2; // High confidence - dim
-                if (p < 0.33) {
-                    style_idx = 0; // Low confidence - inverse (highlighted)
-                } else if (p < 0.66) {
-                    style_idx = 1; // Medium confidence - underlined
-                }
-                printf("%s%s%s%s", speaker.c_str(), k_styles[style_idx].c_str(), text, "\033[0m");
-            }
        } else {
            const char * text = whisper_full_get_segment_text(ctx, i);
@@ -909,8 +886,6 @@ static void output_lrc(struct whisper_context * ctx, std::ofstream & fout, const
static void cb_log_disable(enum ggml_log_level , const char * , void * ) { }

int main(int argc, char ** argv) {
-    ggml_backend_load_all();
-
#if defined(_WIN32)
    // Set the console output code page to UTF-8, while command line arguments
    // are still encoded in the system's code page. In this way, we can print
@@ -990,6 +965,7 @@ int main(int argc, char ** argv) {
    }

    // whisper init
+
    struct whisper_context_params cparams = whisper_context_default_params();

    cparams.use_gpu = params.use_gpu;
@@ -1139,8 +1115,6 @@ int main(int argc, char ** argv) {
        if (params.print_colors) {
            fprintf(stderr, "%s: color scheme: red (low confidence), yellow (medium), green (high confidence)\n", __func__);
-        } else if (params.print_confidence) {
-            fprintf(stderr, "%s: confidence: highlighted (low confidence), underlined (medium), dim (high confidence)\n", __func__);
        }
        fprintf(stderr, "\n");
    }

View File

@@ -251,7 +251,7 @@ static std::vector<std::string> get_words(const std::string &txt) {
// command-list mode
// guide the transcription to match the most likely command from a provided list
-static int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params &params, std::ofstream &fout) {
+static int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
    fprintf(stderr, "\n");
    fprintf(stderr, "%s: guided mode\n", __func__);
@@ -444,16 +444,12 @@ static int process_command_list(struct whisper_context * ctx, audio_async &audio
            const float prob = probs_id[0].first;
            const int index = probs_id[0].second;

-            const char * best_command = allowed_commands[index].c_str();

            fprintf(stdout, "\n");
            fprintf(stdout, "%s: detected command: %s%s%s | p = %f | t = %d ms\n", __func__,
-                "\033[1m", best_command, "\033[0m", prob,
+                "\033[1m", allowed_commands[index].c_str(), "\033[0m", prob,
                (int) std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count());
            fprintf(stdout, "\n");
-
-            if (fout.is_open()) {
-                fout << best_command << std::endl;
-            }
        }
    }
@@ -466,7 +462,7 @@ static int process_command_list(struct whisper_context * ctx, audio_async &audio
// always-prompt mode
// transcribe the voice into text after valid prompt
-static int always_prompt_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params, std::ofstream & fout) {
+static int always_prompt_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
    bool is_running = true;
    bool ask_prompt = true;
@@ -532,9 +528,6 @@ static int always_prompt_transcription(struct whisper_context * ctx, audio_async
        if ((sim > 0.7f) && (command.size() > 0)) {
            fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
-
-            if (fout.is_open()) {
-                fout << command << std::endl;
-            }
        }

        fprintf(stdout, "\n");
@@ -549,7 +542,7 @@ static int always_prompt_transcription(struct whisper_context * ctx, audio_async
// general-purpose mode
// freely transcribe the voice into text
-static int process_general_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params, std::ofstream & fout) {
+static int process_general_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
    bool is_running = true;
    bool have_prompt = false;
    bool ask_prompt = true;
@@ -669,10 +662,8 @@ static int process_general_transcription(struct whisper_context * ctx, audio_asy
                } else {
                    // cut the prompt from the decoded text
                    const std::string command = ::trim(txt.substr(best_len));

                    fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
-
-                    if (fout.is_open()) {
-                        fout << command << std::endl;
-                    }
                }

                fprintf(stdout, "\n");
@@ -687,8 +678,6 @@ static int process_general_transcription(struct whisper_context * ctx, audio_asy
}

int main(int argc, char ** argv) {
-    ggml_backend_load_all();
-
    whisper_params params;

    if (whisper_params_parse(argc, argv, params) == false) {
@@ -709,10 +698,6 @@ int main(int argc, char ** argv) {
    cparams.flash_attn = params.flash_attn;

    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
-    if (ctx == nullptr) {
-        fprintf(stderr, "error: failed to initialize whisper context\n");
-        return 2;
-    }

    // print some info about the processing
    {
@@ -772,22 +757,13 @@ int main(int argc, char ** argv) {
        }
    }

-    std::ofstream fout;
-    if (params.fname_out.length() > 0) {
-        fout.open(params.fname_out);
-        if (!fout.is_open()) {
-            fprintf(stderr, "%s: failed to open output file '%s'!\n", __func__, params.fname_out.c_str());
-            return 1;
-        }
-    }
-
    if (ret_val == 0) {
        if (!params.commands.empty()) {
-            ret_val = process_command_list(ctx, audio, params, fout);
+            ret_val = process_command_list(ctx, audio, params);
        } else if (!params.prompt.empty() && params.grammar_parsed.rules.empty()) {
-            ret_val = always_prompt_transcription(ctx, audio, params, fout);
+            ret_val = always_prompt_transcription(ctx, audio, params);
        } else {
-            ret_val = process_general_transcription(ctx, audio, params, fout);
+            ret_val = process_general_transcription(ctx, audio, params);
        }
    }

View File

@@ -112,20 +112,13 @@ bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std:
        if (stereo) {
-            std::vector<float> stereo_data = pcmf32;
-            pcmf32.resize(frame_count);
-            for (uint64_t i = 0; i < frame_count; i++) {
-                pcmf32[i] = (stereo_data[2*i] + stereo_data[2*i + 1]);
-            }
-
-            pcmf32s.resize(2);
-            pcmf32s[0].resize(frame_count);
-            pcmf32s[1].resize(frame_count);
-            for (uint64_t i = 0; i < frame_count; i++) {
-                pcmf32s[0][i] = stereo_data[2*i];
-                pcmf32s[1][i] = stereo_data[2*i + 1];
-            }
+            pcmf32s.resize(2);
+            pcmf32s[0].resize(frame_count);
+            pcmf32s[1].resize(frame_count);
+            for (uint64_t i = 0; i < frame_count; i++) {
+                pcmf32s[0][i] = pcmf32[2*i];
+                pcmf32s[1][i] = pcmf32[2*i + 1];
+            }
        }

        ma_decoder_uninit(&decoder);
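
Both sides of this hunk split interleaved stereo PCM into per-channel vectors; the old side additionally downmixes `pcmf32` to mono by summing each channel pair. The same deinterleave step as a Ruby sketch, purely for illustration:

```ruby
# pcm is interleaved stereo: [L0, R0, L1, R1, ...]
def deinterleave(pcm)
  frame_count = pcm.length / 2
  left  = Array.new(frame_count) { |i| pcm[2 * i] }
  right = Array.new(frame_count) { |i| pcm[2 * i + 1] }
  [left, right]
end

left, right = deinterleave([0.1, 0.2, 0.3, 0.4])
mono = left.zip(right).map { |l, r| l + r } # the old branch's downmix (a plain sum, as written above)
```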

View File

@@ -294,26 +294,6 @@ const std::vector<std::string> k_colors = {
    set_xterm256_foreground( 78, 178, 101),
};

-// ANSI formatting codes
-static std::string set_inverse() {
-    return "\033[7m";
-}
-
-static std::string set_underline() {
-    return "\033[4m";
-}
-
-static std::string set_dim() {
-    return "\033[2m";
-}
-
-// Style scheme for different confidence levels
-const std::vector<std::string> k_styles = {
-    set_inverse(),   // Low confidence - inverse (highlighted)
-    set_underline(), // Medium confidence - underlined
-    set_dim(),       // High confidence - dim
-};

//
// Other utils
//
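
The removed `k_styles` table pairs with the `whisper-cli` hunk above: token probability maps to an ANSI style (inverse below 0.33, underline below 0.66, dim otherwise). The same mapping as a Ruby sketch:

```ruby
# ANSI styles keyed by confidence band, mirroring the removed k_styles table.
STYLES = {
  low:    "\e[7m", # inverse   (p < 0.33)
  medium: "\e[4m", # underline (p < 0.66)
  high:   "\e[2m", # dim
}.freeze
RESET = "\e[0m"

def styled_token(text, p)
  style = p < 0.33 ? STYLES[:low] : p < 0.66 ? STYLES[:medium] : STYLES[:high]
  "#{style}#{text}#{RESET}"
end

print styled_token("country", 0.91), "\n"
```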

View File

@@ -424,8 +424,6 @@ static void process_loop(struct whisper_context * ctx, audio_async &audio, const
}

int main(int argc, char ** argv) {
-    ggml_backend_load_all();
-
    whisper_params params;

    if (whisper_params_parse(argc, argv, params) == false) {
        return 1;

View File

@@ -1,5 +1,4 @@
#include "ggml.h"
-#include "ggml-backend.h"

#include "common.h"
#include "common-ggml.h"
@@ -177,8 +176,6 @@ static bool whisper_model_quantize(const std::string & fname_inp, const std::str
}

int main(int argc, char ** argv) {
-    ggml_backend_load_all();
-
    if (argc != 4) {
        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
        ggml_print_ftypes(stderr);

View File

@@ -1,6 +1,3 @@
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
set(TARGET whisper-server)
add_executable(${TARGET} server.cpp httplib.h)

View File

@@ -23,7 +23,6 @@ options:
  -sow, --split-on-word [false ] split on word rather than on token
  -bo N, --best-of N [2 ] number of best candidates to keep
  -bs N, --beam-size N [-1 ] beam size for beam search
- -ac N, --audio-ctx N [0 ] audio context size (0 - all)
  -wt N, --word-thold N [0.01 ] word timestamp probability threshold
  -et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
  -lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
@@ -42,28 +41,9 @@ options:
  --prompt PROMPT [ ] initial prompt
  -m FNAME, --model FNAME [models/ggml-base.en.bin] model path
  -oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference
- -dtw MODEL --dtw MODEL [ ] compute token-level timestamps
  --host HOST, [127.0.0.1] Hostname/ip-adress for the server
  --port PORT, [8080 ] Port number for the server
- --public PATH, [examples/server/public] Path to the public folder
- --request-path PATH, [ ] Request path for all requests
- --inference-path PATH, [/inference] Inference path for all requests
  --convert, [false ] Convert audio to WAV, requires ffmpeg on the server
- -sns, --suppress-nst [false ] suppress non-speech tokens
- -nth N, --no-speech-thold N [0.60 ] no speech threshold
- -nc, --no-context [false ] do not use previous audio context
- -ng, --no-gpu [false ] do not use gpu
- -fa, --flash-attn [false ] flash attention
-
- Voice Activity Detection (VAD) options:
- --vad [false ] enable Voice Activity Detection (VAD)
- -vm FNAME, --vad-model FNAME [ ] VAD model path
- -vt N, --vad-threshold N [0.50 ] VAD threshold for speech recognition
- -vspd N, --vad-min-speech-duration-ms N [250 ] VAD min speech duration (0.0-1.0)
- -vsd N, --vad-min-silence-duration-ms N [100 ] VAD min silence duration (to split segments)
- -vmsd N, --vad-max-speech-duration-s N [FLT_MAX] VAD max speech duration (auto-split longer)
- -vp N, --vad-speech-pad-ms N [30 ] VAD speech padding (extend segments)
- -vo N, --vad-samples-overlap N [0.10 ] VAD samples overlap (seconds between segments)
```

> [!WARNING]
@@ -87,35 +67,3 @@ curl 127.0.0.1:8080/load \
  -H "Content-Type: multipart/form-data" \
  -F model="<path-to-model-file>"
```
-
-## Load testing with k6
-
-> **Note:** Install [k6](https://k6.io/docs/get-started/installation/) before running the benchmark script.
-
-You can benchmark the Whisper server using the provided bench.js script with [k6](https://k6.io/). This script sends concurrent multipart requests to the /inference endpoint and is fully configurable via environment variables.
-
-**Example usage:**
-
-```
-k6 run bench.js \
-  --env FILE_PATH=/absolute/path/to/samples/jfk.wav \
-  --env BASE_URL=http://127.0.0.1:8080 \
-  --env ENDPOINT=/inference \
-  --env CONCURRENCY=4 \
-  --env TEMPERATURE=0.0 \
-  --env TEMPERATURE_INC=0.2 \
-  --env RESPONSE_FORMAT=json
-```
-
-**Environment variables:**
-
-- `FILE_PATH`: Path to the audio file to send (must be absolute or relative to the k6 working directory)
-- `BASE_URL`: Server base URL (default: `http://127.0.0.1:8080`)
-- `ENDPOINT`: API endpoint (default: `/inference`)
-- `CONCURRENCY`: Number of concurrent requests (default: 4)
-- `TEMPERATURE`: Decoding temperature (default: 0.0)
-- `TEMPERATURE_INC`: Temperature increment (default: 0.2)
-- `RESPONSE_FORMAT`: Response format (default: `json`)
-
-**Note:**
-- The server must be running and accessible at the specified `BASE_URL` and `ENDPOINT`.
-- The script is located in the same directory as this README: `bench.js`.
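
The VAD options removed from this README were also accepted as multipart form fields on `/inference` (see the `get_req_parameters` hunk in server.cpp further down). A hedged Ruby sketch of such a request, assuming a server built from a revision that still accepts these fields and a local sample file:

```ruby
require "net/http"

uri = URI("http://127.0.0.1:8080/inference")
req = Net::HTTP::Post.new(uri)
req.set_form(
  [
    ["file", File.open("samples/jfk.wav")],
    ["response_format", "json"],
    ["vad", "true"],          # field names from get_req_parameters below
    ["vad_threshold", "0.5"],
  ],
  "multipart/form-data"
)

res = Net::HTTP.start(uri.hostname, uri.port) { |http| http.request(req) }
puts res.body
```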

View File

@@ -1,29 +0,0 @@
import http from 'k6/http'
import { check } from 'k6'
export let options = {
vus: parseInt(__ENV.CONCURRENCY) || 4,
iterations: parseInt(__ENV.CONCURRENCY) || 4,
}
const filePath = __ENV.FILE_PATH
const baseURL = __ENV.BASE_URL || 'http://127.0.0.1:8080'
const endpoint = __ENV.ENDPOINT || '/inference'
const temperature = __ENV.TEMPERATURE || '0.0'
const temperatureInc = __ENV.TEMPERATURE_INC || '0.2'
const responseFormat = __ENV.RESPONSE_FORMAT || 'json'
// Read the file ONCE at init time
const fileBin = open(filePath, 'b')
export default function () {
const payload = {
file: http.file(fileBin, filePath),
temperature: temperature,
temperature_inc: temperatureInc,
response_format: responseFormat,
}
const res = http.post(`${baseURL}${endpoint}`, payload)
check(res, { 'status is 200': r => r.status === 200 })
}

View File

@@ -5,7 +5,6 @@
#include "httplib.h"
#include "json.hpp"

-#include <cfloat>
#include <chrono>
#include <cmath>
#include <cstdio>
@@ -14,23 +13,10 @@
#include <string>
#include <thread>
#include <vector>
-#include <memory>
-#include <csignal>
-#include <atomic>
-#include <functional>
-#include <cstdlib>
-
-#if defined (_WIN32)
-#include <windows.h>
-#endif

using namespace httplib;
using json = nlohmann::ordered_json;

-enum server_state {
-    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
-    SERVER_STATE_READY,          // Server is ready and model is loaded
-};

namespace {

// output formats
@@ -40,20 +26,6 @@ const std::string srt_format = "srt";
const std::string vjson_format = "verbose_json";
const std::string vtt_format = "vtt";

-std::function<void(int)> shutdown_handler;
-std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
-
-inline void signal_handler(int signal) {
-    if (is_terminating.test_and_set()) {
-        // in case it hangs, we can force terminate the server by hitting Ctrl+C twice
-        // this is for better developer experience, we can remove when the server is stable enough
-        fprintf(stderr, "Received second interrupt, terminating immediately.\n");
-        exit(1);
-    }
-
-    shutdown_handler(signal);
-}

struct server_params
{
    std::string hostname = "127.0.0.1";
@@ -118,16 +90,6 @@ struct whisper_params {
    std::string openvino_encode_device = "CPU";
    std::string dtw = "";

-    // Voice Activity Detection (VAD) parameters
-    bool vad = false;
-    std::string vad_model = "";
-    float vad_threshold = 0.5f;
-    int vad_min_speech_duration_ms = 250;
-    int vad_min_silence_duration_ms = 100;
-    float vad_max_speech_duration_s = FLT_MAX;
-    int vad_speech_pad_ms = 30;
-    float vad_samples_overlap = 0.1f;
};

void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params, const server_params& sparams) {
@@ -177,19 +139,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, " -nth N, --no-speech-thold N [%-7.2f] no speech threshold\n", params.no_speech_thold);
    fprintf(stderr, " -nc, --no-context [%-7s] do not use previous audio context\n", params.no_context ? "true" : "false");
    fprintf(stderr, " -ng, --no-gpu [%-7s] do not use gpu\n", params.use_gpu ? "false" : "true");
-    fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
-    // Voice Activity Detection (VAD) parameters
-    fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
-    fprintf(stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false");
-    fprintf(stderr, " -vm FNAME, --vad-model FNAME [%-7s] VAD model path\n", params.vad_model.c_str());
-    fprintf(stderr, " -vt N, --vad-threshold N [%-7.2f] VAD threshold for speech recognition\n", params.vad_threshold);
-    fprintf(stderr, " -vspd N, --vad-min-speech-duration-ms N [%-7d] VAD min speech duration (0.0-1.0)\n", params.vad_min_speech_duration_ms);
-    fprintf(stderr, " -vsd N, --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n", params.vad_min_silence_duration_ms);
-    fprintf(stderr, " -vmsd N, --vad-max-speech-duration-s N [%-7s] VAD max speech duration (auto-split longer)\n", params.vad_max_speech_duration_s == FLT_MAX ?
-                        std::string("FLT_MAX").c_str() :
-                        std::to_string(params.vad_max_speech_duration_s).c_str());
-    fprintf(stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n", params.vad_speech_pad_ms);
-    fprintf(stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap);
    fprintf(stderr, "\n");
}
@@ -245,16 +194,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
        else if ( arg == "--request-path") { sparams.request_path = argv[++i]; }
        else if ( arg == "--inference-path") { sparams.inference_path = argv[++i]; }
        else if ( arg == "--convert") { sparams.ffmpeg_converter = true; }
-        // Voice Activity Detection (VAD)
-        else if ( arg == "--vad") { params.vad = true; }
-        else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = argv[++i]; }
-        else if (arg == "-vt" || arg == "--vad-threshold") { params.vad_threshold = std::stof(argv[++i]); }
-        else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); }
-        else if (arg == "-vsd" || arg == "--vad-min-silence-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); }
-        else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s") { params.vad_max_speech_duration_s = std::stof(argv[++i]); }
-        else if (arg == "-vp" || arg == "--vad-speech-pad-ms") { params.vad_speech_pad_ms = std::stoi(argv[++i]); }
-        else if (arg == "-vo" || arg == "--vad-samples-overlap") { params.vad_samples_overlap = std::stof(argv[++i]); }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params, sparams);
@@ -571,41 +510,11 @@ void get_req_parameters(const Request & req, whisper_params & params)
    {
        params.no_context = parse_str_to_bool(req.get_file_value("no_context").content);
    }
-    if (req.has_file("vad"))
-    {
-        params.vad = parse_str_to_bool(req.get_file_value("vad").content);
-    }
-    if (req.has_file("vad_threshold"))
-    {
-        params.vad_threshold = std::stof(req.get_file_value("vad_threshold").content);
-    }
-    if (req.has_file("vad_min_speech_duration_ms"))
-    {
-        params.vad_min_speech_duration_ms = std::stof(req.get_file_value("vad_min_speech_duration_ms").content);
-    }
-    if (req.has_file("vad_min_silence_duration_ms"))
-    {
-        params.vad_min_silence_duration_ms = std::stof(req.get_file_value("vad_min_silence_duration_ms").content);
-    }
-    if (req.has_file("vad_max_speech_duration_s"))
-    {
-        params.vad_max_speech_duration_s = std::stof(req.get_file_value("vad_max_speech_duration_s").content);
-    }
-    if (req.has_file("vad_speech_pad_ms"))
-    {
-        params.vad_speech_pad_ms = std::stoi(req.get_file_value("vad_speech_pad_ms").content);
-    }
-    if (req.has_file("vad_samples_overlap"))
-    {
-        params.vad_samples_overlap = std::stof(req.get_file_value("vad_samples_overlap").content);
-    }
}

} // namespace

int main(int argc, char ** argv) {
-    ggml_backend_load_all();
-
    whisper_params params;
    server_params sparams;
@@ -681,9 +590,6 @@ int main(int argc, char ** argv) {
        }
    }

-    std::unique_ptr<httplib::Server> svr = std::make_unique<httplib::Server>();
-    std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};

    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);

    if (ctx == nullptr) {
@@ -693,10 +599,9 @@ int main(int argc, char ** argv) {
    // initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
    whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);

-    state.store(SERVER_STATE_READY);
+    Server svr;

-    svr->set_default_headers({{"Server", "whisper.cpp"},
+    svr.set_default_headers({{"Server", "whisper.cpp"},
                              {"Access-Control-Allow-Origin", "*"},
                              {"Access-Control-Allow-Headers", "content-type, authorization"}});
@@ -775,15 +680,15 @@ int main(int argc, char ** argv) {
    whisper_params default_params = params;

    // this is only called if no index.html is found in the public --path
-    svr->Get(sparams.request_path + "/", [&](const Request &, Response &res){
+    svr.Get(sparams.request_path + "/", [&default_content](const Request &, Response &res){
        res.set_content(default_content, "text/html");
        return false;
    });

-    svr->Options(sparams.request_path + sparams.inference_path, [&](const Request &, Response &){
+    svr.Options(sparams.request_path + sparams.inference_path, [&](const Request &, Response &){
    });

-    svr->Post(sparams.request_path + sparams.inference_path, [&](const Request &req, Response &res){
+    svr.Post(sparams.request_path + sparams.inference_path, [&](const Request &req, Response &res){
        // acquire whisper model mutex lock
        std::lock_guard<std::mutex> lock(whisper_mutex);
@@ -921,16 +826,6 @@ int main(int argc, char ** argv) {
        wparams.suppress_nst = params.suppress_nst;

-        wparams.vad = params.vad;
-        wparams.vad_model_path = params.vad_model.c_str();
-
-        wparams.vad_params.threshold = params.vad_threshold;
-        wparams.vad_params.min_speech_duration_ms = params.vad_min_speech_duration_ms;
-        wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
-        wparams.vad_params.max_speech_duration_s = params.vad_max_speech_duration_s;
-        wparams.vad_params.speech_pad_ms = params.vad_speech_pad_ms;
-        wparams.vad_params.samples_overlap = params.vad_samples_overlap;

        whisper_print_user_data user_data = { &params, &pcmf32s, 0 };

        // this callback is called on each new segment
@@ -1099,9 +994,8 @@ int main(int argc, char ** argv) {
        // reset params to their defaults
        params = default_params;
    });

-    svr->Post(sparams.request_path + "/load", [&](const Request &req, Response &res){
+    svr.Post(sparams.request_path + "/load", [&](const Request &req, Response &res){
        std::lock_guard<std::mutex> lock(whisper_mutex);
-        state.store(SERVER_STATE_LOADING_MODEL);
        if (!req.has_file("model"))
        {
            fprintf(stderr, "error: no 'model' field in the request\n");
@@ -1133,25 +1027,18 @@ int main(int argc, char ** argv) {
        // initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
        whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);
-        state.store(SERVER_STATE_READY);

        const std::string success = "Load was successful!";
        res.set_content(success, "application/text");

        // check if the model is in the file system
    });

-    svr->Get(sparams.request_path + "/health", [&](const Request &, Response &res){
-        server_state current_state = state.load();
-        if (current_state == SERVER_STATE_READY) {
-            const std::string health_response = "{\"status\":\"ok\"}";
-            res.set_content(health_response, "application/json");
-        } else {
-            res.set_content("{\"status\":\"loading model\"}", "application/json");
-            res.status = 503;
-        }
+    svr.Get(sparams.request_path + "/health", [&](const Request &, Response &res){
+        const std::string health_response = "{\"status\":\"ok\"}";
+        res.set_content(health_response, "application/json");
    });

-    svr->set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
+    svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
        const char fmt[] = "500 Internal Server Error\n%s";
        char buf[BUFSIZ];
        try {
@@ -1165,7 +1052,7 @@ int main(int argc, char ** argv) {
        res.status = 500;
    });

-    svr->set_error_handler([](const Request &req, Response &res) {
+    svr.set_error_handler([](const Request &req, Response &res) {
        if (res.status == 400) {
            res.set_content("Invalid request", "text/plain");
        } else if (res.status != 500) {
@@ -1175,10 +1062,10 @@ int main(int argc, char ** argv) {
    });

    // set timeouts and change hostname and port
-    svr->set_read_timeout(sparams.read_timeout);
+    svr.set_read_timeout(sparams.read_timeout);
svr->set_write_timeout(sparams.write_timeout); svr.set_write_timeout(sparams.write_timeout);
if (!svr->bind_to_port(sparams.hostname, sparams.port)) if (!svr.bind_to_port(sparams.hostname, sparams.port))
{ {
fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n",
sparams.hostname.c_str(), sparams.port); sparams.hostname.c_str(), sparams.port);
@ -1186,50 +1073,18 @@ int main(int argc, char ** argv) {
} }
// Set the base directory for serving static files // Set the base directory for serving static files
svr->set_base_dir(sparams.public_path); svr.set_base_dir(sparams.public_path);
// to make it ctrl+clickable: // to make it ctrl+clickable:
printf("\nwhisper server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port); printf("\nwhisper server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);
shutdown_handler = [&](int signal) { if (!svr.listen_after_bind())
printf("\nCaught signal %d, shutting down gracefully...\n", signal); {
if (svr) { return 1;
svr->stop(); }
}
};
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) whisper_print_timings(ctx);
struct sigaction sigint_action; whisper_free(ctx);
sigint_action.sa_handler = signal_handler;
sigemptyset (&sigint_action.sa_mask);
sigint_action.sa_flags = 0;
sigaction(SIGINT, &sigint_action, NULL);
sigaction(SIGTERM, &sigint_action, NULL);
#elif defined (_WIN32)
auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
};
SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
#endif
// clean up function, to be called before exit
auto clean_up = [&]() {
whisper_print_timings(ctx);
whisper_free(ctx);
};
std::thread t([&] {
if (!svr->listen_after_bind()) {
fprintf(stderr, "error: server listen failed\n");
}
});
svr->wait_until_ready();
t.join();
clean_up();
return 0; return 0;
} }
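
Note on the server hunks above: on the plus side the /health endpoint always reports ok, because the model is fully loaded before the (stack-allocated, blocking) server starts listening; the removed code instead returned 503 while a model was loading. A minimal client-side readiness check, sketched here with cpp-httplib (the same library the server uses); the host and port are illustrative assumptions:

#include "httplib.h"
#include <cstdio>

int main() {
    httplib::Client cli("localhost", 8080); // illustrative host/port
    auto res = cli.Get("/health");
    if (res && res->status == 200) {
        printf("server ready: %s\n", res->body.c_str()); // expects {"status":"ok"}
    } else {
        printf("server not ready (status %d)\n", res ? res->status : -1);
    }
    return 0;
}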


@@ -116,8 +116,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
 }

 int main(int argc, char ** argv) {
-    ggml_backend_load_all();
     whisper_params params;

     if (whisper_params_parse(argc, argv, params) == false) {
@@ -163,10 +161,6 @@ int main(int argc, char ** argv) {
     cparams.flash_attn = params.flash_attn;

     struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
-    if (ctx == nullptr) {
-        fprintf(stderr, "error: failed to initialize whisper context\n");
-        return 2;
-    }

     std::vector<float> pcmf32    (n_samples_30s, 0.0f);
     std::vector<float> pcmf32_old;


@@ -16,10 +16,7 @@ if (WHISPER_SDL2)
     llama-hparams.cpp
     llama-impl.cpp
     llama-io.cpp
-    llama-kv-cache-unified.cpp
-    llama-kv-cache-unified-iswa.cpp
-    llama-memory-recurrent.cpp
-    llama-memory-hybrid.cpp
+    llama-kv-cache.cpp
     llama-memory.cpp
     llama-mmap.cpp
     llama-model-loader.cpp


@@ -20,7 +20,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_BERT, "bert" },
     { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
     { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" },
-    { LLM_ARCH_NEO_BERT, "neo-bert" },
     { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
     { LLM_ARCH_BLOOM, "bloom" },
     { LLM_ARCH_STABLELM, "stablelm" },
@@ -73,8 +72,6 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
     { LLM_ARCH_PLM, "plm" },
     { LLM_ARCH_BAILINGMOE, "bailingmoe" },
-    { LLM_ARCH_DOTS1, "dots1" },
-    { LLM_ARCH_ARCEE, "arcee" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
@@ -147,7 +144,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" },
     { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" },
     { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" },
-    { LLM_KV_ATTENTION_LAYER_INDICES, "%s.attention.layer_indices" },
     { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" },
     { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" },
@@ -178,8 +174,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" },
     { LLM_KV_CONVNEXT_BLOCK_COUNT, "%s.convnext.block_count" },
-    { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
     { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" },
     { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" },
     { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" },
@@ -198,13 +192,13 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
     { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
     { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
-    { LLM_KV_TOKENIZER_ADD_SEP, "tokenizer.ggml.add_sep_token" },
     { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
     { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
     { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
     { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" },
     { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" },
     { LLM_KV_TOKENIZER_CHAT_TEMPLATE, "tokenizer.chat_template" },
-    { LLM_KV_TOKENIZER_CHAT_TEMPLATE_N, "tokenizer.chat_template.%s" },
     { LLM_KV_TOKENIZER_FIM_PRE_ID, "tokenizer.ggml.fim_pre_token_id" },
     { LLM_KV_TOKENIZER_FIM_SUF_ID, "tokenizer.ggml.fim_suf_token_id" },
     { LLM_KV_TOKENIZER_FIM_MID_ID, "tokenizer.ggml.fim_mid_token_id" },
@@ -248,24 +242,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
         },
     },
-    {
-        LLM_ARCH_ARCEE,
-        {
-            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
-            { LLM_TENSOR_OUTPUT, "output" },
-            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
-            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
-            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
-            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
-        },
-    },
     {
         LLM_ARCH_LLAMA4,
         {
@@ -472,7 +448,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_TOKEN_TYPES, "token_types" },
             { LLM_TENSOR_POS_EMBD, "position_embd" },
             { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
-            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
             { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
             { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
             { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
@@ -517,21 +492,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
         },
     },
-    {
-        LLM_ARCH_NEO_BERT,
-        {
-            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
-            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
-            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
-            { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
-            { LLM_TENSOR_CLS, "cls" },
-            { LLM_TENSOR_CLS_OUT, "cls.output" },
-        },
-    },
     {
         LLM_ARCH_JINA_BERT_V2,
         {
@@ -1521,9 +1481,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
             { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
             { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
-            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
-            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
-            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
         },
     },
     {
@@ -1593,34 +1550,6 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
         },
     },
-    {
-        LLM_ARCH_DOTS1,
-        {
-            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
-            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
-            { LLM_TENSOR_OUTPUT, "output" },
-            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
-            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
-            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
-            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
-            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
-            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
-            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
-            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
-            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
-            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
-            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
-            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
-            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
-            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
-            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
-            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
-            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
-            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
-            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
-            { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
-        }
-    },
     {
         LLM_ARCH_UNKNOWN,
         {
@@ -1772,14 +1701,8 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}

 std::string LLM_KV::operator()(llm_kv kv) const {
-    std::string name = ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
-
-    if (suffix != nullptr) {
-        name += ".";
-        name += suffix;
-    }
-
-    return name;
+    return suffix ? ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch), suffix)
+                  : ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
 }

 std::string LLM_TN_IMPL::str() const {
@@ -1818,25 +1741,3 @@ llm_arch llm_arch_from_string(const std::string & name) {
 const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor) {
     return LLM_TENSOR_INFOS.at(tensor);
 }
-
-bool llm_arch_is_recurrent(const llm_arch & arch) {
-    switch (arch) {
-        case LLM_ARCH_MAMBA:
-        case LLM_ARCH_RWKV6:
-        case LLM_ARCH_RWKV6QWEN2:
-        case LLM_ARCH_RWKV7:
-        case LLM_ARCH_ARWKV7:
-            return true;
-        default:
-            return false;
-    }
-}
-
-bool llm_arch_is_hybrid(const llm_arch & arch) {
-    // TODO: There are currently no hybrid models! Once there are, this will be
-    // the place to identify them
-    switch (arch) {
-        default:
-            return false;
-    }
-}
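
Note on the LLM_KV::operator() hunk above: the two sides differ in how an optional key suffix is attached. One side appends "." plus the suffix to a fixed key name; the other substitutes the suffix through a %s placeholder baked into table entries such as "tokenizer.chat_template.%s". A standalone sketch of the two styles (the helper names here are illustrative, not llama.cpp API):

#include <cstdio>
#include <string>

// style A: the suffix is substituted via a "%s" placeholder in the key template
static std::string kv_with_placeholder(const char * tmpl /* e.g. "tokenizer.chat_template.%s" */, const char * suffix) {
    char buf[256];
    snprintf(buf, sizeof(buf), tmpl, suffix);
    return buf;
}

// style B: the suffix is appended explicitly, so the key table needs no "%s"
static std::string kv_with_append(const char * base /* e.g. "tokenizer.chat_template" */, const char * suffix) {
    std::string name = base;
    if (suffix != nullptr) {
        name += ".";
        name += suffix;
    }
    return name;
}

Both produce e.g. "tokenizer.chat_template.rag" for a hypothetical suffix "rag"; the appended form lets one base key serve both the plain and the suffixed lookup.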


@@ -24,7 +24,6 @@ enum llm_arch {
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
     LLM_ARCH_NOMIC_BERT_MOE,
-    LLM_ARCH_NEO_BERT,
     LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
@@ -77,8 +76,6 @@ enum llm_arch {
     LLM_ARCH_WAVTOKENIZER_DEC,
     LLM_ARCH_PLM,
     LLM_ARCH_BAILINGMOE,
-    LLM_ARCH_DOTS1,
-    LLM_ARCH_ARCEE,
     LLM_ARCH_UNKNOWN,
 };
@@ -151,7 +148,6 @@ enum llm_kv {
     LLM_KV_ATTENTION_SCALE,
     LLM_KV_ATTENTION_KEY_LENGTH_MLA,
     LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
-    LLM_KV_ATTENTION_LAYER_INDICES,
     LLM_KV_ROPE_DIMENSION_COUNT,
     LLM_KV_ROPE_DIMENSION_SECTIONS,
@@ -194,13 +190,13 @@ enum llm_kv {
     LLM_KV_TOKENIZER_MASK_ID,
     LLM_KV_TOKENIZER_ADD_BOS,
     LLM_KV_TOKENIZER_ADD_EOS,
-    LLM_KV_TOKENIZER_ADD_SEP,
     LLM_KV_TOKENIZER_ADD_PREFIX,
     LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,
     LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
     LLM_KV_TOKENIZER_HF_JSON,
     LLM_KV_TOKENIZER_RWKV,
     LLM_KV_TOKENIZER_CHAT_TEMPLATE,
-    LLM_KV_TOKENIZER_CHAT_TEMPLATE_N,
     LLM_KV_TOKENIZER_FIM_PRE_ID,
     LLM_KV_TOKENIZER_FIM_SUF_ID,
     LLM_KV_TOKENIZER_FIM_MID_ID,
@@ -217,8 +213,6 @@ enum llm_kv {
     LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
     LLM_KV_CONVNEXT_BLOCK_COUNT,
-    LLM_KV_CLASSIFIER_OUTPUT_LABELS,
-
     // deprecated:
     LLM_KV_TOKENIZER_PREFIX_ID,
     LLM_KV_TOKENIZER_SUFFIX_ID,
@@ -441,6 +435,3 @@ const char * llm_arch_name(llm_arch arch);
 llm_arch llm_arch_from_string(const std::string & name);

 const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);
-
-bool llm_arch_is_recurrent(const llm_arch & arch);
-bool llm_arch_is_hybrid  (const llm_arch & arch);

File diff suppressed because it is too large


@@ -2,146 +2,88 @@
 #include "llama.h"
-#include "llama-cparams.h"
 #include <array>
 #include <vector>
-#include <set>
-#include <bitset>
-#include <unordered_map>
-// keep this struct lightweight
-// it points to data in `llama_batch_allocr`
+// very similar to llama_batch,
+// but has more metadata about sequences
 struct llama_ubatch {
     bool equal_seqs;
     // TODO: whole_seqs for embeddings?
     uint32_t n_tokens; // total tokens (n_seq_tokens * n_seqs)
-    uint32_t n_seq_tokens; // tokens per sequence set
-    uint32_t n_seqs;       // sequence sets in the ubatch
-    uint32_t n_seqs_unq;   // unique sequence ids in the ubatch
-    // seq_id_unq: unique sequence ids in the ubatch
-    // seq_idx:    indices of the unique sequence ids in the ubatch in [0, n_seqs_unq)
-    //             used for extracting sequence pooled embeddings
-    //                          // size               | idx | val
-    llama_token  *  token;      // [n_tokens]         | i   | id, token
-    float        *  embd;       // [n_embd, n_tokens] | i   | embd
-    llama_pos    *  pos;        // [n_tokens]         | i   | pos
-    int32_t      *  n_seq_id;   // [n_tokens]         | i   | -
-    llama_seq_id ** seq_id;     // [n_tokens]         | s   | s0, s1, seq_id
-    llama_seq_id *  seq_id_unq; // [n_seqs_unq]       | s   | seq_id
-    int32_t      *  seq_idx;    // [LLAMA_MAX_SEQ]    | -   | seq_idx
-    int8_t       *  output;     // [n_tokens]         | i   | -
+    uint32_t n_seq_tokens; // tokens per sequence
+    uint32_t n_seqs;
+    llama_token  *  token;    // [n_tokens]
+    float        *  embd;     // [n_embd, n_tokens]
+    llama_pos    *  pos;      // [n_tokens]
+    int32_t      *  n_seq_id; // [n_seqs]
+    llama_seq_id ** seq_id;   // [n_seqs]
+    int8_t       *  output;   // [n_tokens]
 };
-// a helper for sanitizing, fulfilling and splitting a batch
-class llama_batch_allocr {
-public:
-    llama_batch_allocr(uint32_t n_pos_per_embd);
-    // sanitize and auto-gen missing data in the input batch
-    // memory is optional. if provided will be used to check for sequence continuity and to determine the positions
-    bool init(
-            const llama_batch & batch_inp,
-            const llama_vocab & vocab,
-            const llama_memory_i * memory,
-            uint32_t n_embd,
-            bool output_all);
-    const llama_batch & get_batch() const;
-    uint32_t get_n_tokens() const;
-    uint32_t get_n_outputs() const;
-    // the array of output indices in the order they were encountered during the ubatch splitting
-    std::vector<int32_t> & get_out_ids();
-    // min/max positions of each sequence in the current ubatch
-    llama_pos seq_pos_min(llama_seq_id seq_id) const;
-    llama_pos seq_pos_max(llama_seq_id seq_id) const;
-    // call once before splitting the batch to reset the internal state
-    void split_reset();
-    // simple split, unknown number of sequence sets of unequal lengths
-    llama_ubatch split_simple(uint32_t n_ubatch);
-    // make ubatches of equal-length sequences sets
-    llama_ubatch split_equal(uint32_t n_ubatch);
-    // sequence-set-wise split - each ubatch contains a single sequence-set
-    llama_ubatch split_seq(uint32_t n_ubatch);
-    // a helper method for creating a well-defined ubatch of tokens
-    // TODO: support embeddings if needed in the future
-    llama_ubatch ubatch_reserve(uint32_t n_seq_tokens, uint32_t n_seqs);
-private:
-    void clear();
-    // create the next ubatch based on the provided batch indices (idxs) and the number of sequence sets (n_seqs)
-    // return llama_ubatch.n_tokens == 0 if the entire batch was consumed
-    llama_ubatch ubatch_add(const std::vector<int32_t> & idxs, uint32_t n_seqs, bool equal_seqs);
-    // for debugging, start with LLAMA_BATCH_DEBUG=2
-    void ubatch_print(const llama_ubatch & ubatch, int debug);
-    llama_batch batch;
-    // only for debugging purposes
-    const llama_vocab * vocab;
-    // TODO: this is more of a temporary solution until we have a better way to handle multiple positions per token/embd
-    // ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
-    const uint32_t n_pos_per_embd;
-    uint32_t n_embd;
-    uint32_t n_outputs;
+struct llama_sbatch_seq {
+    int32_t n_seq_id;
+    llama_seq_id * seq_id;
+    size_t offset;
+    size_t length;
+};
+// sequence-length-aware batch splitting
+struct llama_sbatch {
+    // tokens left in this batch
+    size_t n_tokens;
+    size_t n_embd;
+    bool logits_all; // TODO: remove once lctx.logits_all is removed too
+    // sorted indices into the batch
+    std::vector<int64_t> ids;
+    // batch indices of the output
+    std::vector<int64_t> out_ids;
+    std::vector<llama_sbatch_seq> seq;
+    const llama_batch * batch = nullptr;
+    // buffers for the ubatch
+    std::vector<llama_token>    ubatch_token;
+    std::vector<float>          ubatch_embd;
+    std::vector<llama_pos>      ubatch_pos;
+    std::vector<int32_t>        ubatch_n_seq_id;
+    std::vector<llama_seq_id *> ubatch_seq_id;
+    std::vector<int8_t>         ubatch_output;
+    llama_ubatch reserve_ubatch(size_t n_ubatch, bool has_embd = false);
+    void add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & seq, size_t length);
+    // simple split, unknown number of sequences of unequal lengths
+    llama_ubatch split_simple(size_t n_ubatch);
+    // make batches of equal-length sequences
+    llama_ubatch split_equal(size_t n_ubatch);
+    // sequence-wise split
+    llama_ubatch split_seq(size_t n_ubatch);
+    llama_sbatch() = default;
+    llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
+};
+// temporary allocate memory for the input batch if needed
+struct llama_batch_allocr {
+    struct llama_batch batch;
     std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
     std::vector<llama_pos>      pos;
     std::vector<int32_t>        n_seq_id;
     std::vector<llama_seq_id *> seq_id;
-    std::vector<llama_seq_id>   seq_id_unq;
-    std::vector<int32_t>        seq_idx;
-    std::vector<int8_t>         output;
-    using pos_set_t = std::set<llama_pos>;
-    using seq_cpl_t = std::vector<bool>;
-    std::vector<pos_set_t> seq_pos; // seq_pos[s]: the set of positions in sequence s
-    std::vector<seq_cpl_t> seq_cpl; // seq_cpl[s0][s1]: if sequence s0 is coupled to sequence s1
-    using idx_vec_t = std::vector<int32_t>;
-    using seq_set_t = std::bitset<LLAMA_MAX_SEQ>;
-    std::vector<seq_set_t> seq_set; // seq_set[i]: the sequence set of token i
-    std::unordered_map<seq_set_t, idx_vec_t> seq_set_map; // the indices at which the sequence set appears
-    // batch indices of the output
-    std::vector<int32_t> out_ids;
-    // used[i] indicates if token i has already been used in a previous ubatch
-    std::vector<bool> used;
-    // llama_ubatch points to this data:
-    struct ubatch {
-        std::vector<llama_token>    token;
-        std::vector<float>          embd;
-        std::vector<llama_pos>      pos;
-        std::vector<int32_t>        n_seq_id;
-        std::vector<llama_seq_id *> seq_id;
-        std::vector<llama_seq_id>   seq_id_unq;
-        std::vector<int32_t>        seq_idx;
-        std::vector<int8_t>         output;
-    };
-    // current splitting state:
-    std::vector<ubatch> ubatches;
-    int debug;
+    std::vector<int8_t>         logits;
+    // optionally fulfill the batch returned by llama_batch_get_one
+    llama_batch_allocr(struct llama_batch in_batch, llama_pos p0);
 };
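
For context, the llama_sbatch side of this hunk is typically driven as a loop that keeps splitting until the batch is exhausted. A usage sketch (abridged; assumes llama.cpp internals such as batch, n_embd and n_ubatch are in scope, so this is not standalone code):

// sketch: how llama_sbatch splitting is consumed inside decode
llama_sbatch sbatch(batch, n_embd, /*simple_split=*/true, /*logits_all=*/false);
while (sbatch.n_tokens > 0) {
    llama_ubatch ubatch = sbatch.split_simple(n_ubatch); // or split_equal / split_seq
    // ... build and compute the graph for this ubatch ...
}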


@@ -183,8 +183,6 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_BAILING;
     } else if (tmpl_contains("<|header_start|>") && tmpl_contains("<|header_end|>")) {
         return LLM_CHAT_TEMPLATE_LLAMA4;
-    } else if (tmpl_contains("<|endofuserprompt|>")) {
-        return LLM_CHAT_TEMPLATE_DOTS1;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -333,7 +331,7 @@ int32_t llm_chat_apply_template(
         std::string role(message->role);
         if (role == "system") {
             // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
-            system_prompt += trim(message->content);
+            system_prompt = trim(message->content);
             continue;
         }
         // in gemma, "assistant" is "model"
@@ -355,7 +353,7 @@ int32_t llm_chat_apply_template(
         std::string role(message->role);
         if (role == "system") {
             // there is no system message support, we will merge it with user prompt
-            system_prompt += message->content;
+            system_prompt = message->content;
             continue;
         } else if (role == "user") {
             ss << "Human: ";
@@ -645,21 +643,6 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "Assistant:";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_DOTS1) {
-        // dots.llm1.inst (DOTS1)
-        for (auto message : chat) {
-            std::string role(message->role);
-            if (role == "system") {
-                ss << "<|system|>" << message->content << "<|endofsystem|>";
-            } else if (role == "user") {
-                ss << "<|userprompt|>" << message->content << "<|endofuserprompt|>";
-            } else {
-                ss << "<|response|>" << message->content << "<|endofresponse|>";
-            }
-        }
-        if (add_ass) {
-            ss << "<|response|>";
-        }
     } else {
         // template not supported
         return -1;
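
Note on the two one-line system_prompt hunks above: with +=, successive system messages accumulate into one merged prompt; with =, each new system message replaces the previous one. In miniature:

std::string system_prompt;
system_prompt += "You are helpful. "; // accumulate: earlier text is kept
system_prompt += "Answer briefly.";   // -> "You are helpful. Answer briefly."

system_prompt = "You are helpful. ";  // overwrite: earlier text is lost
system_prompt = "Answer briefly.";    // -> "Answer briefly."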


@@ -43,7 +43,6 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_BAILING,
     LLM_CHAT_TEMPLATE_LLAMA4,
     LLM_CHAT_TEMPLATE_SMOLVLM,
-    LLM_CHAT_TEMPLATE_DOTS1,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };

File diff suppressed because it is too large


@@ -1,6 +1,7 @@
 #pragma once

 #include "llama.h"
+#include "llama-batch.h"
 #include "llama-cparams.h"
 #include "llama-graph.h"
 #include "llama-adapter.h"
@@ -12,14 +13,11 @@
 #include <vector>

 struct llama_model;
-class llama_batch_allocr;
+struct llama_kv_cache;
 class llama_io_read_i;
 class llama_io_write_i;
-struct llama_memory_i;
-struct llama_memory_state_i;

 struct llama_context {
     // init scheduler and compute buffers, reserve worst-case graphs
     llama_context(
@@ -46,12 +44,10 @@ struct llama_context {
     uint32_t n_threads() const;
     uint32_t n_threads_batch() const;

-    llama_memory_t get_memory() const;
+    llama_kv_cache * get_kv_self();
+    const llama_kv_cache * get_kv_self() const;

-    // return true of the KV cache was updated
-    // TODO: remove
-    bool kv_self_update(bool optimize);
-    void kv_self_defrag_sched();
+    void kv_self_update();

     enum llama_pooling_type pooling_type() const;
@@ -92,18 +88,8 @@ struct llama_context {
             int32_t il_start,
             int32_t il_end);

-    // process a single ubatch with a specific graph type
-    // if memory_state is provided, it will be applied first to the context's memory
-    // ret contains the status of the graph computation
-    // returns nullptr only if ret != GGML_STATUS_SUCCESS
-    llm_graph_result_ptr process_ubatch(
-            const llama_ubatch & ubatch,
-            llm_graph_type gtype,
-            llama_memory_state_i * mstate,
-            ggml_status & ret);
-
-    int encode(const llama_batch & batch_inp);
-    int decode(const llama_batch & batch_inp);
+    int encode(llama_batch & inp_batch);
+    int decode(llama_batch & inp_batch);

     //
     // state save/load
@@ -181,7 +167,7 @@ private:
     // Make sure enough space is available for outputs.
     // Returns max number of outputs for which space was reserved.
-    uint32_t output_reserve(int32_t n_outputs);
+    int32_t output_reserve(int32_t n_outputs);

     //
     // graph
@@ -194,18 +180,16 @@ public:
     ggml_cgraph * graph_init();

     // returns the result of ggml_backend_sched_graph_compute_async execution
-    ggml_status graph_compute(ggml_cgraph * gf, bool batched);
-
-    // reserve a graph with a dummy ubatch of the specified size
-    ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_state_i * mstate);
+    ggml_status graph_compute(
+            ggml_cgraph * gf,
+            bool batched);

 private:
     llm_graph_result_ptr graph_build(
             ggml_context * ctx,
             ggml_cgraph * gf,
             const llama_ubatch & ubatch,
-            llm_graph_type gtype,
-            const llama_memory_state_i * mstate);
+            llm_graph_type gtype);

     llm_graph_cb graph_get_cb() const;
@@ -230,9 +214,6 @@ private:
     std::unique_ptr<llama_memory_i> memory;

-    // TODO: temporary, until the llama_kv_self_defrag() API is removed
-    bool memory_force_optimize = false;
-
     // decode output (2-dimensional array: [n_outputs][n_vocab])
     size_t logits_size = 0; // capacity (of floats) for logits
     float * logits = nullptr;
@@ -246,10 +227,8 @@ private:
     // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
     std::map<llama_seq_id, std::vector<float>> embd_seq;

-    // reuse the batch_allocr to avoid unnecessary memory allocations
-    std::unique_ptr<llama_batch_allocr> balloc;
-
-    uint32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
+    int32_t n_outputs     = 0; // number of actually-used outputs in the current ubatch or last logical batch
+    int32_t n_outputs_max = 0; // capacity (of tokens positions) for the output buffers

     std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers


@@ -1,5 +1 @@
 #include "llama-cparams.h"
-
-size_t llama_max_parallel_sequences(void) {
-    return LLAMA_MAX_SEQ;
-}


@@ -4,8 +4,6 @@
 #include <cstdint>

-#define LLAMA_MAX_SEQ 64
-
 struct llama_cparams {
     uint32_t n_ctx; // context size used during inference
     uint32_t n_batch;


@@ -1177,18 +1177,8 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
     for (const auto & trigger_pattern : grammar.trigger_patterns) {
         if (std::regex_match(grammar.trigger_buffer, match, trigger_pattern.regex)) {
             grammar.awaiting_trigger = false;
-            // get from the first matched capturing group to the end of the string
-            size_t start = std::string::npos;
-            for (auto i = 1u; i < match.size(); i++) {
-                if (match.length(i) > 0) {
-                    start = match.position(i);
-                    break;
-                }
-            }
-            if (start == std::string::npos) {
-                start = match.position(0);
-            }
-            auto constrained_str = grammar.trigger_buffer.substr(start);
+            // get from the first match to the end of the string
+            auto constrained_str = grammar.trigger_buffer.substr(match.position(1));
             // std::string constrained_str(match[1].first, grammar.trigger_buffer.end());
             grammar.trigger_buffer.clear();
             llama_grammar_accept_str(grammar, constrained_str);
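
The removed block generalizes the trigger handling: instead of always taking match.position(1), it scans the capture groups for the first one that actually matched and falls back to the whole match. A self-contained sketch of that scan:

#include <regex>
#include <string>

// returns the offset where the constrained text should start, following the
// first-non-empty-capture-group rule from the hunk above
static size_t constrained_start(const std::smatch & match) {
    for (size_t i = 1; i < match.size(); i++) {
        if (match.length(i) > 0) {
            return match.position(i); // first capturing group that matched
        }
    }
    return match.position(0); // no group matched: start of the full match
}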

File diff suppressed because it is too large


@@ -17,12 +17,9 @@ struct ggml_tensor;
 struct llama_ubatch;
 struct llama_cparams;

-struct llama_memory_state_i;
-class llama_kv_cache_unified_state;
-class llama_kv_cache_unified_iswa_state;
-class llama_memory_recurrent_state;
-class llama_memory_hybrid_state;
+class llama_memory_i;
+class llama_kv_cache_unified;
+class llama_kv_cache_recurrent;

 // certain models (typically multi-modal) can produce different types of graphs
 enum llm_graph_type {
@@ -37,7 +34,6 @@ enum llm_ffn_op_type {
     LLM_FFN_RELU,
     LLM_FFN_RELU_SQR,
     LLM_FFN_SWIGLU,
-    LLM_FFN_GEGLU,
 };

 enum llm_ffn_gate_type {
@@ -95,14 +91,14 @@ public:
 class llm_graph_input_pos : public llm_graph_input_i {
 public:
-    llm_graph_input_pos(uint32_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
+    llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
     virtual ~llm_graph_input_pos() = default;

     void set_input(const llama_ubatch * ubatch) override;

     ggml_tensor * pos = nullptr; // I32 [n_batch]

-    const uint32_t n_pos_per_embd = 1;
+    const int64_t n_pos_per_embd = 1;
 };

 // temperature tuning, used by llama4
@@ -136,7 +132,7 @@ class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
 public:
     llm_graph_input_pos_bucket_kv(
             const llama_hparams & hparams,
-            const llama_kv_cache_unified_state * kv_state) : hparams(hparams), kv_state(kv_state) {}
+            const llama_kv_cache_unified * kv_self) : hparams(hparams), kv_self(kv_self) {}
     virtual ~llm_graph_input_pos_bucket_kv() = default;

     void set_input(const llama_ubatch * ubatch) override;
@@ -144,7 +140,7 @@ public:
     ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]

     const llama_hparams & hparams;
-    const llama_kv_cache_unified_state * kv_state;
+    const llama_kv_cache_unified * kv_self;
 };

 class llm_graph_input_out_ids : public llm_graph_input_i {
@@ -189,16 +185,28 @@ public:
     const llama_cparams & cparams;
 };

-class llm_graph_input_rs : public llm_graph_input_i {
+class llm_graph_input_s_copy : public llm_graph_input_i {
 public:
-    llm_graph_input_rs(const llama_memory_recurrent_state * mem_state) : mem_state(mem_state) {}
-    virtual ~llm_graph_input_rs() = default;
+    llm_graph_input_s_copy(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
+    virtual ~llm_graph_input_s_copy() = default;

     void set_input(const llama_ubatch * ubatch) override;

     ggml_tensor * s_copy; // I32 [kv_size]

-    const llama_memory_recurrent_state * mem_state;
+    const llama_kv_cache_recurrent * kv_self;
+};
+
+class llm_graph_input_s_mask : public llm_graph_input_i {
+public:
+    llm_graph_input_s_mask(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
+    virtual ~llm_graph_input_s_mask() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * s_mask; // F32 [1, n_kv]
+
+    const llama_kv_cache_recurrent * kv_self;
 };

 class llm_graph_input_cross_embd : public llm_graph_input_i {
@@ -238,40 +246,15 @@ public:
     llm_graph_input_attn_kv_unified(
             const llama_hparams & hparams,
             const llama_cparams & cparams,
-            const llama_kv_cache_unified_state * kv_state) :
+            const llama_kv_cache_unified * kv_self) :
         hparams(hparams),
         cparams(cparams),
-        kv_state(kv_state) {
+        kv_self(kv_self) {
     }

     ~llm_graph_input_attn_kv_unified() = default;

     void set_input(const llama_ubatch * ubatch) override;

-    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
-
-    ggml_tensor * self_kq_mask = nullptr;     // F32 [n_kv, n_batch]
-    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch]
-
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
-
-    const llama_kv_cache_unified_state * kv_state;
-};
-
-class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
-public:
-    llm_graph_input_attn_kv_unified_iswa(
-            const llama_hparams & hparams,
-            const llama_cparams & cparams,
-            const llama_kv_cache_unified_iswa_state * kv_state) :
-        hparams(hparams),
-        cparams(cparams),
-        kv_state(kv_state) {
-    }
-
-    ~llm_graph_input_attn_kv_unified_iswa() = default;
-
-    void set_input(const llama_ubatch * ubatch) override;
-
     ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
     ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
@@ -283,7 +266,7 @@ public:
     const llama_hparams & hparams;
     const llama_cparams & cparams;

-    const llama_kv_cache_unified_iswa_state * kv_state;
+    const llama_kv_cache_unified * kv_self;
 };

 class llm_graph_input_attn_cross : public llm_graph_input_i {
@@ -301,33 +284,6 @@ public:
     const llama_cross * cross = nullptr;
 };

-class llm_graph_input_mem_hybrid : public llm_graph_input_i {
-public:
-    llm_graph_input_mem_hybrid(
-            const llama_hparams & hparams,
-            const llama_cparams & cparams,
-            const llama_memory_hybrid_state * mem_state) :
-        hparams(hparams),
-        cparams(cparams),
-        mem_state(mem_state) {
-    }
-
-    virtual ~llm_graph_input_mem_hybrid() = default;
-
-    void set_input(const llama_ubatch * ubatch) override;
-
-    ggml_tensor * s_copy; // I32 [kv_size]
-
-    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
-
-    ggml_tensor * self_kq_mask = nullptr;     // F32 [n_kv, n_batch]
-    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch]
-
-    const llama_hparams & hparams;
-    const llama_cparams & cparams;
-
-    const llama_memory_hybrid_state * mem_state;
-};
-
 //
 // llm_graph_result
 //
@@ -401,12 +357,12 @@ struct llm_graph_params {
     ggml_backend_sched_t sched;
     ggml_backend_t backend_cpu;

     const llama_adapter_cvec * cvec;
     const llama_adapter_loras * loras;
-    const llama_memory_state_i * mstate;
+    const llama_memory_i * memory;
     const llama_cross * cross;

-    uint32_t n_outputs;
+    int32_t n_outputs;

     const llm_graph_cb & cb;
 };
@@ -422,6 +378,7 @@ struct llm_graph_context {
     const int64_t n_layer;
     const int64_t n_rot;
     const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
+    const int64_t n_ctx_per_seq;
     const int64_t n_head;
     const int64_t n_head_kv;
     const int64_t n_embd_head_k;
@@ -440,8 +397,8 @@ struct llm_graph_context {
     const float norm_eps;
     const float norm_rms_eps;

-    const int64_t n_tokens;
-    const int64_t n_outputs;
+    const int32_t n_tokens;
+    const int32_t n_outputs;
     const int32_t n_ctx_orig; // yarn

     const enum llama_pooling_type pooling_type;
@@ -453,10 +410,10 @@ struct llm_graph_context {
     ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?

     const llama_adapter_cvec * cvec;
     const llama_adapter_loras * loras;
-    const llama_memory_state_i * mstate;
+    const llama_memory_i * memory;
     const llama_cross * cross;

     const llm_graph_cb & cb_func;
@@ -464,6 +421,8 @@ struct llm_graph_context {
     llm_graph_context(const llm_graph_params & params);

+    int64_t n_pos_per_embd() const;
+
     void cb(ggml_tensor * cur, const char * name, int il) const;

     //
@@ -534,26 +493,27 @@ struct llm_graph_context {
     ggml_tensor * build_inp_out_ids() const;
     ggml_tensor * build_inp_mean() const;
     ggml_tensor * build_inp_cls() const;
+    ggml_tensor * build_inp_s_copy() const;
+    ggml_tensor * build_inp_s_mask() const;

     ggml_tensor * build_inp_cross_embd() const;
     ggml_tensor * build_inp_pos_bucket_enc() const;
     ggml_tensor * build_inp_pos_bucket_dec() const;
     ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const;

-    llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;
-
     //
     // attention
     //

     ggml_tensor * build_attn_mha(
             ggml_cgraph * gf,
-            ggml_tensor * q,      // [n_embd_head_q, n_head_q, n_tokens]
-            ggml_tensor * k,      // [n_embd_head_k, n_head_k, n_tokens]
-            ggml_tensor * v,      // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
+            ggml_tensor * q,      // [n_embd_head_q, n_tokens, n_head_q]
+            ggml_tensor * k,      // [n_embd_head_k, n_tokens, n_head_k]
+            ggml_tensor * v,      // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false)
             ggml_tensor * kq_b,
             ggml_tensor * kq_mask,
             ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+            bool v_trans,
             float kq_scale) const;

     llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
@@ -586,21 +546,6 @@ struct llm_graph_context {
             float kq_scale,
             int il) const;

-    llm_graph_input_attn_kv_unified_iswa * build_attn_inp_kv_unified_iswa() const;
-
-    ggml_tensor * build_attn(
-            llm_graph_input_attn_kv_unified_iswa * inp,
-            ggml_cgraph * gf,
-            ggml_tensor * wo,
-            ggml_tensor * wo_b,
-            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
-            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
-            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
-            ggml_tensor * kq_b,
-            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
-            float kq_scale,
-            int il) const;
-
     llm_graph_input_attn_cross * build_attn_inp_cross() const;

     ggml_tensor * build_attn(
@@ -616,62 +561,23 @@ struct llm_graph_context {
             float kq_scale,
             int il) const;

-    ggml_tensor * build_attn(
-            llm_graph_input_mem_hybrid * inp,
-            ggml_cgraph * gf,
-            ggml_tensor * wo,
-            ggml_tensor * wo_b,
-            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
-            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
-            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
-            ggml_tensor * kq_b,
-            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
-            float kq_scale,
-            int il) const;
-
     //
     // recurrent
     //

-    // TODO: avoid notion of "kv"
-    // TODO: move this implementation to llama_memory_recurrent.
-    //       this is analogous to llama_kv_cache_unified::cpy_k / cpy_v
-    //       when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the
-    //       implementation in 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in
-    //       `llama_memory_recurrent`
-    ggml_tensor * build_rs(
-            ggml_cgraph * gf,
-            ggml_tensor * s,
-            ggml_tensor * state_copy,
-            int32_t state_size,
-            int32_t n_seqs,
-            uint32_t n_kv,
-            uint32_t kv_head,
-            uint32_t kv_size,
-            int32_t rs_zero,
-            bool avoid_copies = false) const;
-
-    llm_graph_input_rs * build_rs_inp() const;
-
-    ggml_tensor * build_rs(
-            llm_graph_input_rs * inp,
-            ggml_cgraph * gf,
-            ggml_tensor * s,
-            int32_t state_size,
-            int32_t n_seqs,
-            bool avoid_copies = false) const;
-
-    ggml_tensor * build_rs(
-            llm_graph_input_mem_hybrid * inp,
-            ggml_cgraph * gf,
-            ggml_tensor * s,
-            int32_t state_size,
-            int32_t n_seqs,
-            bool avoid_copies = false) const;
+    ggml_tensor * build_copy_mask_state(
+            ggml_cgraph * gf,
+            ggml_tensor * s,
+            ggml_tensor * state_copy,
+            ggml_tensor * state_mask,
+            int32_t n_state,
+            int32_t n_seqs) const;

     ggml_tensor * build_rwkv_token_shift_load(
-            llm_graph_input_rs * inp,
-            ggml_cgraph * gf,
-            const llama_ubatch & ubatch,
+            ggml_cgraph * gf,
+            ggml_tensor * state_copy,
+            ggml_tensor * state_mask,
+            const llama_ubatch & ubatch,
             int il) const;

     ggml_tensor * build_rwkv_token_shift_store(
@@ -690,6 +596,3 @@ struct llm_graph_context {
             ggml_tensor * cls_out,
             ggml_tensor * cls_out_b) const;
 };
-
-// TODO: better name
-int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional);
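
llama_relative_position_bucket(), declared on the removed side above, implements the usual T5-style relative-position bucketing: small distances get exact buckets and larger ones are binned logarithmically. The following is a generic sketch of that scheme for illustration; the constants (e.g. max_distance) and the int32_t n_buckets parameter are assumptions, not the exact llama.cpp implementation:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdlib>

static int32_t relative_position_bucket(int32_t x, int32_t y, int32_t n_buckets, bool bidirectional) {
    const int32_t max_distance = 128; // assumption for illustration

    int32_t rel    = x - y;
    int32_t bucket = 0;

    if (bidirectional) {
        n_buckets /= 2;
        bucket += (rel > 0) ? n_buckets : 0; // one half of the buckets per direction
        rel = std::abs(rel);
    } else {
        rel = -std::min(rel, 0); // causal: only look backwards
    }

    const int32_t max_exact = n_buckets / 2;
    if (rel < max_exact) {
        bucket += rel; // small distances get their own bucket
    } else {
        // larger distances are binned logarithmically up to max_distance
        int32_t log_bucket = max_exact + (int32_t) (std::log((float) rel / max_exact) /
                                                    std::log((float) max_distance / max_exact) * (n_buckets - max_exact));
        bucket += std::min(log_bucket, n_buckets - 1);
    }

    return bucket;
}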


@@ -2,22 +2,6 @@
 #include "ggml.h"

-void llama_hparams::set_swa_pattern(uint32_t n_pattern) {
-    for (uint32_t il = 0; il < n_layer; ++il) {
-        swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
-    }
-}
-
-bool llama_hparams::is_swa_any() const {
-    for (uint32_t il = 0; il < n_layer; ++il) {
-        if (swa_layers[il]) {
-            return true;
-        }
-    }
-
-    return false;
-}
-
 uint32_t llama_hparams::n_head(uint32_t il) const {
     if (il < n_layer) {
         return n_head_arr[il];
@@ -65,7 +49,7 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
     return n_embd_head_v * n_head_kv;
 }

-uint32_t llama_hparams::n_embd_r() const {
+uint32_t llama_hparams::n_embd_k_s() const {
     if (wkv_head_size != 0) {
         // for RWKV models
         return token_shift_count * n_embd;
@@ -76,7 +60,7 @@ uint32_t llama_hparams::n_embd_r() const {
     return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
 }

-uint32_t llama_hparams::n_embd_s() const {
+uint32_t llama_hparams::n_embd_v_s() const {
     if (wkv_head_size != 0) {
         // corresponds to RWKV's wkv_states size
         return n_embd * wkv_head_size;
@@ -86,17 +70,9 @@ uint32_t llama_hparams::n_embd_s() const {
     return ssm_d_state * ssm_d_inner;
 }

-bool llama_hparams::is_recurrent(uint32_t il) const {
-    return recurrent_layer_arr[il];
-}
-
-uint32_t llama_hparams::n_pos_per_embd() const {
-    return rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
-}
-
 bool llama_hparams::is_swa(uint32_t il) const {
     if (il < n_layer) {
-        return swa_layers[il];
+        return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
     }

     GGML_ABORT("fatal error");
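
The rewritten is_swa() above inlines the same pattern that set_swa_pattern() precomputes on the removed side; both encode "every n_pattern-th layer is dense, the rest use sliding-window attention". A tiny runnable check of the predicate:

#include <cstdint>
#include <cstdio>

// same predicate as set_swa_pattern()/is_swa() in the hunks above:
// n_pattern == 0 -> every layer is SWA; n_pattern == 1 -> every layer is dense
static bool layer_is_swa(uint32_t il, uint32_t n_pattern) {
    return n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
}

int main() {
    for (uint32_t il = 0; il < 6; ++il) {
        // with n_pattern == 3: layers 0,1 are swa, layer 2 is dense, repeating
        printf("layer %u: %s\n", il, layer_is_swa(il, 3) ? "swa" : "dense");
    }
    return 0;
}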


@@ -14,12 +14,6 @@ enum llama_expert_gating_func_type {
     LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
 };

-enum llama_swa_type {
-    LLAMA_SWA_TYPE_NONE     = 0,
-    LLAMA_SWA_TYPE_STANDARD = 1,
-    LLAMA_SWA_TYPE_CHUNKED  = 2,
-};
-
 struct llama_hparams_posnet {
     uint32_t n_embd;
     uint32_t n_layer;
@@ -41,6 +35,8 @@ struct llama_hparams {
     uint32_t n_embd_features = 0;
     uint32_t n_layer;
     uint32_t n_rot;
+    uint32_t n_swa = 0; // sliding window attention (SWA)
+    uint32_t n_swa_pattern = 1; // by default, all layers use non-sliding-window attention
     uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
     uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
     uint32_t n_expert = 0;
@@ -100,24 +96,12 @@ struct llama_hparams {
     std::array<int, 4> rope_sections;

-    // Sliding Window Attention (SWA)
-    llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
-
-    // the size of the sliding window (0 - no SWA)
-    uint32_t n_swa = 0;
-
-    // if swa_layers[il] == true, then layer il is SWA
-    // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
-    // by default, all layers are dense
-    std::array<bool, LLAMA_MAX_LAYERS> swa_layers;
-
     // for State Space Models
     uint32_t ssm_d_conv  = 0;
     uint32_t ssm_d_inner = 0;
     uint32_t ssm_d_state = 0;
     uint32_t ssm_dt_rank = 0;

-    // for hybrid state space models
-    std::array<bool, LLAMA_MAX_LAYERS> recurrent_layer_arr;
-
     bool ssm_dt_b_c_rms = false;

     float f_clamp_kqv = 0.0f;
@@ -132,13 +116,11 @@ struct llama_hparams {
     bool causal_attn   = true;
     bool use_alibi     = false;
     bool attn_soft_cap = false;
-    bool use_kq_norm   = true;
-
-    // for Classifiers
-    uint32_t n_cls_out = 1;

-    // llama4
     uint32_t n_moe_layer_step = 0;
+    bool     use_kq_norm = true;
+    uint32_t n_attn_chunk = 0;
+    // values below seems to be fixed on llama4
     uint32_t n_no_rope_layer_step = 4;
     uint32_t n_attn_temp_floor_scale = 8192;
     float    f_attn_temp_scale = 0.1;
@@ -151,23 +133,6 @@ struct llama_hparams {
     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
     enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;

-    // this value n_pattern means that every nth layer is dense (i.e. non-SWA)
-    // note that if n_pattern == 0, all layers are SWA
-    //           if n_pattern == 1, all layers are dense
-    // example: n_pattern = 3
-    //   il == 0: swa
-    //   il == 1: swa
-    //   il == 2: dense
-    //   il == 3: swa
-    //   il == 4: swa
-    //   il == 5: dense
-    //   il == 6: swa
-    //   etc ...
-    void set_swa_pattern(uint32_t n_pattern);
-
-    // return true if one of the layers is SWA
-    bool is_swa_any() const;
-
     uint32_t n_head(uint32_t il = 0) const;

     uint32_t n_head_kv(uint32_t il = 0) const;
@@ -184,15 +149,10 @@ struct llama_hparams {
     // dimension of the rolling state embeddings
     // corresponds to Mamba's conv_states size or RWKV's token_shift states size
-    uint32_t n_embd_r() const;
+    uint32_t n_embd_k_s() const;

     // dimension of the recurrent state embeddings
-    uint32_t n_embd_s() const;
-
-    // whether or not the given layer is recurrent (for hybrid models)
-    bool is_recurrent(uint32_t il) const;
-
-    uint32_t n_pos_per_embd() const;
+    uint32_t n_embd_v_s() const;

     bool is_swa(uint32_t il) const;
 };


@@ -1,279 +0,0 @@
#include "llama-kv-cache-unified-iswa.h"
#include "llama-impl.h"
#include "llama-batch.h"
#include "llama-model.h"
#include <algorithm>
#include <cassert>
//
// llama_kv_cache_unified_iswa
//
llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
const llama_model & model,
ggml_type type_k,
ggml_type type_v,
bool v_trans,
bool offload,
bool swa_full,
uint32_t kv_size,
uint32_t n_seq_max,
uint32_t n_ubatch,
uint32_t n_pad) : hparams(model.hparams) {
llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
llama_kv_cache_unified::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); };
const uint32_t size_base = kv_size;
uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_ubatch, n_pad));
// when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
if (swa_full) {
LLAMA_LOG_WARN("%s: using full-size SWA cache (ref: %s)\n",
__func__, "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
size_swa = size_base;
}
LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);
kv_base = std::make_unique<llama_kv_cache_unified>(
model, std::move(filter_base), type_k, type_v,
v_trans, offload, size_base, n_seq_max, n_pad,
0, LLAMA_SWA_TYPE_NONE);
LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
kv_swa = std::make_unique<llama_kv_cache_unified>(
model, std::move(filter_swa), type_k, type_v,
v_trans, offload, size_swa, n_seq_max, n_pad,
hparams.n_swa, hparams.swa_type);
}
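// back-of-the-envelope check of the SWA sizing above, with hypothetical
// numbers (not taken from the diff): n_swa = 4096, n_seq_max = 2,
// n_ubatch = 512, n_pad = 256 gives 4096*2 + 512 = 8704, and
// GGML_PAD(8704, 256) = 8704, so size_swa = 8704 cells - unless kv_size
// is smaller, in which case the std::min() above caps it at size_base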
void llama_kv_cache_unified_iswa::clear(bool data) {
kv_base->clear(data);
kv_swa ->clear(data);
}
bool llama_kv_cache_unified_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
bool res = true;
res = res & kv_base->seq_rm(seq_id, p0, p1);
res = res & kv_swa ->seq_rm(seq_id, p0, p1);
return res;
}
void llama_kv_cache_unified_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
kv_base->seq_cp(seq_id_src, seq_id_dst, p0, p1);
kv_swa ->seq_cp(seq_id_src, seq_id_dst, p0, p1);
}
void llama_kv_cache_unified_iswa::seq_keep(llama_seq_id seq_id) {
kv_base->seq_keep(seq_id);
kv_swa ->seq_keep(seq_id);
}
void llama_kv_cache_unified_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
kv_base->seq_add(seq_id, p0, p1, shift);
kv_swa ->seq_add(seq_id, p0, p1, shift);
}
void llama_kv_cache_unified_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
kv_base->seq_div(seq_id, p0, p1, d);
kv_swa ->seq_div(seq_id, p0, p1, d);
}
llama_pos llama_kv_cache_unified_iswa::seq_pos_min(llama_seq_id seq_id) const {
// the base cache is a superset of the SWA cache, so we can just check the SWA cache
return kv_swa->seq_pos_min(seq_id);
}
llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const {
return kv_swa->seq_pos_max(seq_id);
}
llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
GGML_UNUSED(embd_all);
// first try simple split
do {
balloc.split_reset();
std::vector<llama_ubatch> ubatches;
while (true) {
auto ubatch = balloc.split_simple(n_ubatch);
if (ubatch.n_tokens == 0) {
break;
}
ubatches.push_back(std::move(ubatch)); // NOLINT
}
auto heads_base = kv_base->prepare(ubatches);
if (heads_base.empty()) {
break;
}
auto heads_swa = kv_swa->prepare(ubatches);
if (heads_swa.empty()) {
break;
}
assert(heads_base.size() == heads_swa.size());
return std::make_unique<llama_kv_cache_unified_iswa_state>(
this, std::move(heads_base), std::move(heads_swa), std::move(ubatches));
} while (false);
// if it fails, try equal split
do {
balloc.split_reset();
std::vector<llama_ubatch> ubatches;
while (true) {
auto ubatch = balloc.split_equal(n_ubatch);
if (ubatch.n_tokens == 0) {
break;
}
ubatches.push_back(std::move(ubatch)); // NOLINT
}
auto heads_base = kv_base->prepare(ubatches);
if (heads_base.empty()) {
break;
}
auto heads_swa = kv_swa->prepare(ubatches);
if (heads_swa.empty()) {
break;
}
assert(heads_base.size() == heads_swa.size());
return std::make_unique<llama_kv_cache_unified_iswa_state>(
this, std::move(heads_base), std::move(heads_swa), std::move(ubatches));
} while (false);
// TODO: if we fail again, we should attempt different splitting strategies
// but to do that properly, we first have to refactor the batches to be more flexible
return std::make_unique<llama_kv_cache_unified_iswa_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
}
llama_memory_state_ptr llama_kv_cache_unified_iswa::init_full() {
return std::make_unique<llama_kv_cache_unified_iswa_state>(this);
}
llama_memory_state_ptr llama_kv_cache_unified_iswa::init_update(llama_context * lctx, bool optimize) {
return std::make_unique<llama_kv_cache_unified_iswa_state>(this, lctx, optimize);
}
bool llama_kv_cache_unified_iswa::get_can_shift() const {
return kv_base->get_size() == kv_swa->get_size();
}
void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
kv_base->state_write(io, seq_id);
kv_swa ->state_write(io, seq_id);
}
void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
kv_base->state_read(io, seq_id);
kv_swa ->state_read(io, seq_id);
}
llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_base() const {
return kv_base.get();
}
llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_swa() const {
return kv_swa.get();
}
//
// llama_kv_cache_unified_iswa_state
//
llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(llama_memory_status status) : status(status) {}
llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(
llama_kv_cache_unified_iswa * kv) :
state_base(kv->get_base()->init_full()),
state_swa (kv->get_swa ()->init_full()),
status(llama_memory_status_combine(state_base->get_status(), state_swa->get_status())) {
}
llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(
llama_kv_cache_unified_iswa * kv,
llama_context * lctx,
bool optimize) :
state_base(kv->get_base()->init_update(lctx, optimize)),
state_swa (kv->get_swa ()->init_update(lctx, optimize)),
status(llama_memory_status_combine(state_base->get_status(), state_swa->get_status())) {
}
llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(
llama_kv_cache_unified_iswa * kv,
std::vector<uint32_t> heads_base,
std::vector<uint32_t> heads_swa,
std::vector<llama_ubatch> ubatches) :
ubatches(std::move(ubatches)),
// note: here we copy the ubatches. not sure if this is ideal
state_base(new llama_kv_cache_unified_state(kv->get_base(), std::move(heads_base), this->ubatches)),
state_swa (new llama_kv_cache_unified_state(kv->get_swa (), std::move(heads_swa), this->ubatches)),
status(llama_memory_status_combine(state_base->get_status(), state_swa->get_status())) {
}
llama_kv_cache_unified_iswa_state::~llama_kv_cache_unified_iswa_state() = default;
bool llama_kv_cache_unified_iswa_state::next() {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
state_base->next();
state_swa ->next();
if (++i_next >= ubatches.size()) {
return false;
}
return true;
}
bool llama_kv_cache_unified_iswa_state::apply() {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
bool res = true;
res = res & state_base->apply();
res = res & state_swa ->apply();
return res;
}
llama_memory_status llama_kv_cache_unified_iswa_state::get_status() const {
return status;
}
const llama_ubatch & llama_kv_cache_unified_iswa_state::get_ubatch() const {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
return ubatches[i_next];
}
const llama_kv_cache_unified_state * llama_kv_cache_unified_iswa_state::get_base() const {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
return static_cast<const llama_kv_cache_unified_state *>(state_base.get());
}
const llama_kv_cache_unified_state * llama_kv_cache_unified_iswa_state::get_swa() const {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
return static_cast<const llama_kv_cache_unified_state *>(state_swa.get());
}


@@ -1,128 +0,0 @@
#pragma once
#include "llama-kv-cache-unified.h"
#include <vector>
//
// llama_kv_cache_unified_iswa
//
// utilizes two instances of llama_kv_cache_unified
// the first instance is for the non-SWA layers of the model and the second instance is for the SWA layers
class llama_kv_cache_unified_iswa : public llama_memory_i {
public:
llama_kv_cache_unified_iswa(
const llama_model & model,
ggml_type type_k,
ggml_type type_v,
bool v_trans,
bool offload,
bool swa_full,
uint32_t kv_size,
uint32_t n_seq_max,
uint32_t n_ubatch,
uint32_t n_pad);
~llama_kv_cache_unified_iswa() = default;
//
// llama_memory_i
//
llama_memory_state_ptr init_batch(
llama_batch_allocr & balloc,
uint32_t n_ubatch,
bool embd_all) override;
llama_memory_state_ptr init_full() override;
llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) override;
bool get_can_shift() const override;
void clear(bool data) override;
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
void seq_keep(llama_seq_id seq_id) override;
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override;
void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
// state write/load
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
//
// llama_kv_cache_unified_iswa specific API
//
llama_kv_cache_unified * get_base() const;
llama_kv_cache_unified * get_swa () const;
private:
const llama_hparams & hparams;
std::unique_ptr<llama_kv_cache_unified> kv_base;
std::unique_ptr<llama_kv_cache_unified> kv_swa;
};
class llama_kv_cache_unified_iswa_state : public llama_memory_state_i {
public:
// used for errors
llama_kv_cache_unified_iswa_state(llama_memory_status status);
// used to create a full-cache state
llama_kv_cache_unified_iswa_state(
llama_kv_cache_unified_iswa * kv);
// used to create an update state
llama_kv_cache_unified_iswa_state(
llama_kv_cache_unified_iswa * kv,
llama_context * lctx,
bool optimize);
// used to create a state from a batch
llama_kv_cache_unified_iswa_state(
llama_kv_cache_unified_iswa * kv,
std::vector<uint32_t> heads_base,
std::vector<uint32_t> heads_swa,
std::vector<llama_ubatch> ubatches);
virtual ~llama_kv_cache_unified_iswa_state();
//
// llama_memory_state_i
//
bool next() override;
bool apply() override;
llama_memory_status get_status() const override;
const llama_ubatch & get_ubatch() const override;
//
// llama_kv_cache_unified_iswa_state specific API
//
const llama_kv_cache_unified_state * get_base() const;
const llama_kv_cache_unified_state * get_swa() const;
private:
//llama_kv_cache_unified_iswa * kv;
// the index of the next ubatch to process
size_t i_next = 0;
std::vector<llama_ubatch> ubatches;
const llama_memory_state_ptr state_base;
const llama_memory_state_ptr state_swa;
const llama_memory_status status;
};

File diff suppressed because it is too large


@@ -1,303 +0,0 @@
#pragma once
#include "llama-batch.h"
#include "llama-graph.h"
#include "llama-kv-cells.h"
#include "llama-memory.h"
#include <unordered_map>
#include <vector>
struct llama_cparams;
struct llama_hparams;
struct llama_model;
struct llama_context;
//
// llama_kv_cache_unified
//
class llama_kv_cache_unified : public llama_memory_i {
public:
static uint32_t get_padding(const llama_cparams & cparams);
// this callback is used to filter out layers that should not be included in the cache
using layer_filter_cb = std::function<bool(int32_t il)>;
using ubatch_heads = std::vector<uint32_t>;
struct defrag_info {
bool empty() const {
return ids.empty();
}
// contains information about which cell moves where:
// - cell i moves to ids[i]
// - if ids[i] == i || ids[i] == ids.size(), then cell i is not moved
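// example (hypothetical): 4 cells where cell 0 is an empty hole and
// cells 1..3 are used -> ids = { 4, 0, 1, 2 } compacts the cache:
// cell 0 is not moved (ids[0] == ids.size()), cell 1 -> 0, cell 2 -> 1, cell 3 -> 2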
std::vector<uint32_t> ids;
};
llama_kv_cache_unified(
const llama_model & model,
layer_filter_cb && filter,
ggml_type type_k,
ggml_type type_v,
bool v_trans,
bool offload,
uint32_t kv_size,
uint32_t n_seq_max,
uint32_t n_pad,
uint32_t n_swa,
llama_swa_type swa_type);
~llama_kv_cache_unified() = default;
//
// llama_memory_i
//
llama_memory_state_ptr init_batch(
llama_batch_allocr & balloc,
uint32_t n_ubatch,
bool embd_all) override;
llama_memory_state_ptr init_full() override;
llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) override;
bool get_can_shift() const override;
void clear(bool data) override;
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
void seq_keep(llama_seq_id seq_id) override;
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override;
void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
// state write/load
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
//
// llama_kv_cache_unified specific API
//
uint32_t get_size() const;
bool get_has_shift() const;
//
// graph_build API
//
uint32_t get_n_kv() const;
// get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const;
ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv) const;
// store k_cur and v_cur in the cache based on the provided head location
ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il, uint32_t head_cur) const;
ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il, uint32_t head_cur) const;
//
// preparation API
//
// find places for the provided ubatches in the cache, returns the head locations
// return empty vector on failure
ubatch_heads prepare(const std::vector<llama_ubatch> & ubatches);
bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo);
// return the cell position where we can insert the ubatch
// return -1 on failure to find a contiguous slot of kv cells
int32_t find_slot(const llama_ubatch & ubatch) const;
// emplace the ubatch context into slot: [head_cur, head_cur + ubatch.n_tokens)
void apply_ubatch(uint32_t head_cur, const llama_ubatch & ubatch);
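// sketch of the intended flow, inferred from this interface (not from the diff):
//   const auto heads = cache.prepare(ubatches);              // dry-run placement for all ubatches
//   cache.apply_ubatch(heads[i], ubatches[i]);               // later, commit ubatch i at its head
//   ggml_tensor * k = cache.cpy_k(ctx, k_cur, il, heads[i]); // and store its K/V there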
//
// set_input API
//
void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
void set_input_k_shift (ggml_tensor * dst) const;
void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
private:
const llama_model & model;
const llama_hparams & hparams;
struct kv_layer {
// layer index in the model
// note: can be different from the layer index in the KV cache
uint32_t il;
ggml_tensor * k;
ggml_tensor * v;
};
bool v_trans = true; // the value tensor is transposed
// the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot())
// note: this is not part of the KV state and it's only used to speed up the find_slot() method
uint32_t head = 0;
const uint32_t n_seq_max = 1;
// required padding
const uint32_t n_pad = 1;
// SWA
const uint32_t n_swa = 0;
int debug = 0;
const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs;
llama_kv_cells_unified cells;
std::vector<kv_layer> layers;
// model layer id -> KV cache layer id
std::unordered_map<int32_t, int32_t> map_layer_ids;
// return non-empty vector if cells have been moved
defrag_info defrag_prepare(int32_t n_max_nodes) const;
size_t total_size() const;
size_t size_k_bytes() const;
size_t size_v_bytes() const;
bool is_masked_swa(llama_pos p0, llama_pos p1) const;
ggml_tensor * build_rope_shift(
const llama_cparams & cparams,
ggml_context * ctx,
ggml_tensor * cur,
ggml_tensor * shift,
ggml_tensor * factors,
float freq_base,
float freq_scale) const;
llm_graph_result_ptr build_graph_shift(
const llama_cparams & cparams,
ggml_context * ctx,
ggml_cgraph * gf) const;
llm_graph_result_ptr build_graph_defrag(
const llama_cparams & cparams,
ggml_context * ctx,
ggml_cgraph * gf,
const defrag_info & dinfo) const;
void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
};
class llama_kv_cache_unified_state : public llama_memory_state_i {
public:
// some shorthands
using ubatch_heads = llama_kv_cache_unified::ubatch_heads;
using defrag_info = llama_kv_cache_unified::defrag_info;
// used for errors
llama_kv_cache_unified_state(llama_memory_status status);
// used to create a full-cache state
llama_kv_cache_unified_state(
llama_kv_cache_unified * kv);
// used to create an update state
llama_kv_cache_unified_state(
llama_kv_cache_unified * kv,
llama_context * lctx,
bool do_shift,
defrag_info dinfo);
// used to create a decode state from a batch
llama_kv_cache_unified_state(
llama_kv_cache_unified * kv,
ubatch_heads heads,
std::vector<llama_ubatch> ubatches);
virtual ~llama_kv_cache_unified_state();
//
// llama_memory_state_i
//
bool next() override;
bool apply() override;
llama_memory_status get_status() const override;
const llama_ubatch & get_ubatch() const override;
//
// llama_kv_cache_unified_state specific API
//
uint32_t get_n_kv() const;
// get views of the current state of the cache
ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
// store k_cur and v_cur in the cache based on the provided head location
ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const;
ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const;
void set_input_k_shift(ggml_tensor * dst) const;
void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
private:
llama_memory_status status;
llama_kv_cache_unified * kv;
llama_context * lctx;
//
// update state
//
bool do_shift = false;
defrag_info dinfo;
//
// batch processing state
//
// the index of the next ubatch to process
size_t i_next = 0;
ubatch_heads heads;
std::vector<llama_ubatch> ubatches;
//
// data needed for building the compute graph for the current ubatch:
//
// a heuristic, to avoid attending the full cache if it is not yet utilized
// as the cache gets filled, the benefit from this heuristic disappears
int32_t n_kv;
// the beginning of the current slot in which the ubatch will be inserted
int32_t head;
};

File diff suppressed because it is too large


@@ -2,36 +2,57 @@
#include "llama.h"
#include "llama-io.h"
#include "llama-graph.h"
#include "llama-memory.h"
#include "ggml-cpp.h"
#include <set>
#include <vector>
struct llama_cparams;
struct llama_hparams;
struct llama_ubatch;
struct llama_sbatch;
struct llama_model;
struct llama_context;
struct llama_kv_cache : public llama_memory_i {
virtual ~llama_kv_cache() = default;
// call if batch processing fails - restores the cache state
virtual void restore() = 0;
// split the input batch into a set of ubatches and verify that they can fit into the cache
// return a state object containing the ubatches and KV cache state required to process them
// check the llama_memory_state_i::get_status() for the result
virtual llama_memory_state_ptr init_batch(
const llama_batch & batch,
uint32_t n_ubatch,
bool embd_pooled,
bool logits_all) = 0;
// call after successful batch processing - clears any pending state
virtual void commit() = 0;
// simulate full cache, used for allocating worst-case compute buffers
virtual llama_memory_state_ptr init_full() = 0;
// process any pending defrag/shift/etc. operations
// optionally call once before processing a new batch
// return true if any operations were performed
virtual bool update(llama_context & lctx) = 0;
// schedule a defrag if the fragmentation threshold is exceeded. otherwise, do nothing
// TODO: change to
// llama_memory_state_ptr init_defrag(float thold) = 0;
//
virtual void defrag_sched(float thold) = 0;
// simulate full cache, used for allocating worst-case compute buffers
virtual void set_full() = 0;
//
// batch processing
//
virtual llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) = 0;
// different KV caches require different batch splitting strategies
virtual llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const = 0;
// find an empty slot of size "n_tokens" in the cache
virtual bool find_slot(const llama_ubatch & batch) = 0;
// getters
virtual bool get_can_shift() const = 0;
virtual int32_t get_n_tokens() const = 0;
virtual int32_t get_used_cells() const = 0;
// TODO: remove, this is too-specific to the unified cache
virtual llama_pos get_pos_max() const = 0;
virtual bool get_can_shift() const = 0;
bool get_can_edit() const override { return get_can_shift(); }
@@ -42,3 +63,343 @@ struct llama_kv_cache : public llama_memory_i {
virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0;
};
//
// llama_kv_cache_guard
//
struct llama_kv_cache_guard {
llama_kv_cache_guard(llama_kv_cache * kv) : kv(kv) {}
~llama_kv_cache_guard() {
kv->restore();
}
void commit() {
kv->commit();
}
private:
llama_kv_cache * kv;
};
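// a minimal usage sketch of the guard (hypothetical caller; kv points to any
// llama_kv_cache implementation):
static void decode_with_guard(llama_kv_cache * kv) {
    llama_kv_cache_guard guard(kv);
    // ... find slots and decode; an early return or exception leaves the
    // destructor to call kv->restore() and roll back any pending cells ...
    guard.commit(); // on success: clears the pending state, so the restore()
                    // in the destructor becomes a no-op
}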
//
// llama_kv_cache_unified
//
// TODO: add notion of max sequences
class llama_kv_cache_unified : public llama_kv_cache {
public:
struct kv_cell {
llama_pos pos = -1;
llama_pos delta = 0;
std::set<llama_seq_id> seq_id;
bool has_seq_id(const llama_seq_id & id) const {
return seq_id.find(id) != seq_id.end();
}
bool is_empty() const {
return seq_id.empty();
}
bool is_same_seq(const kv_cell & other) const {
return seq_id == other.seq_id;
}
};
static uint32_t get_padding(const llama_cparams & cparams);
llama_kv_cache_unified(
const llama_model & model,
ggml_type type_k,
ggml_type type_v,
bool v_trans,
bool offload,
uint32_t kv_size,
uint32_t padding);
~llama_kv_cache_unified() = default;
//
// llama_memory_i
//
void clear() override;
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
void seq_keep(llama_seq_id seq_id) override;
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
//
// llama_kv_cache
//
void restore() override;
void commit() override;
bool update(llama_context & ctx) override;
void defrag_sched(float thold) override;
void set_full() override;
llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
// updates the cache head
// Note: On success, it's important that cache.head points
// to the first cell of the slot.
bool find_slot(const llama_ubatch & batch) override;
int32_t get_n_tokens() const override;
int32_t get_used_cells() const override;
// TODO: better data structures to reduce the cost of this operation
llama_pos get_pos_max() const override;
bool get_can_shift() const override;
// state write/load
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
// Note: The value of head isn't only used to optimize searching
// for a free KV slot. llama_decode_impl also uses it, so it
// cannot be freely changed after a slot has been allocated.
uint32_t head = 0;
uint32_t size = 0;
uint32_t used = 0; // used cells (i.e. at least one seq_id)
// computed before each graph build
uint32_t n = 0;
std::vector<kv_cell> cells;
std::vector<ggml_tensor *> k_l; // per layer
std::vector<ggml_tensor *> v_l;
private:
const llama_model & model;
const llama_hparams & hparams;
bool has_shift = false;
bool do_defrag = false;
bool v_trans = true; // the value tensor is transposed
bool can_shift = false;
// required padding
uint32_t padding = 1;
ggml_type type_k = GGML_TYPE_F16;
ggml_type type_v = GGML_TYPE_F16;
std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs;
// defrag
struct {
std::vector<uint32_t> ids;
} defrag_info;
// return true if cells have been moved
bool defrag_prepare(int32_t n_max_nodes);
// commit/restore cache
struct slot_range {
uint32_t c0 = 0; // note: these are cell indices, not sequence positions
uint32_t c1 = 0;
};
// pending cell updates that are not yet committed
struct {
std::vector<slot_range> ranges;
} pending;
// find how many cells are currently in use
uint32_t cell_max() const;
size_t total_size() const;
size_t size_k_bytes() const;
size_t size_v_bytes() const;
ggml_tensor * build_rope_shift(
const llama_cparams & cparams,
ggml_context * ctx,
ggml_tensor * cur,
ggml_tensor * shift,
ggml_tensor * factors,
float freq_base,
float freq_scale) const;
llm_graph_result_ptr build_graph_shift(
const llama_cparams & cparams,
ggml_context * ctx,
ggml_cgraph * gf) const;
llm_graph_result_ptr build_graph_defrag(
const llama_cparams & cparams,
ggml_context * ctx,
ggml_cgraph * gf) const;
void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
};
//
// llama_kv_cache_recurrent
//
class llama_kv_cache_recurrent : public llama_kv_cache {
public:
struct kv_cell {
llama_pos pos = -1;
int32_t src = -1; // used to copy states
int32_t tail = -1;
std::set<llama_seq_id> seq_id;
bool has_seq_id(const llama_seq_id & id) const {
return seq_id.find(id) != seq_id.end();
}
bool is_empty() const {
return seq_id.empty();
}
bool is_same_seq(const kv_cell & other) const {
return seq_id == other.seq_id;
}
};
llama_kv_cache_recurrent(
const llama_model & model,
ggml_type type_k,
ggml_type type_v,
bool offload,
uint32_t kv_size);
~llama_kv_cache_recurrent() = default;
//
// llama_memory_i
//
void clear() override;
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
void seq_keep(llama_seq_id seq_id) override;
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
//
// llama_kv_cache
//
void restore() override;
void commit() override;
bool update(llama_context & lctx) override;
void defrag_sched(float thold) override;
void set_full() override;
llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
bool find_slot(const llama_ubatch & batch) override;
int32_t get_n_tokens() const override;
int32_t get_used_cells() const override;
// TODO: better data structures to reduce the cost of this operation
llama_pos get_pos_max() const override;
bool get_can_shift() const override;
// TODO: temporary methods - they are not really const as they do const_cast<>, fix this
int32_t s_copy(int i) const;
float s_mask(int i) const;
// state write/load
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
// Note: The value of head isn't only used to optimize searching
// for a free KV slot. llama_decode_impl also uses it, so it
// cannot be freely changed after a slot has been allocated.
uint32_t head = 0;
uint32_t size = 0;
uint32_t used = 0; // used cells (i.e. at least one seq_id)
// computed before each graph build
uint32_t n = 0;
std::vector<kv_cell> cells;
std::vector<ggml_tensor *> k_l; // per layer
std::vector<ggml_tensor *> v_l;
private:
//const llama_model & model;
const llama_hparams & hparams;
// commit/restore cache
// TODO: rework for recurrent cache
struct slot_range {
uint32_t c0 = 0; // note: these are cell indices, not sequence positions
uint32_t c1 = 0;
};
// pending cell updates that are not yet committed
struct {
std::vector<slot_range> ranges;
} pending;
ggml_type type_k = GGML_TYPE_F16;
ggml_type type_v = GGML_TYPE_F16;
std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs;
// find how many cells are currently in use
uint32_t cell_max() const;
size_t total_size() const;
size_t size_k_bytes() const;
size_t size_v_bytes() const;
void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
};
//
// kv cache view
//
llama_kv_cache_view llama_kv_cache_view_init(const llama_kv_cache & kv, int32_t n_seq_max);
void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache * kv);


@@ -1,415 +0,0 @@
#pragma once
#include "llama.h"
#include "llama-cparams.h"
#include <bitset>
#include <cassert>
#include <vector>
#include <set>
// meta information about KV cells that can be part of multiple sequences at the same time
// TODO: add unit tests
class llama_kv_cells_unified {
public:
void reset() {
for (uint32_t i = 0; i < pos.size(); ++i) {
pos[i] = -1;
shift[i] = 0;
seq[i].reset();
}
has_shift = false;
used.clear();
for (uint32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
seq_pos[s].clear();
}
}
void reset_shift() {
has_shift = false;
for (uint32_t i = 0; i < shift.size(); ++i) {
shift[i] = 0;
}
}
uint32_t size() const {
return pos.size();
}
void resize(uint32_t n) {
pos.resize(n);
shift.resize(n);
seq.resize(n);
reset();
}
bool is_empty(uint32_t i) const {
assert(i < pos.size());
assert((pos[i] < 0 && pos[i] == -1) || pos[i] >= 0);
return pos[i] == -1;
}
uint32_t get_used() const {
return used.size();
}
// the index of the first cell that is used
// return 0 if no cells are used
uint32_t used_min() const {
return used.empty() ? 0 : *used.begin();
}
// the index of the last cell that is used + 1
// return 0 if no cells are used
uint32_t used_max_p1() const {
return used.empty() ? 0 : *used.rbegin() + 1;
}
bool get_has_shift() const {
return has_shift;
}
// move cell isrc to idst (used during defrag)
void mv(uint32_t isrc, uint32_t idst) {
assert(isrc < pos.size());
assert(idst < pos.size());
assert(pos[idst] == -1);
assert(pos[isrc] != -1);
pos [idst] = pos [isrc];
shift[idst] = shift[isrc];
seq [idst] = seq [isrc];
pos [isrc] = -1;
shift[isrc] = 0;
seq [isrc].reset();
used.erase (isrc);
used.insert(idst);
}
// copy the state of cells [i, i + n) (used for save/restore the state of the cells)
llama_kv_cells_unified cp(uint32_t i, uint32_t n) const {
assert(i + n <= pos.size());
llama_kv_cells_unified res;
res.resize(n);
for (uint32_t j = 0; j < n; ++j) {
res.pos[j] = pos[i + j];
res.seq[j] = seq[i + j];
assert(shift[i + j] == 0);
}
return res;
}
// set the state of cells [i, i + other.pos.size()) (used for save/restore the state of the cells)
void set(uint32_t i, const llama_kv_cells_unified & other) {
assert(i + other.pos.size() <= pos.size());
for (uint32_t j = 0; j < other.pos.size(); ++j) {
if (pos[i + j] == -1 && other.pos[j] != -1) {
used.insert(i + j);
}
if (pos[i + j] != -1 && other.pos[j] == -1) {
used.erase(i + j);
}
if (pos[i + j] != -1) {
seq_pos_rm(i + j);
}
pos[i + j] = other.pos[j];
seq[i + j] = other.seq[j];
if (pos[i + j] != -1) {
seq_pos_add(i + j);
}
assert(shift[i + j] == 0);
}
}
// clear a non-empty cell
void rm(uint32_t i) {
assert(i < pos.size());
assert(pos[i] != -1);
seq_pos_rm(i);
seq[i].reset();
pos[i] = -1;
shift[i] = 0;
used.erase(i);
}
// note: call only if the cell has seq_id
// return true if the cell becomes empty
bool seq_rm(uint32_t i, llama_seq_id seq_id) {
assert(i < pos.size());
assert(seq[i].test(seq_id));
assert(pos[i] != -1);
assert(seq_id >= 0);
seq[i].reset(seq_id);
seq_pos[seq_id].erase(pos[i]);
if (seq[i].none()) {
pos[i] = -1;
shift[i] = 0;
used.erase(i);
return true;
}
return false;
}
// return true if the cell becomes empty (i.e. it did not contain seq_id before the call)
bool seq_keep(uint32_t i, llama_seq_id seq_id) {
assert(i < pos.size());
if (seq[i].test(seq_id)) {
seq_pos_rm(i);
seq[i].reset();
seq[i].set(seq_id);
seq_pos[seq_id].insert(pos[i]);
return false;
}
if (seq[i].any()) {
seq_pos_rm(i);
seq[i].reset();
pos[i] = -1;
shift[i] = 0;
used.erase(i);
return true;
}
assert(pos[i] == -1);
return false;
}
// number of different sequences in the cell
int seq_count(uint32_t i) const {
assert(i < pos.size());
assert(pos[i] != -1);
return seq[i].count();
}
// check if the cell contains seq_id
bool seq_has(uint32_t i, llama_seq_id seq_id) const {
assert(i < pos.size());
assert(seq_id >= 0);
return seq[i].test(seq_id);
}
// note: call only if the cell is not empty and the seq_id is not in the cell
void seq_add(uint32_t i, llama_seq_id seq_id) {
assert(i < pos.size());
assert(pos[i] != -1);
assert(!seq[i].test(seq_id));
seq[i].set(seq_id);
seq_pos[seq_id].insert(pos[i]);
}
// return the sequence id of this cell
// note: call only for cells with exactly one sequence
llama_seq_id seq_get(uint32_t i) const {
assert(seq[i].count() == 1);
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
if (seq[i].test(s)) {
return s;
}
}
return -1;
}
// the minimum position of sequence seq_id currently present in any of the cells
// return -1 if the sequence is not present
llama_pos seq_pos_min(llama_seq_id seq_id) const {
assert(seq_id >= 0);
assert(seq_id < LLAMA_MAX_SEQ);
if (seq_pos[seq_id].empty()) {
return -1;
}
return *seq_pos[seq_id].begin();
}
// the maximum position of sequence seq_id currently present in any of the cells
// return -1 if the sequence is not present
llama_pos seq_pos_max(llama_seq_id seq_id) const {
assert(seq_id >= 0);
assert(seq_id < LLAMA_MAX_SEQ);
if (seq_pos[seq_id].empty()) {
return -1;
}
return *seq_pos[seq_id].rbegin();
}
// note: call only if the cell is not empty
llama_pos pos_get(uint32_t i) const {
assert(i < pos.size());
assert(pos[i] != -1);
return pos[i];
}
// note: call only if the cell is not empty
llama_pos get_shift(uint32_t i) const {
assert(i < pos.size());
assert(pos[i] != -1);
return shift[i];
}
// check if a cell is not empty and its position is within [p0, p1)
bool pos_in(uint32_t i, llama_pos p0, llama_pos p1) const {
assert(i < pos.size());
return pos[i] >= p0 && pos[i] < p1;
}
// set the position of an empty cell
// does not modify "has_shift"
// note: call only if the cell is empty
void pos_set(uint32_t i, llama_pos p) {
assert(i < pos.size());
assert(pos[i] == -1);
assert(seq[i].none());
pos[i] = p;
used.insert(i);
}
// pos[i] = pos[i] + d
// sets "has_shift" to true
// note: call only if the cell is not empty
bool pos_add(uint32_t i, llama_pos d) {
assert(i < pos.size());
assert(pos[i] != -1);
seq_pos_rm(i);
pos[i] += d;
shift[i] += d;
has_shift = true;
if (pos[i] < 0) {
seq[i].reset();
pos[i] = -1;
shift[i] = 0;
used.erase(i);
return true;
}
seq_pos_add(i);
return false;
}
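// e.g. (hypothetical): a cell at pos 3 receiving pos_add(i, -5) would land at
// pos -2, so the cell is cleared and the call returns true - this is how a
// negative shift drops tokens that fall off the front of the context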
// pos[i] = pos[i] / d
// sets "has_shift" to true
// note: call only if the cell is not empty
void pos_div(uint32_t i, int d) {
assert(i < pos.size());
assert(pos[i] != -1);
const llama_pos p_old = pos[i];
seq_pos_rm(i);
pos[i] /= d;
shift[i] += p_old - pos[i];
seq_pos_add(i);
has_shift = true;
}
private:
bool has_shift = false;
// set of indices of used cells (i.e. pos[i] != -1, allowed to not have any seq_id)
std::set<uint32_t> used;
std::vector<llama_pos> pos;
// this array accumulates any applied shifts to the pos array since the last reset_shift() call
// this is used to queue multiple updates to the pos array, which in the end can be applied in one go:
//
// cells.pos_add(x, shift_x);
// cells.pos_div(y, shift_y);
// ...
//
// if (cells.has_shift()) {
// for (int i = 0; i < n; ++i) {
// auto shift_i = cells.get_shift(i);
// ...
// }
// cells.reset_shift();
// }
//
std::vector<llama_pos> shift;
using seq_set_t = std::bitset<LLAMA_MAX_SEQ>;
// the bitset seq[i] tells us which sequences are currently occupying the i-th cell
std::vector<seq_set_t> seq;
// the set seq_pos[s] tells us which positions are currently present for sequence s
// this way seq_pos[s].begin() and seq_pos[s].rbegin() give us the min/max positions currently in the cache
std::set<llama_pos> seq_pos[LLAMA_MAX_SEQ];
// helper functions for updating `seq_pos`, one cell at a time:
// remove cell i
void seq_pos_rm(uint32_t i) {
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
if (seq[i].test(s)) {
seq_pos[s].erase(pos[i]);
}
}
}
// add cell i
void seq_pos_add(uint32_t i) {
for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
if (seq[i].test(s)) {
seq_pos[s].insert(pos[i]);
}
}
}
};


@@ -1,246 +0,0 @@
#include "llama-memory-hybrid.h"
#include "llama-impl.h"
#include "llama-model.h"
#include "llama-context.h"
//
// llama_memory_hybrid
//
llama_memory_hybrid::llama_memory_hybrid(
const llama_model & model,
/* attn */
ggml_type type_k,
ggml_type type_v,
bool v_trans,
uint32_t kv_size,
uint32_t n_pad,
uint32_t n_swa,
llama_swa_type swa_type,
/* recurrent */
ggml_type type_r,
ggml_type type_s,
uint32_t rs_size,
/* common */
uint32_t n_seq_max,
bool offload,
/* layer filters */
layer_filter_cb && filter_attn,
layer_filter_cb && filter_recr) :
hparams(model.hparams),
mem_attn(new llama_kv_cache_unified(
model,
filter_attn == nullptr ?
[&](int32_t il) { return !hparams.is_recurrent(il); }
: filter_attn,
type_k,
type_v,
v_trans,
offload,
kv_size,
n_seq_max,
n_pad,
n_swa,
swa_type
)),
mem_recr(new llama_memory_recurrent(
model,
filter_recr == nullptr ?
[&](int32_t il) { return hparams.is_recurrent(il); }
: filter_recr,
type_r,
type_s,
offload,
rs_size,
n_seq_max
)) {}
llama_memory_state_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
do {
balloc.split_reset();
// follow the recurrent pattern for creating the ubatch splits
std::vector<llama_ubatch> ubatches;
while (true) {
llama_ubatch ubatch;
if (embd_all) {
// if all tokens are output, split by sequence
ubatch = balloc.split_seq(n_ubatch);
} else {
ubatch = balloc.split_equal(n_ubatch);
}
if (ubatch.n_tokens == 0) {
break;
}
ubatches.push_back(std::move(ubatch)); // NOLINT
}
// prepare the recurrent batches first
if (!mem_recr->prepare(ubatches)) {
// TODO: will the recurrent cache be in an undefined state at this point?
LLAMA_LOG_ERROR("%s: failed to prepare recurrent ubatches\n", __func__);
return std::make_unique<llama_memory_hybrid_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
}
// prepare the attention cache
auto heads_attn = mem_attn->prepare(ubatches);
if (heads_attn.empty()) {
LLAMA_LOG_ERROR("%s: failed to prepare attention ubatches\n", __func__);
return std::make_unique<llama_memory_hybrid_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
}
return std::make_unique<llama_memory_hybrid_state>(
this, std::move(heads_attn), std::move(ubatches));
} while (false);
return std::make_unique<llama_memory_hybrid_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
}
llama_memory_state_ptr llama_memory_hybrid::init_full() {
return std::make_unique<llama_memory_hybrid_state>(this);
}
llama_memory_state_ptr llama_memory_hybrid::init_update(llama_context * lctx, bool optimize) {
return std::make_unique<llama_memory_hybrid_state>(this, lctx, optimize);
}
bool llama_memory_hybrid::get_can_shift() const {
// shifting is trivially supported for the recurrent cache, so the result only depends on the attention cache
return mem_attn->get_can_shift();
}
void llama_memory_hybrid::clear(bool data) {
mem_attn->clear(data);
mem_recr->clear(data);
}
bool llama_memory_hybrid::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
// Try removing from the recurrent cache first since it may fail. If it does
// fail, the cache will not have been mutated.
if (!mem_recr->seq_rm(seq_id, p0, p1)) {
return false;
}
return mem_attn->seq_rm(seq_id, p0, p1);
}
void llama_memory_hybrid::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
mem_attn->seq_cp(seq_id_src, seq_id_dst, p0, p1);
mem_recr->seq_cp(seq_id_src, seq_id_dst, p0, p1);
}
void llama_memory_hybrid::seq_keep(llama_seq_id seq_id) {
mem_attn->seq_keep(seq_id);
mem_recr->seq_keep(seq_id);
}
void llama_memory_hybrid::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
mem_attn->seq_add(seq_id, p0, p1, shift);
mem_recr->seq_add(seq_id, p0, p1, shift);
}
void llama_memory_hybrid::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
mem_attn->seq_div(seq_id, p0, p1, d);
mem_recr->seq_div(seq_id, p0, p1, d);
}
llama_pos llama_memory_hybrid::seq_pos_min(llama_seq_id seq_id) const {
// the min of the total cache is the max of the two caches' min values
return std::max(mem_attn->seq_pos_min(seq_id), mem_recr->seq_pos_min(seq_id));
}
llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
// the max of the total cache is the min of the two caches' max values
return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
}
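// e.g. (hypothetical): if the attention cache holds positions [10, 100] of a
// sequence and the recurrent cache holds [0, 90], only the intersection is
// usable by both, so seq_pos_min = 10 and seq_pos_max = 90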
void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
mem_attn->state_write(io, seq_id);
mem_recr->state_write(io, seq_id);
}
void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
mem_attn->state_read(io, seq_id);
mem_recr->state_read(io, seq_id);
}
llama_kv_cache_unified * llama_memory_hybrid::get_mem_attn() const {
return mem_attn.get();
}
llama_memory_recurrent * llama_memory_hybrid::get_mem_recr() const {
return mem_recr.get();
}
llama_memory_hybrid_state::llama_memory_hybrid_state(llama_memory_status status) : status(status) {}
llama_memory_hybrid_state::llama_memory_hybrid_state(llama_memory_hybrid * mem) :
state_attn(mem->get_mem_attn()->init_full()),
state_recr(mem->get_mem_recr()->init_full()),
status(llama_memory_status_combine(state_attn->get_status(), state_recr->get_status())) {
}
llama_memory_hybrid_state::llama_memory_hybrid_state(
llama_memory_hybrid * mem,
llama_context * lctx,
bool optimize) :
state_attn(mem->get_mem_attn()->init_update(lctx, optimize)),
state_recr(mem->get_mem_recr()->init_update(lctx, optimize)),
status(llama_memory_status_combine(state_attn->get_status(), state_recr->get_status())) {
}
llama_memory_hybrid_state::llama_memory_hybrid_state(
llama_memory_hybrid * mem,
std::vector<uint32_t> heads_attn,
std::vector<llama_ubatch> ubatches) :
ubatches(std::move(ubatches)),
// note: here we copy the ubatches. not sure if this is ideal
state_attn(new llama_kv_cache_unified_state(mem->get_mem_attn(), std::move(heads_attn), this->ubatches)),
state_recr(new llama_memory_recurrent_state(mem->get_mem_recr(), this->ubatches)),
status(llama_memory_status_combine(state_attn->get_status(), state_recr->get_status())) {
}
bool llama_memory_hybrid_state::next() {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
state_attn->next();
state_recr->next();
if (++i_next >= ubatches.size()) {
return false;
}
return true;
}
bool llama_memory_hybrid_state::apply() {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
bool res = true;
res = res & state_attn->apply();
res = res & state_recr->apply();
return res;
}
llama_memory_status llama_memory_hybrid_state::get_status() const {
return status;
}
const llama_ubatch & llama_memory_hybrid_state::get_ubatch() const {
assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
return ubatches[i_next];
}
const llama_kv_cache_unified_state * llama_memory_hybrid_state::get_state_attn() const {
return static_cast<const llama_kv_cache_unified_state *>(state_attn.get());
}
const llama_memory_recurrent_state * llama_memory_hybrid_state::get_state_recr() const {
return static_cast<const llama_memory_recurrent_state *>(state_recr.get());
}


@@ -1,138 +0,0 @@
#pragma once
#include "llama-batch.h"
#include "llama-graph.h"
#include "llama-kv-cache-unified.h"
#include "llama-memory.h"
#include "llama-memory-recurrent.h"
#include <memory>
#include <vector>
//
// llama_memory_hybrid
//
// utilizes instances of llama_memory_recurrent and llama_kv_cache_unified to
// support models where each layer may be either attention-based or recurrent
class llama_memory_hybrid : public llama_memory_i {
public:
// this callback is used to filter out layers that should not be included in the cache
using layer_filter_cb = std::function<bool(int32_t il)>;
llama_memory_hybrid(
const llama_model & model,
/* attn */
ggml_type type_k,
ggml_type type_v,
bool v_trans,
uint32_t kv_size,
uint32_t n_pad,
uint32_t n_swa,
llama_swa_type swa_type,
/* recurrent */
ggml_type type_r,
ggml_type type_s,
uint32_t rs_size,
/* common */
uint32_t n_seq_max,
bool offload,
/* layer filters */
layer_filter_cb && filter_attn = nullptr,
layer_filter_cb && filter_recr = nullptr);
~llama_memory_hybrid() = default;
//
// llama_memory_i
//
llama_memory_state_ptr init_batch(
llama_batch_allocr & balloc,
uint32_t n_ubatch,
bool embd_all) override;
llama_memory_state_ptr init_full() override;
llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) override;
bool get_can_shift() const override;
void clear(bool data) override;
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
void seq_keep(llama_seq_id seq_id) override;
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override;
void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
// state write/load
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
//
// llama_memory_hybrid specific API
//
llama_kv_cache_unified * get_mem_attn() const;
llama_memory_recurrent * get_mem_recr() const;
private:
const llama_hparams & hparams;
const std::unique_ptr<llama_kv_cache_unified> mem_attn;
const std::unique_ptr<llama_memory_recurrent> mem_recr;
};
class llama_memory_hybrid_state : public llama_memory_state_i {
public:
// init failure
explicit llama_memory_hybrid_state(llama_memory_status status);
// init full
explicit llama_memory_hybrid_state(llama_memory_hybrid * mem);
// init update
explicit llama_memory_hybrid_state(
llama_memory_hybrid * mem,
llama_context * lctx,
bool optimize);
// init success
llama_memory_hybrid_state(
llama_memory_hybrid * mem,
std::vector<uint32_t> heads_attn,
std::vector<llama_ubatch> ubatches);
~llama_memory_hybrid_state() = default;
bool next() override;
bool apply() override;
llama_memory_status get_status() const override;
const llama_ubatch & get_ubatch() const override;
//
// llama_memory_hybrid_state
//
const llama_kv_cache_unified_state * get_state_attn() const;
const llama_memory_recurrent_state * get_state_recr() const;
private:
// the index of the next ubatch to process
size_t i_next = 0;
std::vector<llama_ubatch> ubatches;
const llama_memory_state_ptr state_attn;
const llama_memory_state_ptr state_recr;
const llama_memory_status status;
};

File diff suppressed because it is too large


@@ -1,183 +0,0 @@
#pragma once
#include "llama-batch.h"
#include "llama-graph.h"
#include "llama-memory.h"
#include <set>
#include <vector>
//
// llama_memory_recurrent
//
// TODO: extract the cache state used for graph computation into llama_memory_recurrent_state_i
// see the implementation of llama_kv_cache_unified_state_i for an example how to do it
class llama_memory_recurrent : public llama_memory_i {
public:
// this callback is used to filter out layers that should not be included in the cache
using layer_filter_cb = std::function<bool(int32_t il)>;
llama_memory_recurrent(
const llama_model & model,
layer_filter_cb && filter,
ggml_type type_r,
ggml_type type_s,
bool offload,
uint32_t mem_size,
uint32_t n_seq_max);
~llama_memory_recurrent() = default;
//
// llama_memory_i
//
llama_memory_state_ptr init_batch(
llama_batch_allocr & balloc,
uint32_t n_ubatch,
bool embd_all) override;
llama_memory_state_ptr init_full() override;
llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) override;
void clear(bool data) override;
bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
void seq_keep(llama_seq_id seq_id) override;
void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) override;
void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
llama_pos seq_pos_min(llama_seq_id seq_id) const override;
llama_pos seq_pos_max(llama_seq_id seq_id) const override;
bool prepare(const std::vector<llama_ubatch> & ubatches);
// find a contiguous slot of memory cells and emplace the ubatch there
bool find_slot(const llama_ubatch & ubatch);
bool get_can_shift() const override;
// state write/load
void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
uint32_t head = 0; // the location where the batch will be placed in the cache (see find_slot())
uint32_t size = 0; // total number of cells, shared across all sequences
uint32_t used = 0; // used cells (i.e. at least one seq_id)
// computed before each graph build
uint32_t n = 0;
// first zero-ed state
int32_t rs_z = -1;
// TODO: optimize for recurrent state needs
struct mem_cell {
llama_pos pos = -1;
int32_t src = -1; // used to know where states should be copied from
int32_t src0 = -1; // like src, but only used when setting the inputs (allowing to copy once)
int32_t tail = -1;
std::set<llama_seq_id> seq_id;
bool has_seq_id(const llama_seq_id & id) const {
return seq_id.find(id) != seq_id.end();
}
bool is_empty() const {
return seq_id.empty();
}
bool is_same_seq(const mem_cell & other) const {
return seq_id == other.seq_id;
}
};
std::vector<mem_cell> cells;
// per layer
std::vector<ggml_tensor *> r_l;
std::vector<ggml_tensor *> s_l;
private:
//const llama_model & model;
const llama_hparams & hparams;
const uint32_t n_seq_max = 1;
std::vector<ggml_context_ptr> ctxs;
std::vector<ggml_backend_buffer_ptr> bufs;
size_t total_size() const;
size_t size_r_bytes() const;
size_t size_s_bytes() const;
void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
};
class llama_memory_recurrent_state : public llama_memory_state_i {
public:
// used for errors
llama_memory_recurrent_state(llama_memory_status status);
// used to create a full-cache state
llama_memory_recurrent_state(
llama_memory_recurrent * mem);
// used to create a state from a batch
llama_memory_recurrent_state(
llama_memory_recurrent * mem,
std::vector<llama_ubatch> ubatches);
virtual ~llama_memory_recurrent_state();
//
// llama_memory_state_i
//
bool next() override;
bool apply() override;
llama_memory_status get_status() const override;
const llama_ubatch & get_ubatch() const override;
//
// llama_memory_recurrent_state specific API
//
uint32_t get_n_rs() const;
uint32_t get_head() const;
int32_t get_rs_z() const;
uint32_t get_size() const;
ggml_tensor * get_r_l(int32_t il) const;
ggml_tensor * get_s_l(int32_t il) const;
int32_t s_copy(int i) const;
private:
const llama_memory_status status;
llama_memory_recurrent * mem;
size_t i_next = 0;
std::vector<llama_ubatch> ubatches;
//
// data needed for building the compute graph for the current ubatch:
// TODO: extract all the state like `head` and `n` here
//
const bool is_full = false;
};


@@ -1,42 +1 @@
#include "llama-memory.h"
llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_memory_status s1) {
bool has_update = false;
switch (s0) {
case LLAMA_MEMORY_STATUS_SUCCESS:
{
has_update = true;
break;
}
case LLAMA_MEMORY_STATUS_NO_UPDATE:
{
break;
}
case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
{
return s0;
}
}
switch (s1) {
case LLAMA_MEMORY_STATUS_SUCCESS:
{
has_update = true;
break;
}
case LLAMA_MEMORY_STATUS_NO_UPDATE:
{
break;
}
case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
{
return s1;
}
}
// if either status has an update, then the combined status has an update
return has_update ? LLAMA_MEMORY_STATUS_SUCCESS : LLAMA_MEMORY_STATUS_NO_UPDATE;
}
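// illustrative outcomes of the combination above (spot checks, not from the diff):
//   combine(SUCCESS,   NO_UPDATE)      -> SUCCESS        (one side has an update)
//   combine(NO_UPDATE, NO_UPDATE)      -> NO_UPDATE      (nothing would be applied)
//   combine(SUCCESS,   FAILED_PREPARE) -> FAILED_PREPARE (a failure always wins)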


@@ -2,115 +2,30 @@
#include "llama.h"
#include <memory>
#include <vector>
struct llama_ubatch;
class llama_batch_allocr;
class llama_io_write_i;
class llama_io_read_i;
struct llama_memory_params {
// kv cache
ggml_type type_k;
ggml_type type_v;
// use full-size SWA cache
bool swa_full;
// parameters for other types of memory
// ...
};
enum llama_memory_status {
LLAMA_MEMORY_STATUS_SUCCESS = 0,
LLAMA_MEMORY_STATUS_NO_UPDATE,
LLAMA_MEMORY_STATUS_FAILED_PREPARE,
LLAMA_MEMORY_STATUS_FAILED_COMPUTE,
};
// helper function for combining the status of two memory states
// useful for implementing hybrid memory types (e.g. iSWA)
llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_memory_status s1);
// the interface for managing the memory state during batch processing
// this interface is implemented per memory type. see:
// - llama_kv_cache_unified_state
// - llama_kv_cache_unified_iswa_state
// ...
//
// the only method that can mutate the memory and the memory state is llama_memory_i::apply()
//
// TODO: rename to llama_memory_context_i ?
struct llama_memory_state_i {
virtual ~llama_memory_state_i() = default;
// consume the current ubatch from the state and proceed to the next one
// return false if we are done
virtual bool next() = 0;
// apply the memory state for the current ubatch to the memory object
// return false on failure
virtual bool apply() = 0;
// get the current ubatch
virtual const llama_ubatch & get_ubatch() const = 0;
// get the status of the memory state - used for error handling and checking if any updates would be applied
virtual llama_memory_status get_status() const = 0;
};
using llama_memory_state_ptr = std::unique_ptr<llama_memory_state_i>;
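// sketch of how a caller is expected to drive a memory state, pieced together
// from this interface (hypothetical code; mem, balloc and the graph step are
// assumed to exist elsewhere):
static bool process_batch(llama_memory_i & mem, llama_batch_allocr & balloc, uint32_t n_ubatch) {
    llama_memory_state_ptr st = mem.init_batch(balloc, n_ubatch, /*embd_all=*/false);
    if (st->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) {
        return false; // the ubatches could not be fit into the memory
    }
    do {
        if (!st->apply()) { // mutate the memory for the current ubatch
            return false;
        }
        const llama_ubatch & ub = st->get_ubatch();
        // ... build and compute the graph for ub ...
        (void) ub;
    } while (st->next()); // advance; returns false once all ubatches are consumed
    return true;
}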
// general concept of LLM memory // general concept of LLM memory
// the KV cache is a type of LLM memory, but there can be other types // the KV cache is a type of LLM memory, but there can be other types
struct llama_memory_i { class llama_memory_i {
public:
virtual ~llama_memory_i() = default; virtual ~llama_memory_i() = default;
// split the input batch into a set of ubatches and verify that they can fit into the cache virtual void clear() = 0;
// return a state object containing the ubatches and KV cache state required to process them
// check the llama_memory_state_i::get_status() for the result
virtual llama_memory_state_ptr init_batch(
llama_batch_allocr & balloc,
uint32_t n_ubatch,
bool embd_all) = 0;
// simulate full cache, used for allocating worst-case compute buffers
virtual llama_memory_state_ptr init_full() = 0;
// prepare for any pending memory updates, such as shifts, defrags, etc.
// status == LLAMA_MEMORY_STATUS_NO_UPDATE if there is nothing to update
virtual llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) = 0;
// getters
virtual bool get_can_shift() const = 0;
//
// ops
//
// if data == true, the data buffers will also be cleared together with the metadata
virtual void clear(bool data) = 0;
virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0; virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0;
virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0; virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
virtual void seq_keep(llama_seq_id seq_id) = 0; virtual void seq_keep(llama_seq_id seq_id) = 0;
virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) = 0; virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) = 0;
virtual void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) = 0; virtual void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) = 0;
virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0; virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;
// virtual bool get_can_edit() const = 0;
// state write/read
//
virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0;
};
using llama_memory_ptr = std::unique_ptr<llama_memory_i>;
// TODO: temporary until the llama_kv_cache is removed from the public API
struct llama_kv_cache : public llama_memory_i {
virtual ~llama_kv_cache() = default;
}; };
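The removed block above declares llama_memory_status, a status-combining helper, and the llama_memory_state_i interface (next()/apply()/get_ubatch()/get_status()). As a rough illustration of how a caller might drive that interface, here is a minimal sketch that relies on the declarations shown in the diff; the decode_with_memory name, the apply-then-process ordering, and the error handling are assumptions, not the actual llama.cpp decode loop.

// Hypothetical driver for llama_memory_state_i (sketch only, not llama.cpp code).
// Assumes the state returned by init_batch() starts positioned on the first
// ubatch and that apply() must run before the compute graph for that ubatch.
static bool decode_with_memory(llama_memory_i & mem, llama_batch_allocr & balloc, uint32_t n_ubatch) {
    llama_memory_state_ptr st = mem.init_batch(balloc, n_ubatch, /*embd_all=*/false);

    if (st->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) {
        return false; // the ubatches do not fit, or preparation failed
    }

    do {
        if (!st->apply()) {
            return false; // per the comments above, the only call that mutates the memory
        }

        const llama_ubatch & ub = st->get_ubatch();
        // ... build and run the compute graph for `ub` here ...
        (void) ub;
    } while (st->next()); // consume the current ubatch, stop when done

    return true;
}

In the same spirit, llama_memory_status_combine presumably lets a hybrid memory type (e.g. iSWA) report a FAILED_* status if either sub-memory failed, and NO_UPDATE only if neither side has anything to apply.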

View File

@@ -401,7 +401,7 @@ struct llama_mmap::impl {
             }
         }
 #else
-        LLAMA_LOG_DEBUG("skipping PrefetchVirtualMemory because _WIN32_WINNT < 0x602\n");
+        throw std::runtime_error("PrefetchVirtualMemory unavailable");
 #endif
     }
 }
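This hunk trades a hard throw for a debug log when the build targets a Windows version without PrefetchVirtualMemory (available from Windows 8, i.e. _WIN32_WINNT >= 0x0602). For reference, a self-contained sketch of the usual guard pattern follows; prefetch_mapping is an illustrative helper, not a function from the repository.

// Sketch of the guard pattern (illustrative; not the llama.cpp source).
// PrefetchVirtualMemory is advisory: a failed or skipped call only loses the
// read-ahead optimization, so logging is arguably friendlier than throwing.
#if defined(_WIN32) && defined(_WIN32_WINNT) && _WIN32_WINNT >= 0x0602
#include <windows.h>

static bool prefetch_mapping(void * addr, size_t len) {
    WIN32_MEMORY_RANGE_ENTRY range;
    range.VirtualAddress = addr;
    range.NumberOfBytes  = (SIZE_T) len;
    return PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0) != 0;
}
#else
static bool prefetch_mapping(void *, size_t) {
    return false; // older target: compile the call out instead of failing at runtime
}
#endif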

View File

@@ -288,10 +288,9 @@ namespace GGUFMeta {
 template<typename T>
 bool llama_model_loader::get_arr(const std::string & key, std::vector<T> & result, bool required) {
-    const gguf_context * ctx = meta.get();
-    const int kid = gguf_find_key(ctx, key.c_str());
+    const int kid = gguf_find_key(meta.get(), key.c_str());

-    if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
+    if (kid < 0 || gguf_get_kv_type(meta.get(), kid) != GGUF_TYPE_ARRAY) {
         if (required) {
             throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
         }
@@ -299,40 +298,28 @@ namespace GGUFMeta {
     }

     struct GGUFMeta::ArrayInfo arr_info =
-        GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);
+        GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);

     switch (arr_info.gt) {
         case GGUF_TYPE_UINT32:
         case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T,  int32_t>::value) ||
                                             (std::is_same<T, uint32_t>::value)); break;
         case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
-        case GGUF_TYPE_STRING:  GGML_ASSERT((std::is_same<T, std::string>::value)); break;
         default:
-            throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
+            throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
     }

-    if constexpr (std::is_same<T, std::string>::value) {
-        const size_t n_items = gguf_get_arr_n(ctx, kid);
-        result.clear();
-
-        for (size_t i = 0; i < n_items; i++) {
-            const T value = gguf_get_arr_str(ctx, kid, i);
-            result.emplace_back(value);
-        }
-    } else {
-        result.resize(arr_info.length);
-        result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);
-    }
+    result.resize(arr_info.length);
+    result.assign((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length);

     return true;
 }

 template<typename T, size_t N_MAX>
 bool llama_model_loader::get_arr(const std::string & key, std::array<T, N_MAX> & result, bool required) {
-    const gguf_context * ctx = meta.get();
-    const int kid = gguf_find_key(ctx, key.c_str());
+    const int kid = gguf_find_key(meta.get(), key.c_str());

-    if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY) {
+    if (kid < 0 || gguf_get_kv_type(meta.get(), kid) != GGUF_TYPE_ARRAY) {
         if (required) {
             throw std::runtime_error(format("array key not found in model: %s", key.c_str()));
         }
@@ -340,32 +327,22 @@ namespace GGUFMeta {
     }

     struct GGUFMeta::ArrayInfo arr_info =
-        GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx, kid);
+        GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);

     switch (arr_info.gt) {
         case GGUF_TYPE_UINT32:
         case GGUF_TYPE_INT32:   GGML_ASSERT((std::is_same<T,  int32_t>::value) ||
                                             (std::is_same<T, uint32_t>::value)); break;
         case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
-        case GGUF_TYPE_STRING:  GGML_ASSERT((std::is_same<T, std::string>::value)); break;
         default:
-            throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str()));
+            throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
     }

     if (arr_info.length > N_MAX) {
         throw std::runtime_error(format("array length %u for key %s exceeds max %u", (uint32_t) arr_info.length, key.c_str(), (uint32_t) N_MAX));
     }

-    if constexpr (std::is_same<T, std::string>::value) {
-        const size_t n_items = gguf_get_arr_n(ctx, kid);
-
-        for (size_t i = 0; i < n_items; i++) {
-            const T value = gguf_get_arr_str(ctx, kid, i);
-            result[i] = value;
-        }
-    } else {
-        std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());
-    }
+    std::copy((const T*)arr_info.data, (const T *)arr_info.data + arr_info.length, result.begin());

     return true;
 }
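The left-hand side of these two hunks extends get_arr() with a GGUF_TYPE_STRING branch: numeric arrays expose contiguous raw data that can be bulk-assigned, while string arrays must be read element by element. Below is a standalone sketch of that distinction using the public gguf API; the read_str_array helper and the key name are made up for illustration, and the int64_t key id follows the current gguf.h signatures.

// Illustrative only: reading a GGUF string array element-wise, mirroring the
// "if constexpr" branch on the left. Numeric arrays could instead be copied
// in bulk from gguf_get_arr_data(), which is what the right-hand side keeps.
#include <string>
#include <vector>
#include "gguf.h"

static std::vector<std::string> read_str_array(const gguf_context * ctx, const char * key) {
    std::vector<std::string> out;

    const int64_t kid = gguf_find_key(ctx, key);
    if (kid < 0 || gguf_get_kv_type(ctx, kid) != GGUF_TYPE_ARRAY ||
        gguf_get_arr_type(ctx, kid) != GGUF_TYPE_STRING) {
        return out; // missing key, not an array, or not a string array
    }

    const size_t n = gguf_get_arr_n(ctx, kid);
    out.reserve(n);
    for (size_t i = 0; i < n; i++) {
        out.emplace_back(gguf_get_arr_str(ctx, kid, i)); // strings cannot be memcpy'd
    }
    return out;
}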
@@ -375,8 +352,6 @@ namespace GGUFMeta {
     return get_arr(llm_kv(kid), result, required);
 }

-template bool llama_model_loader::get_arr<std::vector<std::string>>(enum llm_kv kid, std::vector<std::string> & result, bool required);
-
 template<typename T>
 bool llama_model_loader::get_key(const std::string & key, T & result, bool required) {
     auto it = kv_overrides.find(key);
@@ -494,7 +469,7 @@ llama_model_loader::llama_model_loader(
     meta.reset(gguf_init_from_file(fname.c_str(), params));
     if (!meta) {
-        throw std::runtime_error(format("%s: failed to load model from %s", __func__, fname.c_str()));
+        throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
     }

     get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
@@ -553,7 +528,7 @@ llama_model_loader::llama_model_loader(
     };

     gguf_context_ptr ctx_gguf { gguf_init_from_file(fname_split, split_params) };
     if (!ctx_gguf) {
-        throw std::runtime_error(format("%s: failed to load GGUF split from %s", __func__, fname_split));
+        throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, fname_split));
     }

     // check idx
@@ -847,18 +822,13 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
     mappings.reserve(files.size());
     mmaps_used.reserve(files.size());
     for (const auto & file : files) {
-        bool is_numa = false;
-
-        auto * dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-        if (dev) {
-            auto * reg = ggml_backend_dev_backend_reg(dev);
-            auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
-            if (is_numa_fn) {
-                is_numa = is_numa_fn();
-            }
+        auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
+        if (!reg) {
+            throw std::runtime_error(format("%s: no CPU backend found", __func__));
         }

-        std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa);
+        auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
+        std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
         mmaps_used.emplace_back(mapping->size(), 0);
         if (mlock_mmaps) {
             std::unique_ptr<llama_mlock> mlock_mmap(new llama_mlock());
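The init_mappings() hunk is a good example of ggml's optional-capability pattern: extra backend entry points are looked up by name via ggml_backend_reg_get_proc_address(), and the left-hand side degrades gracefully when the symbol (or the CPU backend itself) is absent rather than throwing. A condensed sketch of that pattern follows; the false fallback policy is an assumption.

// Sketch of the optional proc-address lookup (illustrative helper, not repo code).
#include "ggml-backend.h"

static bool cpu_backend_is_numa(void) {
    ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
    if (!dev) {
        return false; // no CPU backend registered at all
    }

    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev);

    // the symbol is optional: other CPU backend builds may not export it
    auto * is_numa_fn = (bool (*)(void)) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");

    return is_numa_fn ? is_numa_fn() : false;
}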

View File

@@ -228,7 +228,6 @@ void llama_model_saver::add_kv_from_model() {
     // add_kv(LLM_KV_TOKENIZER_MASK_ID, ???);
     add_kv(LLM_KV_TOKENIZER_ADD_BOS, vocab.get_add_bos());
     add_kv(LLM_KV_TOKENIZER_ADD_EOS, vocab.get_add_eos());
-    add_kv(LLM_KV_TOKENIZER_ADD_SEP, vocab.get_add_sep());
     add_kv(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.get_add_space_prefix());
     add_kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.get_remove_extra_whitespaces());
     add_kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, vocab.get_precompiled_charsmap());

File diff suppressed because it is too large

View File

@@ -73,7 +73,6 @@ enum llm_type {
     LLM_TYPE_40B,
     LLM_TYPE_65B,
     LLM_TYPE_70B,
-    LLM_TYPE_142B,
     LLM_TYPE_236B,
     LLM_TYPE_290B,
     LLM_TYPE_314B,
@@ -330,9 +329,6 @@ struct llama_model {
     llama_hparams hparams = {};
     llama_vocab   vocab;

-    // for classifier models
-    std::vector<std::string> classifier_labels;
-
     struct ggml_tensor * tok_embd  = nullptr;
     struct ggml_tensor * type_embd = nullptr;
     struct ggml_tensor * pos_embd  = nullptr;
@@ -402,10 +398,7 @@ struct llama_model {
     const struct ggml_tensor * get_tensor(const char * name) const;

-    float get_rope_freq_base (const llama_cparams & cparams, int il) const;
-    float get_rope_freq_scale(const llama_cparams & cparams, int il) const;
-
-    ggml_tensor * get_rope_factors(const llama_cparams & cparams, int il) const;
+    ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const;

     // note: can mutate `cparams`
     // TODO: move this to new llm_arch_model_i interface

View File

@@ -14,12 +14,6 @@
 #include <thread>
 #include <unordered_map>

-// Quantization types. Changes to this struct must be replicated in quantize.cpp
-struct tensor_quantization {
-    std::string name;
-    ggml_type quant = GGML_TYPE_COUNT;
-};
-
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
     for (size_t i = 0; i < n; ++i) {
@@ -54,6 +48,12 @@ struct quantize_state_impl {
     {}
 };

+// changes to this struct must be replicated in quantize.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
+
 static void llama_tensor_dequantize_impl(
     ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
@@ -585,8 +585,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
             gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64);
         } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
-            // Setting type to UINT32. See https://github.com/ggml-org/llama.cpp/pull/14182 for context
-            gguf_set_val_u32(ctx_out.get(), o.key, (uint32_t)abs(o.val_i64));
+            gguf_set_val_i32(ctx_out.get(), o.key, o.val_i64);
         } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
             gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool);
         } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
@@ -797,19 +796,17 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // unless the user specifies a type
         if (params->tensor_types) {
             const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
-            const std::string tensor_name(tensor->name);
             for (const auto & [tname, qtype] : tensor_types) {
-                if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
+                if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) {
                     if (qtype != new_type) {
-                        LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
-                        new_type = qtype;
-                        break; // if two or more types are specified for the tensor, first match wins
+                        LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype));
                     }
+                    new_type = qtype;
+                    break;
                 }
             }
         }
     }
     if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
         new_type = params->token_embedding_type;
     }
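The last hunk is the per-tensor type-override loop: each user-supplied pattern is matched as a regex against the tensor name, and (as the left-hand comment notes) the first matching pattern wins. The following self-contained toy reproduces just that matching logic; the struct mirrors the diff, but the integer quant field, the pick_type helper, and the sample names are stand-ins.

// Toy reproduction of the override matching (illustrative; ggml_type replaced by int).
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

struct tensor_quantization {
    std::string name;      // regex pattern supplied by the user
    int         quant = 0; // stand-in for ggml_type
};

static int pick_type(const std::vector<tensor_quantization> & overrides,
                     const std::string & tensor_name, int default_type) {
    for (const auto & [tname, qtype] : overrides) {
        if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
            return qtype; // first match wins, as in the left-hand side of the hunk
        }
    }
    return default_type;
}

int main() {
    const std::vector<tensor_quantization> overrides = {
        { "ffn_down",   1 },
        { "blk\\.0\\.", 2 }, // never reached for the name below: the rule above matches first
    };

    printf("%d\n", pick_type(overrides, "blk.0.ffn_down.weight", 0)); // prints 1
}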

View File

@@ -798,7 +798,7 @@ static void llama_sampler_min_p_apply(struct llama_sampler * smpl, llama_token_d
         }

         // if we have enough values the operation was a success
-        if (!filtered_tokens.empty() && filtered_tokens.size() >= ctx->min_keep) {
+        if (filtered_tokens.size() >= ctx->min_keep) {
             memcpy(cur_p->data, filtered_tokens.data(), filtered_tokens.size()*sizeof(llama_token_data));
             cur_p->size = filtered_tokens.size();
             min_p_applied = true;
@@ -909,7 +909,7 @@ static void llama_sampler_typical_apply(struct llama_sampler * smpl, llama_token
         cum_sum += cur_p->data[idx].p;

         // Check if the running sum is greater than typical or if we have kept at least min_keep tokens
-        if (cum_sum > ctx->p && (ctx->min_keep == 0 || i >= ctx->min_keep - 1)) {
+        if (cum_sum > ctx->p && i >= ctx->min_keep - 1) {
             last_idx = i + 1;
             break;
         }
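Both sampler hunks touch min_keep edge cases. min_keep is unsigned in the public llama.cpp sampler API (size_t), so on the right-hand side `i >= ctx->min_keep - 1` wraps to SIZE_MAX when min_keep == 0 and the early break can never fire; the extra `ctx->min_keep == 0` guard on the left short-circuits before the subtraction. Similarly, the dropped `!filtered_tokens.empty()` check in the min_p hunk matters when min_keep == 0: without it, an empty filtered set would still be copied back, leaving zero candidates. A standalone demonstration of the wrap:

// Minimal demonstration of the unsigned wrap the left-hand guard protects against.
#include <cstddef>
#include <cstdio>

int main() {
    const size_t min_keep = 0; // "no minimum" setting
    const size_t i        = 5; // current candidate index

    // unguarded: 0 - 1 wraps to SIZE_MAX, so this is false for every realistic i
    printf("unguarded: %s\n", (i >= min_keep - 1) ? "true" : "false");

    // guarded (left-hand side): short-circuits before the subtraction can wrap
    printf("guarded:   %s\n", (min_keep == 0 || i >= min_keep - 1) ? "true" : "false");
}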

Some files were not shown because too many files have changed in this diff